Created
June 15, 2022 19:41
-
-
Save benvanik/c9b4eabde8801ee66e39813e3392187a to your computer and use it in GitHub Desktop.
simple_mul.mlir
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"name": "(gdb) iree-compile", | |
"type": "cppdbg", | |
"request": "launch", | |
"preLaunchTask": "build-iree-compile", | |
"program": "${command:cmake.buildDirectory}/tools/iree-compile", | |
"args": [ | |
// "-iree-vm-bytecode-module-output-format=annotated-mlir-text", | |
"-iree-vm-bytecode-source-listing=${workspaceFolder}/../iree-tmp/vm.mlir", | |
"-iree-vm-emit-polyglot-zip=true", | |
// "-mlir-elide-elementsattrs-if-larger=8192", | |
"-mlir-disable-threading", | |
// "-mlir-print-ir-before-all", | |
// "-mlir-print-ir-after-all", | |
// "-iree-hal-dump-executable-sources-to=${workspaceFolder}/../iree-tmp/executables/", | |
"-iree-hal-target-backends=dylib-llvm-aot", | |
"-iree-llvm-target-triple=x86_64-pc-linux-elf", | |
"-iree-llvm-link-embedded", | |
// "-iree-llvm-keep-linker-artifacts", | |
// "-iree-input-type=tosa", | |
"-iree-input-type=mhlo", | |
"${workspaceFolder}/runtime/src/iree/runtime/testdata/simple_mul.mlir", | |
"-o=${workspaceFolder}/../iree-tmp/simple_mul.vmfb", | |
// "-iree-input-type=mhlo", | |
// "${workspaceFolder}/iree/test/e2e/models/mobilenetv3_fake_weights.mlir", | |
// "-o=${workspaceFolder}/../iree-tmp/mobilenetv3_fake_weights.vmfb", | |
// "${workspaceFolder}/iree/test/e2e/models/unidirectional_lstm.mlir", | |
// "-o=${workspaceFolder}/../iree-tmp/unidirectional_lstm.vmfb", | |
">", | |
"${workspaceFolder}/../iree-tmp/iree-compile-out.txt", | |
"2>&1" | |
], | |
"stopAtEntry": false, | |
"cwd": "${workspaceFolder}", | |
// "internalConsoleOptions": "openOnSessionStart", | |
"externalConsole": false, | |
"MIMode": "gdb", | |
"setupCommands": [ | |
{ | |
"description": "Enable pretty-printing for gdb", | |
"text": "-enable-pretty-printing", | |
"ignoreFailures": true | |
} | |
], | |
// "visualizerFile": "${workspaceFolder}/iree.natvis" | |
}, | |
{ | |
"name": "(lldb) iree-run-module", | |
"type": "cppdbg", | |
// "type": "lldb", | |
"request": "launch", | |
"preLaunchTask": "build-iree-run-module", | |
"program": "${command:cmake.buildDirectory}/tools/iree-run-module", | |
"args": [ | |
"--trace_execution", | |
"--device=local-sync", | |
"--module_file=${workspaceFolder}/../iree-tmp/simple_mul.vmfb", | |
"--entry_function=simple_mul", | |
"--function_input=4xf32=12", | |
"--function_input=4xf32=2", | |
">", | |
"${workspaceFolder}/../iree-tmp/iree-run-module-out.txt", | |
"2>&1" | |
], | |
"stopAtEntry": false, | |
"cwd": "${workspaceFolder}", | |
"environment": [ | |
{ | |
"name": "TRACY_NO_EXIT", | |
"value": "1", | |
} | |
], | |
// "internalConsoleOptions": "openOnSessionStart", | |
"externalConsole": false, | |
"MIMode": "gdb", | |
// "MIMode": "lldb", | |
// "miDebuggerPath": "/usr/bin/lldb", | |
"setupCommands": [ | |
{ | |
"description": "Enable pretty-printing for gdb", | |
"text": "-enable-pretty-printing", | |
"ignoreFailures": true | |
} | |
], | |
"visualizerFile": "${workspaceFolder}/iree.natvis" | |
}, |
This file has been truncated, but you can view the full file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// -----// IR Dump After TopLevelSCFToCFG //----- // | |
func.func @simple_mul(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { | |
%0 = mhlo.multiply %arg0, %arg1 {name = "mul.1"} : tensor<4xf32> | |
return %0 : tensor<4xf32> | |
} | |
// -----// IR Dump After MHLOToMHLOPreprocessing //----- // | |
func.func @simple_mul(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { | |
%0 = mhlo.multiply %arg0, %arg1 {name = "mul.1"} : tensor<4xf32> | |
return %0 : tensor<4xf32> | |
} | |
// -----// IR Dump After Canonicalizer //----- // | |
func.func @simple_mul(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { | |
%0 = mhlo.multiply %arg0, %arg1 {name = "mul.1"} : tensor<4xf32> | |
return %0 : tensor<4xf32> | |
} | |
// -----// IR Dump After ShapeToShapeLowering //----- // | |
func.func @simple_mul(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { | |
%0 = mhlo.multiply %arg0, %arg1 {name = "mul.1"} : tensor<4xf32> | |
return %0 : tensor<4xf32> | |
} | |
// -----// IR Dump After ConvertShapeToStandard //----- // | |
module { | |
func.func @simple_mul(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { | |
%0 = mhlo.multiply %arg0, %arg1 {name = "mul.1"} : tensor<4xf32> | |
return %0 : tensor<4xf32> | |
} | |
} | |
// -----// IR Dump After Canonicalizer //----- // | |
func.func @simple_mul(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { | |
%0 = mhlo.multiply %arg0, %arg1 {name = "mul.1"} : tensor<4xf32> | |
return %0 : tensor<4xf32> | |
} | |
// -----// IR Dump After Canonicalizer //----- // | |
func.func @simple_mul(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { | |
%0 = mhlo.multiply %arg0, %arg1 {name = "mul.1"} : tensor<4xf32> | |
return %0 : tensor<4xf32> | |
} | |
// -----// IR Dump After Inliner //----- // | |
module { | |
func.func @simple_mul(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { | |
%0 = mhlo.multiply %arg0, %arg1 {name = "mul.1"} : tensor<4xf32> | |
return %0 : tensor<4xf32> | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::DemoteI64ToI32Pass //----- // | |
module { | |
func.func @simple_mul(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { | |
%0 = mhlo.multiply %arg0, %arg1 {name = "mul.1"} : tensor<4xf32> | |
return %0 : tensor<4xf32> | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::DemoteF64ToF32Pass //----- // | |
module { | |
func.func @simple_mul(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { | |
%0 = mhlo.multiply %arg0, %arg1 {name = "mul.1"} : tensor<4xf32> | |
return %0 : tensor<4xf32> | |
} | |
} | |
// -----// IR Dump After Canonicalizer //----- // | |
func.func @simple_mul(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { | |
%0 = mhlo.multiply %arg0, %arg1 {name = "mul.1"} : tensor<4xf32> | |
return %0 : tensor<4xf32> | |
} | |
// -----// IR Dump After CSE //----- // | |
func.func @simple_mul(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { | |
%0 = mhlo.multiply %arg0, %arg1 {name = "mul.1"} : tensor<4xf32> | |
return %0 : tensor<4xf32> | |
} | |
// -----// IR Dump After ConvertMHLOToLinalgExt //----- // | |
func.func @simple_mul(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { | |
%0 = mhlo.multiply %arg0, %arg1 {name = "mul.1"} : tensor<4xf32> | |
return %0 : tensor<4xf32> | |
} | |
// -----// IR Dump After ConvertMHLOToLinalgOnTensors //----- // | |
func.func @simple_mul(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { | |
%0 = linalg.init_tensor [4] : tensor<4xf32> | |
%1 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<4xf32>, tensor<4xf32>) outs(%0 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg2: f32, %arg3: f32, %arg4: f32): | |
%2 = arith.mulf %arg2, %arg3 : f32 | |
linalg.yield %2 : f32 | |
} -> tensor<4xf32> | |
return %1 : tensor<4xf32> | |
} | |
// -----// IR Dump After ReconcileUnrealizedCasts //----- // | |
#map = affine_map<(d0) -> (d0)> | |
module { | |
func.func @simple_mul(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { | |
%0 = linalg.init_tensor [4] : tensor<4xf32> | |
%1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<4xf32>, tensor<4xf32>) outs(%0 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg2: f32, %arg3: f32, %arg4: f32): | |
%2 = arith.mulf %arg2, %arg3 : f32 | |
linalg.yield %2 : f32 | |
} -> tensor<4xf32> | |
return %1 : tensor<4xf32> | |
} | |
} | |
// -----// IR Dump After Canonicalizer //----- // | |
func.func @simple_mul(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { | |
%0 = linalg.init_tensor [4] : tensor<4xf32> | |
%1 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<4xf32>, tensor<4xf32>) outs(%0 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg2: f32, %arg3: f32, %arg4: f32): | |
%2 = arith.mulf %arg2, %arg3 : f32 | |
linalg.yield %2 : f32 | |
} -> tensor<4xf32> | |
return %1 : tensor<4xf32> | |
} | |
// -----// IR Dump After VerifyCompilerMHLOInputLegality //----- // | |
#map = affine_map<(d0) -> (d0)> | |
module { | |
func.func @simple_mul(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { | |
%0 = linalg.init_tensor [4] : tensor<4xf32> | |
%1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<4xf32>, tensor<4xf32>) outs(%0 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg2: f32, %arg3: f32, %arg4: f32): | |
%2 = arith.mulf %arg2, %arg3 : f32 | |
linalg.yield %2 : f32 | |
} -> tensor<4xf32> | |
return %1 : tensor<4xf32> | |
} | |
} | |
// -----// IR Dump After IREEImportPublic //----- // | |
#map = affine_map<(d0) -> (d0)> | |
module { | |
func.func @simple_mul(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { | |
%0 = linalg.init_tensor [4] : tensor<4xf32> | |
%1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<4xf32>, tensor<4xf32>) outs(%0 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg2: f32, %arg3: f32, %arg4: f32): | |
%2 = arith.mulf %arg2, %arg3 : f32 | |
linalg.yield %2 : f32 | |
} -> tensor<4xf32> | |
return %1 : tensor<4xf32> | |
} | |
} | |
// -----// IR Dump After SanitizeModuleNames //----- // | |
#map = affine_map<(d0) -> (d0)> | |
module { | |
func.func @simple_mul(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { | |
%0 = linalg.init_tensor [4] : tensor<4xf32> | |
%1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<4xf32>, tensor<4xf32>) outs(%0 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg2: f32, %arg3: f32, %arg4: f32): | |
%2 = arith.mulf %arg2, %arg3 : f32 | |
linalg.yield %2 : f32 | |
} -> tensor<4xf32> | |
return %1 : tensor<4xf32> | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::ABI::WrapEntryPointsPass //----- // | |
#map = affine_map<(d0) -> (d0)> | |
module { | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> | |
%2 = call @_simple_mul(%0, %1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> | |
%3 = hal.tensor.export %2 : tensor<4xf32> -> !hal.buffer_view | |
return %3 : !hal.buffer_view | |
} | |
func.func private @_simple_mul(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { | |
%0 = linalg.init_tensor [4] : tensor<4xf32> | |
%1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<4xf32>, tensor<4xf32>) outs(%0 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg2: f32, %arg3: f32, %arg4: f32): | |
%2 = arith.mulf %arg2, %arg3 : f32 | |
linalg.yield %2 : f32 | |
} -> tensor<4xf32> | |
return %1 : tensor<4xf32> | |
} | |
} | |
// -----// IR Dump After Canonicalizer //----- // | |
func.func private @_simple_mul(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { | |
%0 = linalg.init_tensor [4] : tensor<4xf32> | |
%1 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<4xf32>, tensor<4xf32>) outs(%0 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg2: f32, %arg3: f32, %arg4: f32): | |
%2 = arith.mulf %arg2, %arg3 : f32 | |
linalg.yield %2 : f32 | |
} -> tensor<4xf32> | |
return %1 : tensor<4xf32> | |
} | |
// -----// IR Dump After Canonicalizer //----- // | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> | |
%2 = call @_simple_mul(%0, %1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> | |
%3 = hal.tensor.export %2 : tensor<4xf32> -> !hal.buffer_view | |
return %3 : !hal.buffer_view | |
} | |
// -----// IR Dump After Canonicalizer //----- // | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> | |
%2 = linalg.init_tensor [4] : tensor<4xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg2: f32, %arg3: f32, %arg4: f32): | |
%5 = arith.mulf %arg2, %arg3 : f32 | |
linalg.yield %5 : f32 | |
} -> tensor<4xf32> | |
%4 = hal.tensor.export %3 : tensor<4xf32> -> !hal.buffer_view | |
return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After Inliner //----- // | |
#map = affine_map<(d0) -> (d0)> | |
module { | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> | |
%2 = linalg.init_tensor [4] : tensor<4xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg2: f32, %arg3: f32, %arg4: f32): | |
%5 = arith.mulf %arg2, %arg3 : f32 | |
linalg.yield %5 : f32 | |
} -> tensor<4xf32> | |
%4 = hal.tensor.export %3 : tensor<4xf32> -> !hal.buffer_view | |
return %4 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After Canonicalizer //----- // | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> | |
%2 = linalg.init_tensor [4] : tensor<4xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg2: f32, %arg3: f32, %arg4: f32): | |
%5 = arith.mulf %arg2, %arg3 : f32 | |
linalg.yield %5 : f32 | |
} -> tensor<4xf32> | |
%4 = hal.tensor.export %3 : tensor<4xf32> -> !hal.buffer_view | |
return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE //----- // | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> | |
%2 = linalg.init_tensor [4] : tensor<4xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg2: f32, %arg3: f32, %arg4: f32): | |
%5 = arith.mulf %arg2, %arg3 : f32 | |
linalg.yield %5 : f32 | |
} -> tensor<4xf32> | |
%4 = hal.tensor.export %3 : tensor<4xf32> -> !hal.buffer_view | |
return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After SymbolDCE //----- // | |
#map = affine_map<(d0) -> (d0)> | |
module { | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> | |
%2 = linalg.init_tensor [4] : tensor<4xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg2: f32, %arg3: f32, %arg4: f32): | |
%5 = arith.mulf %arg2, %arg3 : f32 | |
linalg.yield %5 : f32 | |
} -> tensor<4xf32> | |
%4 = hal.tensor.export %3 : tensor<4xf32> -> !hal.buffer_view | |
return %4 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::DemoteF64ToF32Pass //----- // | |
#map = affine_map<(d0) -> (d0)> | |
module { | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> | |
%2 = linalg.init_tensor [4] : tensor<4xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg2: f32, %arg3: f32, %arg4: f32): | |
%5 = arith.mulf %arg2, %arg3 : f32 | |
linalg.yield %5 : f32 | |
} -> tensor<4xf32> | |
%4 = hal.tensor.export %3 : tensor<4xf32> -> !hal.buffer_view | |
return %4 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After ConvertConv2D1x1ConvToMatmul //----- // | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> | |
%2 = linalg.init_tensor [4] : tensor<4xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg2: f32, %arg3: f32, %arg4: f32): | |
%5 = arith.mulf %arg2, %arg3 : f32 | |
linalg.yield %5 : f32 | |
} -> tensor<4xf32> | |
%4 = hal.tensor.export %3 : tensor<4xf32> -> !hal.buffer_view | |
return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After VerifyInputLegality //----- // | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> | |
%2 = linalg.init_tensor [4] : tensor<4xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg2: f32, %arg3: f32, %arg4: f32): | |
%5 = arith.mulf %arg2, %arg3 : f32 | |
linalg.yield %5 : f32 | |
} -> tensor<4xf32> | |
%4 = hal.tensor.export %3 : tensor<4xf32> -> !hal.buffer_view | |
return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After LinalgNamedOpConversion //----- // | |
#map = affine_map<(d0) -> (d0)> | |
module { | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> | |
%2 = linalg.init_tensor [4] : tensor<4xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg2: f32, %arg3: f32, %arg4: f32): | |
%5 = arith.mulf %arg2, %arg3 : f32 | |
linalg.yield %5 : f32 | |
} -> tensor<4xf32> | |
%4 = hal.tensor.export %3 : tensor<4xf32> -> !hal.buffer_view | |
return %4 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After ExpandTensorShapes //----- // | |
#map = affine_map<(d0) -> (d0)> | |
module { | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> | |
%2 = linalg.init_tensor [4] : tensor<4xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg2: f32, %arg3: f32, %arg4: f32): | |
%5 = arith.mulf %arg2, %arg3 : f32 | |
linalg.yield %5 : f32 | |
} -> tensor<4xf32> | |
%4 = hal.tensor.export %3 : tensor<4xf32> -> !hal.buffer_view | |
return %4 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::SimplifyGlobalAccessesPass //----- // | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> | |
%2 = linalg.init_tensor [4] : tensor<4xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg2: f32, %arg3: f32, %arg4: f32): | |
%5 = arith.mulf %arg2, %arg3 : f32 | |
linalg.yield %5 : f32 | |
} -> tensor<4xf32> | |
%4 = hal.tensor.export %3 : tensor<4xf32> -> !hal.buffer_view | |
return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::Util::ApplyPatternsPass //----- // | |
#map = affine_map<(d0) -> (d0)> | |
module attributes {iree.fixedpoint.iteration = 0 : index} { | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> | |
%2 = linalg.init_tensor [4] : tensor<4xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg2: f32, %arg3: f32, %arg4: f32): | |
%5 = arith.mulf %arg2, %arg3 : f32 | |
linalg.yield %5 : f32 | |
} -> tensor<4xf32> | |
%4 = hal.tensor.export %3 : tensor<4xf32> -> !hal.buffer_view | |
return %4 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FoldGlobalsPass //----- // | |
#map = affine_map<(d0) -> (d0)> | |
module attributes {iree.fixedpoint.iteration = 0 : index} { | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> | |
%2 = linalg.init_tensor [4] : tensor<4xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg2: f32, %arg3: f32, %arg4: f32): | |
%5 = arith.mulf %arg2, %arg3 : f32 | |
linalg.yield %5 : f32 | |
} -> tensor<4xf32> | |
%4 = hal.tensor.export %3 : tensor<4xf32> -> !hal.buffer_view | |
return %4 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After Canonicalizer //----- // | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> | |
%2 = linalg.init_tensor [4] : tensor<4xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg2: f32, %arg3: f32, %arg4: f32): | |
%5 = arith.mulf %arg2, %arg3 : f32 | |
linalg.yield %5 : f32 | |
} -> tensor<4xf32> | |
%4 = hal.tensor.export %3 : tensor<4xf32> -> !hal.buffer_view | |
return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE //----- // | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> | |
%2 = linalg.init_tensor [4] : tensor<4xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg2: f32, %arg3: f32, %arg4: f32): | |
%5 = arith.mulf %arg2, %arg3 : f32 | |
linalg.yield %5 : f32 | |
} -> tensor<4xf32> | |
%4 = hal.tensor.export %3 : tensor<4xf32> -> !hal.buffer_view | |
return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FixedPointIteratorPass //----- // | |
#map = affine_map<(d0) -> (d0)> | |
module { | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> | |
%2 = linalg.init_tensor [4] : tensor<4xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg2: f32, %arg3: f32, %arg4: f32): | |
%5 = arith.mulf %arg2, %arg3 : f32 | |
linalg.yield %5 : f32 | |
} -> tensor<4xf32> | |
%4 = hal.tensor.export %3 : tensor<4xf32> -> !hal.buffer_view | |
return %4 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After PadTensorToSubTensorInsert //----- // | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> | |
%2 = linalg.init_tensor [4] : tensor<4xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg2: f32, %arg3: f32, %arg4: f32): | |
%5 = arith.mulf %arg2, %arg3 : f32 | |
linalg.yield %5 : f32 | |
} -> tensor<4xf32> | |
%4 = hal.tensor.export %3 : tensor<4xf32> -> !hal.buffer_view | |
return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After ConvertElementwiseToLinalg //----- // | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> | |
%2 = linalg.init_tensor [4] : tensor<4xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg2: f32, %arg3: f32, %arg4: f32): | |
%5 = arith.mulf %arg2, %arg3 : f32 | |
linalg.yield %5 : f32 | |
} -> tensor<4xf32> | |
%4 = hal.tensor.export %3 : tensor<4xf32> -> !hal.buffer_view | |
return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After LinalgFoldUnitExtentDims //----- // | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> | |
%2 = linalg.init_tensor [4] : tensor<4xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg2: f32, %arg3: f32, %arg4: f32): | |
%5 = arith.mulf %arg2, %arg3 : f32 | |
linalg.yield %5 : f32 | |
} -> tensor<4xf32> | |
%4 = hal.tensor.export %3 : tensor<4xf32> -> !hal.buffer_view | |
return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After InterchangeGenericOps //----- // | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> | |
%2 = linalg.init_tensor [4] : tensor<4xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg2: f32, %arg3: f32, %arg4: f32): | |
%5 = arith.mulf %arg2, %arg3 : f32 | |
linalg.yield %5 : f32 | |
} -> tensor<4xf32> | |
%4 = hal.tensor.export %3 : tensor<4xf32> -> !hal.buffer_view | |
return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After ResolveShapedTypeResultDims //----- // | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> | |
%2 = linalg.init_tensor [4] : tensor<4xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg2: f32, %arg3: f32, %arg4: f32): | |
%5 = arith.mulf %arg2, %arg3 : f32 | |
linalg.yield %5 : f32 | |
} -> tensor<4xf32> | |
%4 = hal.tensor.export %3 : tensor<4xf32> -> !hal.buffer_view | |
return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After Canonicalizer //----- // | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> | |
%2 = linalg.init_tensor [4] : tensor<4xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg2: f32, %arg3: f32, %arg4: f32): | |
%5 = arith.mulf %arg2, %arg3 : f32 | |
linalg.yield %5 : f32 | |
} -> tensor<4xf32> | |
%4 = hal.tensor.export %3 : tensor<4xf32> -> !hal.buffer_view | |
return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE //----- // | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> | |
%2 = linalg.init_tensor [4] : tensor<4xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg2: f32, %arg3: f32, %arg4: f32): | |
%5 = arith.mulf %arg2, %arg3 : f32 | |
linalg.yield %5 : f32 | |
} -> tensor<4xf32> | |
%4 = hal.tensor.export %3 : tensor<4xf32> -> !hal.buffer_view | |
return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After FusionOfTensorOps //----- // | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> | |
%2 = linalg.init_tensor [4] : tensor<4xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg2: f32, %arg3: f32, %arg4: f32): | |
%5 = arith.mulf %arg2, %arg3 : f32 | |
linalg.yield %5 : f32 | |
} -> tensor<4xf32> | |
%4 = hal.tensor.export %3 : tensor<4xf32> -> !hal.buffer_view | |
return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After Canonicalizer //----- // | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> | |
%2 = linalg.init_tensor [4] : tensor<4xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg2: f32, %arg3: f32, %arg4: f32): | |
%5 = arith.mulf %arg2, %arg3 : f32 | |
linalg.yield %5 : f32 | |
} -> tensor<4xf32> | |
%4 = hal.tensor.export %3 : tensor<4xf32> -> !hal.buffer_view | |
return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE //----- // | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> | |
%2 = linalg.init_tensor [4] : tensor<4xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg2: f32, %arg3: f32, %arg4: f32): | |
%5 = arith.mulf %arg2, %arg3 : f32 | |
linalg.yield %5 : f32 | |
} -> tensor<4xf32> | |
%4 = hal.tensor.export %3 : tensor<4xf32> -> !hal.buffer_view | |
return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After SplitReduction //----- // | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> | |
%2 = linalg.init_tensor [4] : tensor<4xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg2: f32, %arg3: f32, %arg4: f32): | |
%5 = arith.mulf %arg2, %arg3 : f32 | |
linalg.yield %5 : f32 | |
} -> tensor<4xf32> | |
%4 = hal.tensor.export %3 : tensor<4xf32> -> !hal.buffer_view | |
return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After InterchangeGenericOps //----- // | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> | |
%2 = linalg.init_tensor [4] : tensor<4xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg2: f32, %arg3: f32, %arg4: f32): | |
%5 = arith.mulf %arg2, %arg3 : f32 | |
linalg.yield %5 : f32 | |
} -> tensor<4xf32> | |
%4 = hal.tensor.export %3 : tensor<4xf32> -> !hal.buffer_view | |
return %4 : !hal.buffer_view | |
} | |
// -----// IR Dump After DispatchLinalgOnTensors //----- // | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> | |
%2 = flow.dispatch.workgroups[%c4, %c1, %c1](%0, %1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> = | |
(%arg2: !flow.dispatch.tensor<readonly:4xf32>, %arg3: !flow.dispatch.tensor<readonly:4xf32>, %arg4: !flow.dispatch.tensor<writeonly:4xf32>) { | |
%4 = flow.dispatch.tensor.load %arg2, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%5 = flow.dispatch.tensor.load %arg3, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%6 = linalg.init_tensor [4] : tensor<4xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%4, %5 : tensor<4xf32>, tensor<4xf32>) outs(%6 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg5: f32, %arg6: f32, %arg7: f32): | |
%8 = arith.mulf %arg5, %arg6 : f32 | |
linalg.yield %8 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %7, %arg4, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
flow.return | |
} | |
%3 = hal.tensor.export %2 : tensor<4xf32> -> !hal.buffer_view | |
return %3 : !hal.buffer_view | |
} | |
// -----// IR Dump After CaptureDispatchDynamicDims //----- // | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> | |
%2 = flow.dispatch.workgroups[%c4, %c1, %c1](%0, %1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> = | |
(%arg2: !flow.dispatch.tensor<readonly:4xf32>, %arg3: !flow.dispatch.tensor<readonly:4xf32>, %arg4: !flow.dispatch.tensor<writeonly:4xf32>) { | |
%4 = flow.dispatch.tensor.load %arg2, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%5 = flow.dispatch.tensor.load %arg3, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%6 = linalg.init_tensor [4] : tensor<4xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%4, %5 : tensor<4xf32>, tensor<4xf32>) outs(%6 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg5: f32, %arg6: f32, %arg7: f32): | |
%8 = arith.mulf %arg5, %arg6 : f32 | |
linalg.yield %8 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %7, %arg4, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
flow.return | |
} | |
%3 = hal.tensor.export %2 : tensor<4xf32> -> !hal.buffer_view | |
return %3 : !hal.buffer_view | |
} | |
// -----// IR Dump After Canonicalizer //----- // | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> | |
%2 = flow.dispatch.workgroups[%c4, %c1, %c1](%0, %1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> = | |
(%arg2: !flow.dispatch.tensor<readonly:4xf32>, %arg3: !flow.dispatch.tensor<readonly:4xf32>, %arg4: !flow.dispatch.tensor<writeonly:4xf32>) { | |
%4 = flow.dispatch.tensor.load %arg2, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%5 = flow.dispatch.tensor.load %arg3, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%6 = linalg.init_tensor [4] : tensor<4xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%4, %5 : tensor<4xf32>, tensor<4xf32>) outs(%6 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg5: f32, %arg6: f32, %arg7: f32): | |
%8 = arith.mulf %arg5, %arg6 : f32 | |
linalg.yield %8 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %7, %arg4, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
flow.return | |
} | |
%3 = hal.tensor.export %2 : tensor<4xf32> -> !hal.buffer_view | |
return %3 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE //----- // | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> | |
%2 = flow.dispatch.workgroups[%c4, %c1, %c1](%0, %1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> = | |
(%arg2: !flow.dispatch.tensor<readonly:4xf32>, %arg3: !flow.dispatch.tensor<readonly:4xf32>, %arg4: !flow.dispatch.tensor<writeonly:4xf32>) { | |
%4 = flow.dispatch.tensor.load %arg2, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%5 = flow.dispatch.tensor.load %arg3, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%6 = linalg.init_tensor [4] : tensor<4xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%4, %5 : tensor<4xf32>, tensor<4xf32>) outs(%6 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg5: f32, %arg6: f32, %arg7: f32): | |
%8 = arith.mulf %arg5, %arg6 : f32 | |
linalg.yield %8 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %7, %arg4, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
flow.return | |
} | |
%3 = hal.tensor.export %2 : tensor<4xf32> -> !hal.buffer_view | |
return %3 : !hal.buffer_view | |
} | |
// -----// IR Dump After InitializeEmptyTensors //----- // | |
#map = affine_map<(d0) -> (d0)> | |
module { | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> | |
%2 = flow.dispatch.workgroups[%c4, %c1, %c1](%0, %1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> = | |
(%arg2: !flow.dispatch.tensor<readonly:4xf32>, %arg3: !flow.dispatch.tensor<readonly:4xf32>, %arg4: !flow.dispatch.tensor<writeonly:4xf32>) { | |
%4 = flow.dispatch.tensor.load %arg2, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%5 = flow.dispatch.tensor.load %arg3, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%6 = linalg.init_tensor [4] : tensor<4xf32> | |
%7 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%4, %5 : tensor<4xf32>, tensor<4xf32>) outs(%6 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg5: f32, %arg6: f32, %arg7: f32): | |
%8 = arith.mulf %arg5, %arg6 : f32 | |
linalg.yield %8 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %7, %arg4, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
flow.return | |
} | |
%3 = hal.tensor.export %2 : tensor<4xf32> -> !hal.buffer_view | |
return %3 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After OutlineDispatchRegions //----- // | |
#map = affine_map<(d0) -> (d0)> | |
module { | |
flow.executable private @simple_mul_dispatch_0 { | |
flow.executable.export public @simple_mul_dispatch_0 | |
builtin.module { | |
func.func @simple_mul_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:4xf32>, %arg1: !flow.dispatch.tensor<readonly:4xf32>, %arg2: !flow.dispatch.tensor<writeonly:4xf32>) { | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%1 = flow.dispatch.tensor.load %arg1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%2 = linalg.init_tensor [4] : tensor<4xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg3: f32, %arg4: f32, %arg5: f32): | |
%4 = arith.mulf %arg3, %arg4 : f32 | |
linalg.yield %4 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %3, %arg2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
return | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> | |
%2 = flow.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%0, %1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> | |
%3 = hal.tensor.export %2 : tensor<4xf32> -> !hal.buffer_view | |
return %3 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::StripDebugOpsPass //----- // | |
flow.executable private @simple_mul_dispatch_0 { | |
flow.executable.export public @simple_mul_dispatch_0 | |
builtin.module { | |
func.func @simple_mul_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:4xf32>, %arg1: !flow.dispatch.tensor<readonly:4xf32>, %arg2: !flow.dispatch.tensor<writeonly:4xf32>) { | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%1 = flow.dispatch.tensor.load %arg1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%2 = linalg.init_tensor [4] : tensor<4xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg3: f32, %arg4: f32, %arg5: f32): | |
%4 = arith.mulf %arg3, %arg4 : f32 | |
linalg.yield %4 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %3, %arg2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
return | |
} | |
} | |
} | |
// -----// IR Dump After Canonicalizer //----- // | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> | |
%2 = flow.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%0, %1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> | |
%3 = hal.tensor.export %2 : tensor<4xf32> -> !hal.buffer_view | |
return %3 : !hal.buffer_view | |
} | |
// -----// IR Dump After DeduplicateExecutables //----- // | |
#map = affine_map<(d0) -> (d0)> | |
module { | |
flow.executable private @simple_mul_dispatch_0 { | |
flow.executable.export public @simple_mul_dispatch_0 | |
builtin.module { | |
func.func @simple_mul_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:4xf32>, %arg1: !flow.dispatch.tensor<readonly:4xf32>, %arg2: !flow.dispatch.tensor<writeonly:4xf32>) { | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%1 = flow.dispatch.tensor.load %arg1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%2 = linalg.init_tensor [4] : tensor<4xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg3: f32, %arg4: f32, %arg5: f32): | |
%4 = arith.mulf %arg3, %arg4 : f32 | |
linalg.yield %4 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %3, %arg2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
return | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> | |
%2 = flow.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%0, %1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> | |
%3 = hal.tensor.export %2 : tensor<4xf32> -> !hal.buffer_view | |
return %3 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After Canonicalizer //----- // | |
flow.executable private @simple_mul_dispatch_0 { | |
flow.executable.export public @simple_mul_dispatch_0 | |
builtin.module { | |
func.func @simple_mul_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:4xf32>, %arg1: !flow.dispatch.tensor<readonly:4xf32>, %arg2: !flow.dispatch.tensor<writeonly:4xf32>) { | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%1 = flow.dispatch.tensor.load %arg1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%2 = linalg.init_tensor [4] : tensor<4xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg3: f32, %arg4: f32, %arg5: f32): | |
%4 = arith.mulf %arg3, %arg4 : f32 | |
linalg.yield %4 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %3, %arg2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
return | |
} | |
} | |
} | |
// -----// IR Dump After CSE //----- // | |
flow.executable private @simple_mul_dispatch_0 { | |
flow.executable.export public @simple_mul_dispatch_0 | |
builtin.module { | |
func.func @simple_mul_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:4xf32>, %arg1: !flow.dispatch.tensor<readonly:4xf32>, %arg2: !flow.dispatch.tensor<writeonly:4xf32>) { | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%1 = flow.dispatch.tensor.load %arg1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%2 = linalg.init_tensor [4] : tensor<4xf32> | |
%3 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg3: f32, %arg4: f32, %arg5: f32): | |
%4 = arith.mulf %arg3, %arg4 : f32 | |
linalg.yield %4 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %3, %arg2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
return | |
} | |
} | |
} | |
// -----// IR Dump After CleanupTensorShapes //----- // | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> | |
%2 = flow.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%0, %1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> | |
%3 = hal.tensor.export %2 : tensor<4xf32> -> !hal.buffer_view | |
return %3 : !hal.buffer_view | |
} | |
// -----// IR Dump After Canonicalizer //----- // | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> | |
%2 = flow.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%0, %1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> | |
%3 = hal.tensor.export %2 : tensor<4xf32> -> !hal.buffer_view | |
return %3 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE //----- // | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> | |
%2 = flow.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%0, %1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> | |
%3 = hal.tensor.export %2 : tensor<4xf32> -> !hal.buffer_view | |
return %3 : !hal.buffer_view | |
} | |
// -----// IR Dump After SymbolDCE //----- // | |
#map = affine_map<(d0) -> (d0)> | |
module { | |
flow.executable private @simple_mul_dispatch_0 { | |
flow.executable.export public @simple_mul_dispatch_0 | |
builtin.module { | |
func.func @simple_mul_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:4xf32>, %arg1: !flow.dispatch.tensor<readonly:4xf32>, %arg2: !flow.dispatch.tensor<writeonly:4xf32>) { | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%1 = flow.dispatch.tensor.load %arg1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%2 = linalg.init_tensor [4] : tensor<4xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg3: f32, %arg4: f32, %arg5: f32): | |
%4 = arith.mulf %arg3, %arg4 : f32 | |
linalg.yield %4 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %3, %arg2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
return | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> | |
%2 = flow.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%0, %1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> | |
%3 = hal.tensor.export %2 : tensor<4xf32> -> !hal.buffer_view | |
return %3 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After VerifyInput //----- // | |
#map = affine_map<(d0) -> (d0)> | |
module { | |
flow.executable private @simple_mul_dispatch_0 { | |
flow.executable.export public @simple_mul_dispatch_0 | |
builtin.module { | |
func.func @simple_mul_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:4xf32>, %arg1: !flow.dispatch.tensor<readonly:4xf32>, %arg2: !flow.dispatch.tensor<writeonly:4xf32>) { | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%1 = flow.dispatch.tensor.load %arg1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%2 = linalg.init_tensor [4] : tensor<4xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg3: f32, %arg4: f32, %arg5: f32): | |
%4 = arith.mulf %arg3, %arg4 : f32 | |
linalg.yield %4 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %3, %arg2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
return | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> | |
%2 = flow.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%0, %1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> | |
%3 = hal.tensor.export %2 : tensor<4xf32> -> !hal.buffer_view | |
return %3 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After OutlineConstants //----- // | |
#map = affine_map<(d0) -> (d0)> | |
module { | |
flow.executable private @simple_mul_dispatch_0 { | |
flow.executable.export public @simple_mul_dispatch_0 | |
builtin.module { | |
func.func @simple_mul_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:4xf32>, %arg1: !flow.dispatch.tensor<readonly:4xf32>, %arg2: !flow.dispatch.tensor<writeonly:4xf32>) { | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%1 = flow.dispatch.tensor.load %arg1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%2 = linalg.init_tensor [4] : tensor<4xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg3: f32, %arg4: f32, %arg5: f32): | |
%4 = arith.mulf %arg3, %arg4 : f32 | |
linalg.yield %4 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %3, %arg2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
return | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> | |
%2 = flow.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%0, %1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> | |
%3 = hal.tensor.export %2 : tensor<4xf32> -> !hal.buffer_view | |
return %3 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After Canonicalizer //----- // | |
#map = affine_map<(d0) -> (d0)> | |
module { | |
flow.executable private @simple_mul_dispatch_0 { | |
flow.executable.export public @simple_mul_dispatch_0 | |
builtin.module { | |
func.func @simple_mul_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:4xf32>, %arg1: !flow.dispatch.tensor<readonly:4xf32>, %arg2: !flow.dispatch.tensor<writeonly:4xf32>) { | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%1 = flow.dispatch.tensor.load %arg1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%2 = linalg.init_tensor [4] : tensor<4xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg3: f32, %arg4: f32, %arg5: f32): | |
%4 = arith.mulf %arg3, %arg4 : f32 | |
linalg.yield %4 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %3, %arg2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
return | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> | |
%2 = flow.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%0, %1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> | |
%3 = hal.tensor.export %2 : tensor<4xf32> -> !hal.buffer_view | |
return %3 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After CSE //----- // | |
#map = affine_map<(d0) -> (d0)> | |
module { | |
flow.executable private @simple_mul_dispatch_0 { | |
flow.executable.export public @simple_mul_dispatch_0 | |
builtin.module { | |
func.func @simple_mul_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:4xf32>, %arg1: !flow.dispatch.tensor<readonly:4xf32>, %arg2: !flow.dispatch.tensor<writeonly:4xf32>) { | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%1 = flow.dispatch.tensor.load %arg1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%2 = linalg.init_tensor [4] : tensor<4xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg3: f32, %arg4: f32, %arg5: f32): | |
%4 = arith.mulf %arg3, %arg4 : f32 | |
linalg.yield %4 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %3, %arg2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
return | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> | |
%2 = flow.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%0, %1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> | |
%3 = hal.tensor.export %2 : tensor<4xf32> -> !hal.buffer_view | |
return %3 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::SimplifyGlobalAccessesPass //----- // | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> | |
%2 = flow.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%0, %1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> | |
%3 = hal.tensor.export %2 : tensor<4xf32> -> !hal.buffer_view | |
return %3 : !hal.buffer_view | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::Util::ApplyPatternsPass //----- // | |
#map = affine_map<(d0) -> (d0)> | |
module { | |
flow.executable private @simple_mul_dispatch_0 { | |
flow.executable.export public @simple_mul_dispatch_0 | |
builtin.module { | |
func.func @simple_mul_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:4xf32>, %arg1: !flow.dispatch.tensor<readonly:4xf32>, %arg2: !flow.dispatch.tensor<writeonly:4xf32>) { | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%1 = flow.dispatch.tensor.load %arg1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%2 = linalg.init_tensor [4] : tensor<4xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg3: f32, %arg4: f32, %arg5: f32): | |
%4 = arith.mulf %arg3, %arg4 : f32 | |
linalg.yield %4 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %3, %arg2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
return | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> | |
%2 = flow.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%0, %1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> | |
%3 = hal.tensor.export %2 : tensor<4xf32> -> !hal.buffer_view | |
return %3 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FoldGlobalsPass //----- // | |
#map = affine_map<(d0) -> (d0)> | |
module { | |
flow.executable private @simple_mul_dispatch_0 { | |
flow.executable.export public @simple_mul_dispatch_0 | |
builtin.module { | |
func.func @simple_mul_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:4xf32>, %arg1: !flow.dispatch.tensor<readonly:4xf32>, %arg2: !flow.dispatch.tensor<writeonly:4xf32>) { | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%1 = flow.dispatch.tensor.load %arg1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%2 = linalg.init_tensor [4] : tensor<4xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg3: f32, %arg4: f32, %arg5: f32): | |
%4 = arith.mulf %arg3, %arg4 : f32 | |
linalg.yield %4 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %3, %arg2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
return | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> | |
%2 = flow.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%0, %1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> | |
%3 = hal.tensor.export %2 : tensor<4xf32> -> !hal.buffer_view | |
return %3 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FuseGlobalsPass //----- // | |
#map = affine_map<(d0) -> (d0)> | |
module { | |
flow.executable private @simple_mul_dispatch_0 { | |
flow.executable.export public @simple_mul_dispatch_0 | |
builtin.module { | |
func.func @simple_mul_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:4xf32>, %arg1: !flow.dispatch.tensor<readonly:4xf32>, %arg2: !flow.dispatch.tensor<writeonly:4xf32>) { | |
%0 = flow.dispatch.tensor.load %arg0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%1 = flow.dispatch.tensor.load %arg1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%2 = linalg.init_tensor [4] : tensor<4xf32> | |
%3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%0, %1 : tensor<4xf32>, tensor<4xf32>) outs(%2 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg3: f32, %arg4: f32, %arg5: f32): | |
%4 = arith.mulf %arg3, %arg4 : f32 | |
linalg.yield %4 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %3, %arg2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
return | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> | |
%1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> | |
%2 = flow.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%0, %1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> | |
%3 = hal.tensor.export %2 : tensor<4xf32> -> !hal.buffer_view | |
return %3 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After ConvertToStream //----- // | |
#map = affine_map<(d0) -> (d0)> | |
module { | |
stream.executable private @simple_mul_dispatch_0 { | |
stream.executable.export public @simple_mul_dispatch_0 | |
builtin.module { | |
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%5 = linalg.init_tensor [4] : tensor<4xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg3: f32, %arg4: f32, %arg5: f32): | |
%7 = arith.mulf %arg3, %arg4 : f32 | |
linalg.yield %7 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
return | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
%c4_0 = arith.constant 4 : index | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4_0]) type(%c553648160_i32) encoding(%c1_i32) | |
%0 = stream.tensor.sizeof tensor<4xf32> : index | |
%1 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%0} | |
%2 = stream.async.transfer %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0} | |
%c553648160_i32_1 = arith.constant 553648160 : i32 | |
%c1_i32_2 = arith.constant 1 : i32 | |
%c4_3 = arith.constant 4 : index | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4_3]) type(%c553648160_i32_1) encoding(%c1_i32_2) | |
%3 = stream.tensor.sizeof tensor<4xf32> : index | |
%4 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%3} | |
%5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3} | |
%6 = stream.tensor.sizeof tensor<4xf32> : index | |
%7 = stream.async.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%2, %5) : (!stream.resource<*>{%0}, !stream.resource<*>{%3}) -> !stream.resource<*>{%6} | |
%8 = stream.async.transfer %7 : !stream.resource<*>{%6} -> !stream.resource<external>{%6} | |
%9 = stream.tensor.export %8 : tensor<4xf32> in !stream.resource<external>{%6} -> !hal.buffer_view | |
return %9 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After VerifyLoweringToTensors //----- // | |
#map = affine_map<(d0) -> (d0)> | |
module { | |
stream.executable private @simple_mul_dispatch_0 { | |
stream.executable.export public @simple_mul_dispatch_0 | |
builtin.module { | |
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%5 = linalg.init_tensor [4] : tensor<4xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg3: f32, %arg4: f32, %arg5: f32): | |
%7 = arith.mulf %arg3, %arg4 : f32 | |
linalg.yield %7 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
return | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
%c4_0 = arith.constant 4 : index | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4_0]) type(%c553648160_i32) encoding(%c1_i32) | |
%0 = stream.tensor.sizeof tensor<4xf32> : index | |
%1 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%0} | |
%2 = stream.async.transfer %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0} | |
%c553648160_i32_1 = arith.constant 553648160 : i32 | |
%c1_i32_2 = arith.constant 1 : i32 | |
%c4_3 = arith.constant 4 : index | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4_3]) type(%c553648160_i32_1) encoding(%c1_i32_2) | |
%3 = stream.tensor.sizeof tensor<4xf32> : index | |
%4 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%3} | |
%5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3} | |
%6 = stream.tensor.sizeof tensor<4xf32> : index | |
%7 = stream.async.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%2, %5) : (!stream.resource<*>{%0}, !stream.resource<*>{%3}) -> !stream.resource<*>{%6} | |
%8 = stream.async.transfer %7 : !stream.resource<*>{%6} -> !stream.resource<external>{%6} | |
%9 = stream.tensor.export %8 : tensor<4xf32> in !stream.resource<external>{%6} -> !hal.buffer_view | |
return %9 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After Canonicalizer //----- // | |
#map = affine_map<(d0) -> (d0)> | |
module { | |
stream.executable private @simple_mul_dispatch_0 { | |
stream.executable.export public @simple_mul_dispatch_0 | |
builtin.module { | |
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%5 = linalg.init_tensor [4] : tensor<4xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg3: f32, %arg4: f32, %arg5: f32): | |
%7 = arith.mulf %arg3, %arg4 : f32 | |
linalg.yield %7 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
return | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%0 = stream.tensor.sizeof tensor<4xf32> : index | |
%1 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%0} | |
%2 = stream.async.transfer %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%3 = stream.tensor.sizeof tensor<4xf32> : index | |
%4 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%3} | |
%5 = stream.async.transfer %4 : !stream.resource<external>{%3} -> !stream.resource<*>{%3} | |
%6 = stream.tensor.sizeof tensor<4xf32> : index | |
%7 = stream.async.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%2, %5) : (!stream.resource<*>{%0}, !stream.resource<*>{%3}) -> !stream.resource<*>{%6} | |
%8 = stream.async.transfer %7 : !stream.resource<*>{%6} -> !stream.resource<external>{%6} | |
%9 = stream.tensor.export %8 : tensor<4xf32> in !stream.resource<external>{%6} -> !hal.buffer_view | |
return %9 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After CSE //----- // | |
#map = affine_map<(d0) -> (d0)> | |
module { | |
stream.executable private @simple_mul_dispatch_0 { | |
stream.executable.export public @simple_mul_dispatch_0 | |
builtin.module { | |
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%5 = linalg.init_tensor [4] : tensor<4xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg3: f32, %arg4: f32, %arg5: f32): | |
%7 = arith.mulf %arg3, %arg4 : f32 | |
linalg.yield %7 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
return | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%0 = stream.tensor.sizeof tensor<4xf32> : index | |
%1 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%0} | |
%2 = stream.async.transfer %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%3 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%0} | |
%4 = stream.async.transfer %3 : !stream.resource<external>{%0} -> !stream.resource<*>{%0} | |
%5 = stream.async.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%2, %4) : (!stream.resource<*>{%0}, !stream.resource<*>{%0}) -> !stream.resource<*>{%0} | |
%6 = stream.async.transfer %5 : !stream.resource<*>{%0} -> !stream.resource<external>{%0} | |
%7 = stream.tensor.export %6 : tensor<4xf32> in !stream.resource<external>{%0} -> !hal.buffer_view | |
return %7 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::SimplifyGlobalAccessesPass //----- // | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%0 = stream.tensor.sizeof tensor<4xf32> : index | |
%1 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%0} | |
%2 = stream.async.transfer %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%3 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%0} | |
%4 = stream.async.transfer %3 : !stream.resource<external>{%0} -> !stream.resource<*>{%0} | |
%5 = stream.async.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%2, %4) : (!stream.resource<*>{%0}, !stream.resource<*>{%0}) -> !stream.resource<*>{%0} | |
%6 = stream.async.transfer %5 : !stream.resource<*>{%0} -> !stream.resource<external>{%0} | |
%7 = stream.tensor.export %6 : tensor<4xf32> in !stream.resource<external>{%0} -> !hal.buffer_view | |
return %7 : !hal.buffer_view | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::Util::ApplyPatternsPass //----- // | |
#map = affine_map<(d0) -> (d0)> | |
module { | |
stream.executable private @simple_mul_dispatch_0 { | |
stream.executable.export public @simple_mul_dispatch_0 | |
builtin.module { | |
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%5 = linalg.init_tensor [4] : tensor<4xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg3: f32, %arg4: f32, %arg5: f32): | |
%7 = arith.mulf %arg3, %arg4 : f32 | |
linalg.yield %7 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
return | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%0 = stream.tensor.sizeof tensor<4xf32> : index | |
%1 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%0} | |
%2 = stream.async.transfer %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%3 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%0} | |
%4 = stream.async.transfer %3 : !stream.resource<external>{%0} -> !stream.resource<*>{%0} | |
%5 = stream.async.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%2, %4) : (!stream.resource<*>{%0}, !stream.resource<*>{%0}) -> !stream.resource<*>{%0} | |
%6 = stream.async.transfer %5 : !stream.resource<*>{%0} -> !stream.resource<external>{%0} | |
%7 = stream.tensor.export %6 : tensor<4xf32> in !stream.resource<external>{%0} -> !hal.buffer_view | |
return %7 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FoldGlobalsPass //----- // | |
#map = affine_map<(d0) -> (d0)> | |
module { | |
stream.executable private @simple_mul_dispatch_0 { | |
stream.executable.export public @simple_mul_dispatch_0 | |
builtin.module { | |
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%5 = linalg.init_tensor [4] : tensor<4xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg3: f32, %arg4: f32, %arg5: f32): | |
%7 = arith.mulf %arg3, %arg4 : f32 | |
linalg.yield %7 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
return | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%0 = stream.tensor.sizeof tensor<4xf32> : index | |
%1 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%0} | |
%2 = stream.async.transfer %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%3 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%0} | |
%4 = stream.async.transfer %3 : !stream.resource<external>{%0} -> !stream.resource<*>{%0} | |
%5 = stream.async.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%2, %4) : (!stream.resource<*>{%0}, !stream.resource<*>{%0}) -> !stream.resource<*>{%0} | |
%6 = stream.async.transfer %5 : !stream.resource<*>{%0} -> !stream.resource<external>{%0} | |
%7 = stream.tensor.export %6 : tensor<4xf32> in !stream.resource<external>{%0} -> !hal.buffer_view | |
return %7 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FuseGlobalsPass //----- // | |
#map = affine_map<(d0) -> (d0)> | |
module { | |
stream.executable private @simple_mul_dispatch_0 { | |
stream.executable.export public @simple_mul_dispatch_0 | |
builtin.module { | |
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%5 = linalg.init_tensor [4] : tensor<4xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg3: f32, %arg4: f32, %arg5: f32): | |
%7 = arith.mulf %arg3, %arg4 : f32 | |
linalg.yield %7 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
return | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%0 = stream.tensor.sizeof tensor<4xf32> : index | |
%1 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%0} | |
%2 = stream.async.transfer %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%3 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%0} | |
%4 = stream.async.transfer %3 : !stream.resource<external>{%0} -> !stream.resource<*>{%0} | |
%5 = stream.async.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%2, %4) : (!stream.resource<*>{%0}, !stream.resource<*>{%0}) -> !stream.resource<*>{%0} | |
%6 = stream.async.transfer %5 : !stream.resource<*>{%0} -> !stream.resource<external>{%0} | |
%7 = stream.tensor.export %6 : tensor<4xf32> in !stream.resource<external>{%0} -> !hal.buffer_view | |
return %7 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::CombineInitializersPass //----- // | |
#map = affine_map<(d0) -> (d0)> | |
module { | |
stream.executable private @simple_mul_dispatch_0 { | |
stream.executable.export public @simple_mul_dispatch_0 | |
builtin.module { | |
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%5 = linalg.init_tensor [4] : tensor<4xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg3: f32, %arg4: f32, %arg5: f32): | |
%7 = arith.mulf %arg3, %arg4 : f32 | |
linalg.yield %7 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
return | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%0 = stream.tensor.sizeof tensor<4xf32> : index | |
%1 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%0} | |
%2 = stream.async.transfer %1 : !stream.resource<external>{%0} -> !stream.resource<*>{%0} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%3 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%0} | |
%4 = stream.async.transfer %3 : !stream.resource<external>{%0} -> !stream.resource<*>{%0} | |
%5 = stream.async.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%2, %4) : (!stream.resource<*>{%0}, !stream.resource<*>{%0}) -> !stream.resource<*>{%0} | |
%6 = stream.async.transfer %5 : !stream.resource<*>{%0} -> !stream.resource<external>{%0} | |
%7 = stream.tensor.export %6 : tensor<4xf32> in !stream.resource<external>{%0} -> !hal.buffer_view | |
return %7 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After EncodeDeviceTensors //----- // | |
stream.executable private @simple_mul_dispatch_0 { | |
stream.executable.export public @simple_mul_dispatch_0 | |
builtin.module { | |
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%5 = linalg.init_tensor [4] : tensor<4xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg3: f32, %arg4: f32, %arg5: f32): | |
%7 = arith.mulf %arg3, %arg4 : f32 | |
linalg.yield %7 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
return | |
} | |
} | |
} | |
// -----// IR Dump After EncodeHostTensors //----- // | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
%1 = stream.async.transfer %0 : !stream.resource<external>{%c16} -> !stream.resource<*>{%c16} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
%3 = stream.async.transfer %2 : !stream.resource<external>{%c16} -> !stream.resource<*>{%c16} | |
%4 = stream.async.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%1, %3) : (!stream.resource<*>{%c16}, !stream.resource<*>{%c16}) -> !stream.resource<*>{%c16} | |
%5 = stream.async.transfer %4 : !stream.resource<*>{%c16} -> !stream.resource<external>{%c16} | |
%6 = stream.tensor.export %5 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view | |
return %6 : !hal.buffer_view | |
} | |
// -----// IR Dump After MaterializeBuiltins //----- // | |
#map = affine_map<(d0) -> (d0)> | |
module { | |
stream.executable private @simple_mul_dispatch_0 { | |
stream.executable.export public @simple_mul_dispatch_0 | |
builtin.module { | |
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%5 = linalg.init_tensor [4] : tensor<4xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg3: f32, %arg4: f32, %arg5: f32): | |
%7 = arith.mulf %arg3, %arg4 : f32 | |
linalg.yield %7 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
return | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
%1 = stream.async.transfer %0 : !stream.resource<external>{%c16} -> !stream.resource<*>{%c16} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
%3 = stream.async.transfer %2 : !stream.resource<external>{%c16} -> !stream.resource<*>{%c16} | |
%4 = stream.async.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%1, %3) : (!stream.resource<*>{%c16}, !stream.resource<*>{%c16}) -> !stream.resource<*>{%c16} | |
%5 = stream.async.transfer %4 : !stream.resource<*>{%c16} -> !stream.resource<external>{%c16} | |
%6 = stream.tensor.export %5 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view | |
return %6 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After Canonicalizer //----- // | |
#map = affine_map<(d0) -> (d0)> | |
module { | |
stream.executable private @simple_mul_dispatch_0 { | |
stream.executable.export public @simple_mul_dispatch_0 | |
builtin.module { | |
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%5 = linalg.init_tensor [4] : tensor<4xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg3: f32, %arg4: f32, %arg5: f32): | |
%7 = arith.mulf %arg3, %arg4 : f32 | |
linalg.yield %7 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
return | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
%1 = stream.async.transfer %0 : !stream.resource<external>{%c16} -> !stream.resource<*>{%c16} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
%3 = stream.async.transfer %2 : !stream.resource<external>{%c16} -> !stream.resource<*>{%c16} | |
%4 = stream.async.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%1, %3) : (!stream.resource<*>{%c16}, !stream.resource<*>{%c16}) -> !stream.resource<*>{%c16} | |
%5 = stream.async.transfer %4 : !stream.resource<*>{%c16} -> !stream.resource<external>{%c16} | |
%6 = stream.tensor.export %5 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view | |
return %6 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After CSE //----- // | |
#map = affine_map<(d0) -> (d0)> | |
module { | |
stream.executable private @simple_mul_dispatch_0 { | |
stream.executable.export public @simple_mul_dispatch_0 | |
builtin.module { | |
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%5 = linalg.init_tensor [4] : tensor<4xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg3: f32, %arg4: f32, %arg5: f32): | |
%7 = arith.mulf %arg3, %arg4 : f32 | |
linalg.yield %7 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
return | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
%1 = stream.async.transfer %0 : !stream.resource<external>{%c16} -> !stream.resource<*>{%c16} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
%3 = stream.async.transfer %2 : !stream.resource<external>{%c16} -> !stream.resource<*>{%c16} | |
%4 = stream.async.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%1, %3) : (!stream.resource<*>{%c16}, !stream.resource<*>{%c16}) -> !stream.resource<*>{%c16} | |
%5 = stream.async.transfer %4 : !stream.resource<*>{%c16} -> !stream.resource<external>{%c16} | |
%6 = stream.tensor.export %5 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view | |
return %6 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::SimplifyGlobalAccessesPass //----- // | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
%1 = stream.async.transfer %0 : !stream.resource<external>{%c16} -> !stream.resource<*>{%c16} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
%3 = stream.async.transfer %2 : !stream.resource<external>{%c16} -> !stream.resource<*>{%c16} | |
%4 = stream.async.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%1, %3) : (!stream.resource<*>{%c16}, !stream.resource<*>{%c16}) -> !stream.resource<*>{%c16} | |
%5 = stream.async.transfer %4 : !stream.resource<*>{%c16} -> !stream.resource<external>{%c16} | |
%6 = stream.tensor.export %5 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view | |
return %6 : !hal.buffer_view | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::Util::ApplyPatternsPass //----- // | |
#map = affine_map<(d0) -> (d0)> | |
module { | |
stream.executable private @simple_mul_dispatch_0 { | |
stream.executable.export public @simple_mul_dispatch_0 | |
builtin.module { | |
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%5 = linalg.init_tensor [4] : tensor<4xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg3: f32, %arg4: f32, %arg5: f32): | |
%7 = arith.mulf %arg3, %arg4 : f32 | |
linalg.yield %7 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
return | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
%1 = stream.async.transfer %0 : !stream.resource<external>{%c16} -> !stream.resource<*>{%c16} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
%3 = stream.async.transfer %2 : !stream.resource<external>{%c16} -> !stream.resource<*>{%c16} | |
%4 = stream.async.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%1, %3) : (!stream.resource<*>{%c16}, !stream.resource<*>{%c16}) -> !stream.resource<*>{%c16} | |
%5 = stream.async.transfer %4 : !stream.resource<*>{%c16} -> !stream.resource<external>{%c16} | |
%6 = stream.tensor.export %5 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view | |
return %6 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FoldGlobalsPass //----- // | |
#map = affine_map<(d0) -> (d0)> | |
module { | |
stream.executable private @simple_mul_dispatch_0 { | |
stream.executable.export public @simple_mul_dispatch_0 | |
builtin.module { | |
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%5 = linalg.init_tensor [4] : tensor<4xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg3: f32, %arg4: f32, %arg5: f32): | |
%7 = arith.mulf %arg3, %arg4 : f32 | |
linalg.yield %7 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
return | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
%1 = stream.async.transfer %0 : !stream.resource<external>{%c16} -> !stream.resource<*>{%c16} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
%3 = stream.async.transfer %2 : !stream.resource<external>{%c16} -> !stream.resource<*>{%c16} | |
%4 = stream.async.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%1, %3) : (!stream.resource<*>{%c16}, !stream.resource<*>{%c16}) -> !stream.resource<*>{%c16} | |
%5 = stream.async.transfer %4 : !stream.resource<*>{%c16} -> !stream.resource<external>{%c16} | |
%6 = stream.tensor.export %5 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view | |
return %6 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FuseGlobalsPass //----- // | |
#map = affine_map<(d0) -> (d0)> | |
module { | |
stream.executable private @simple_mul_dispatch_0 { | |
stream.executable.export public @simple_mul_dispatch_0 | |
builtin.module { | |
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%5 = linalg.init_tensor [4] : tensor<4xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg3: f32, %arg4: f32, %arg5: f32): | |
%7 = arith.mulf %arg3, %arg4 : f32 | |
linalg.yield %7 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
return | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
%1 = stream.async.transfer %0 : !stream.resource<external>{%c16} -> !stream.resource<*>{%c16} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
%3 = stream.async.transfer %2 : !stream.resource<external>{%c16} -> !stream.resource<*>{%c16} | |
%4 = stream.async.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%1, %3) : (!stream.resource<*>{%c16}, !stream.resource<*>{%c16}) -> !stream.resource<*>{%c16} | |
%5 = stream.async.transfer %4 : !stream.resource<*>{%c16} -> !stream.resource<external>{%c16} | |
%6 = stream.tensor.export %5 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view | |
return %6 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After MaterializeCopyOnWrite //----- // | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
%1 = stream.async.transfer %0 : !stream.resource<external>{%c16} -> !stream.resource<*>{%c16} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
%3 = stream.async.transfer %2 : !stream.resource<external>{%c16} -> !stream.resource<*>{%c16} | |
%4 = stream.async.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%1, %3) : (!stream.resource<*>{%c16}, !stream.resource<*>{%c16}) -> !stream.resource<*>{%c16} | |
%5 = stream.async.transfer %4 : !stream.resource<*>{%c16} -> !stream.resource<external>{%c16} | |
%6 = stream.tensor.export %5 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view | |
return %6 : !hal.buffer_view | |
} | |
// -----// IR Dump After ElideAsyncCopies //----- // | |
#map = affine_map<(d0) -> (d0)> | |
module { | |
stream.executable private @simple_mul_dispatch_0 { | |
stream.executable.export public @simple_mul_dispatch_0 | |
builtin.module { | |
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%5 = linalg.init_tensor [4] : tensor<4xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg3: f32, %arg4: f32, %arg5: f32): | |
%7 = arith.mulf %arg3, %arg4 : f32 | |
linalg.yield %7 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
return | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
%1 = stream.async.transfer %0 : !stream.resource<external>{%c16} -> !stream.resource<*>{%c16} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%2 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
%3 = stream.async.transfer %2 : !stream.resource<external>{%c16} -> !stream.resource<*>{%c16} | |
%4 = stream.async.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%1, %3) : (!stream.resource<*>{%c16}, !stream.resource<*>{%c16}) -> !stream.resource<*>{%c16} | |
%5 = stream.async.transfer %4 : !stream.resource<*>{%c16} -> !stream.resource<external>{%c16} | |
%6 = stream.tensor.export %5 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view | |
return %6 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After RefineUsage //----- // | |
#map = affine_map<(d0) -> (d0)> | |
module { | |
stream.executable private @simple_mul_dispatch_0 { | |
stream.executable.export public @simple_mul_dispatch_0 | |
builtin.module { | |
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%5 = linalg.init_tensor [4] : tensor<4xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg3: f32, %arg4: f32, %arg5: f32): | |
%7 = arith.mulf %arg3, %arg4 : f32 | |
linalg.yield %7 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
return | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
%2 = stream.async.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%0, %1) : (!stream.resource<external>{%c16}, !stream.resource<external>{%c16}) -> !stream.resource<external>{%c16} | |
%3 = stream.tensor.export %2 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view | |
return %3 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After ScheduleExecution //----- // | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
%results, %result_timepoint = stream.async.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}) -> !stream.resource<external>{%c16} { | |
%4 = stream.async.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%arg2, %arg3) : (!stream.resource<external>{%c16}, !stream.resource<external>{%c16}) -> !stream.resource<external>{%c16} | |
stream.yield %4 : !stream.resource<external>{%c16} | |
} => !stream.timepoint | |
%2 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c16} | |
%3 = stream.tensor.export %2 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view | |
return %3 : !hal.buffer_view | |
} | |
// -----// IR Dump After ScheduleConcurrency //----- // | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
%results, %result_timepoint = stream.async.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}) -> !stream.resource<external>{%c16} { | |
%4 = stream.async.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%arg2, %arg3) : (!stream.resource<external>{%c16}, !stream.resource<external>{%c16}) -> !stream.resource<external>{%c16} | |
stream.yield %4 : !stream.resource<external>{%c16} | |
} => !stream.timepoint | |
%2 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c16} | |
%3 = stream.tensor.export %2 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view | |
return %3 : !hal.buffer_view | |
} | |
// -----// IR Dump After PropagateTimepoints //----- // | |
#map = affine_map<(d0) -> (d0)> | |
module { | |
stream.executable private @simple_mul_dispatch_0 { | |
stream.executable.export public @simple_mul_dispatch_0 | |
builtin.module { | |
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%5 = linalg.init_tensor [4] : tensor<4xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg3: f32, %arg4: f32, %arg5: f32): | |
%7 = arith.mulf %arg3, %arg4 : f32 | |
linalg.yield %7 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
return | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
%2 = stream.timepoint.immediate => !stream.timepoint | |
%3 = stream.timepoint.immediate => !stream.timepoint | |
%4 = stream.timepoint.immediate => !stream.timepoint | |
%results, %result_timepoint = stream.async.execute await(%4) => with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}) -> !stream.resource<external>{%c16} { | |
%7 = stream.async.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%arg2, %arg3) : (!stream.resource<external>{%c16}, !stream.resource<external>{%c16}) -> !stream.resource<external>{%c16} | |
stream.yield %7 : !stream.resource<external>{%c16} | |
} => !stream.timepoint | |
%5 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c16} | |
%6 = stream.tensor.export %5 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view | |
return %6 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After Canonicalizer //----- // | |
#map = affine_map<(d0) -> (d0)> | |
module { | |
stream.executable private @simple_mul_dispatch_0 { | |
stream.executable.export public @simple_mul_dispatch_0 | |
builtin.module { | |
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%5 = linalg.init_tensor [4] : tensor<4xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg3: f32, %arg4: f32, %arg5: f32): | |
%7 = arith.mulf %arg3, %arg4 : f32 | |
linalg.yield %7 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
return | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
%results, %result_timepoint = stream.async.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}) -> !stream.resource<external>{%c16} { | |
%4 = stream.async.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%arg2, %arg3) : (!stream.resource<external>{%c16}, !stream.resource<external>{%c16}) -> !stream.resource<external>{%c16} | |
stream.yield %4 : !stream.resource<external>{%c16} | |
} => !stream.timepoint | |
%2 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c16} | |
%3 = stream.tensor.export %2 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view | |
return %3 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After CSE //----- // | |
#map = affine_map<(d0) -> (d0)> | |
module { | |
stream.executable private @simple_mul_dispatch_0 { | |
stream.executable.export public @simple_mul_dispatch_0 | |
builtin.module { | |
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%5 = linalg.init_tensor [4] : tensor<4xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg3: f32, %arg4: f32, %arg5: f32): | |
%7 = arith.mulf %arg3, %arg4 : f32 | |
linalg.yield %7 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
return | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
%results, %result_timepoint = stream.async.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}) -> !stream.resource<external>{%c16} { | |
%4 = stream.async.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%arg2, %arg3) : (!stream.resource<external>{%c16}, !stream.resource<external>{%c16}) -> !stream.resource<external>{%c16} | |
stream.yield %4 : !stream.resource<external>{%c16} | |
} => !stream.timepoint | |
%2 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c16} | |
%3 = stream.tensor.export %2 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view | |
return %3 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::SimplifyGlobalAccessesPass //----- // | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
%results, %result_timepoint = stream.async.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}) -> !stream.resource<external>{%c16} { | |
%4 = stream.async.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%arg2, %arg3) : (!stream.resource<external>{%c16}, !stream.resource<external>{%c16}) -> !stream.resource<external>{%c16} | |
stream.yield %4 : !stream.resource<external>{%c16} | |
} => !stream.timepoint | |
%2 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c16} | |
%3 = stream.tensor.export %2 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view | |
return %3 : !hal.buffer_view | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::Util::ApplyPatternsPass //----- // | |
#map = affine_map<(d0) -> (d0)> | |
module { | |
stream.executable private @simple_mul_dispatch_0 { | |
stream.executable.export public @simple_mul_dispatch_0 | |
builtin.module { | |
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%5 = linalg.init_tensor [4] : tensor<4xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg3: f32, %arg4: f32, %arg5: f32): | |
%7 = arith.mulf %arg3, %arg4 : f32 | |
linalg.yield %7 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
return | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
%results, %result_timepoint = stream.async.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}) -> !stream.resource<external>{%c16} { | |
%4 = stream.async.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%arg2, %arg3) : (!stream.resource<external>{%c16}, !stream.resource<external>{%c16}) -> !stream.resource<external>{%c16} | |
stream.yield %4 : !stream.resource<external>{%c16} | |
} => !stream.timepoint | |
%2 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c16} | |
%3 = stream.tensor.export %2 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view | |
return %3 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FoldGlobalsPass //----- // | |
#map = affine_map<(d0) -> (d0)> | |
module { | |
stream.executable private @simple_mul_dispatch_0 { | |
stream.executable.export public @simple_mul_dispatch_0 | |
builtin.module { | |
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%5 = linalg.init_tensor [4] : tensor<4xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg3: f32, %arg4: f32, %arg5: f32): | |
%7 = arith.mulf %arg3, %arg4 : f32 | |
linalg.yield %7 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
return | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
%results, %result_timepoint = stream.async.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}) -> !stream.resource<external>{%c16} { | |
%4 = stream.async.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%arg2, %arg3) : (!stream.resource<external>{%c16}, !stream.resource<external>{%c16}) -> !stream.resource<external>{%c16} | |
stream.yield %4 : !stream.resource<external>{%c16} | |
} => !stream.timepoint | |
%2 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c16} | |
%3 = stream.tensor.export %2 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view | |
return %3 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FuseGlobalsPass //----- // | |
#map = affine_map<(d0) -> (d0)> | |
module { | |
stream.executable private @simple_mul_dispatch_0 { | |
stream.executable.export public @simple_mul_dispatch_0 | |
builtin.module { | |
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%5 = linalg.init_tensor [4] : tensor<4xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg3: f32, %arg4: f32, %arg5: f32): | |
%7 = arith.mulf %arg3, %arg4 : f32 | |
linalg.yield %7 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
return | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
%results, %result_timepoint = stream.async.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}) -> !stream.resource<external>{%c16} { | |
%4 = stream.async.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%arg2, %arg3) : (!stream.resource<external>{%c16}, !stream.resource<external>{%c16}) -> !stream.resource<external>{%c16} | |
stream.yield %4 : !stream.resource<external>{%c16} | |
} => !stream.timepoint | |
%2 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c16} | |
%3 = stream.tensor.export %2 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view | |
return %3 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After VerifyLoweringToAsync //----- // | |
#map = affine_map<(d0) -> (d0)> | |
module { | |
stream.executable private @simple_mul_dispatch_0 { | |
stream.executable.export public @simple_mul_dispatch_0 | |
builtin.module { | |
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%5 = linalg.init_tensor [4] : tensor<4xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg3: f32, %arg4: f32, %arg5: f32): | |
%7 = arith.mulf %arg3, %arg4 : f32 | |
linalg.yield %7 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
return | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
%results, %result_timepoint = stream.async.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}) -> !stream.resource<external>{%c16} { | |
%4 = stream.async.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%arg2, %arg3) : (!stream.resource<external>{%c16}, !stream.resource<external>{%c16}) -> !stream.resource<external>{%c16} | |
stream.yield %4 : !stream.resource<external>{%c16} | |
} => !stream.timepoint | |
%2 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c16} | |
%3 = stream.tensor.export %2 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view | |
return %3 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After ScheduleAllocation //----- // | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
%c0 = arith.constant 0 : index | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16} | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) { | |
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1] { | |
ro %arg2[%c0 for %c16] : !stream.resource<external>{%c16}, | |
ro %arg3[%c0 for %c16] : !stream.resource<external>{%c16}, | |
wo %arg4[%c0 for %c16] : !stream.resource<external>{%c16} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16} | |
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view | |
return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After PackConstants //----- // | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
%c0 = arith.constant 0 : index | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16} | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) { | |
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1] { | |
ro %arg2[%c0 for %c16] : !stream.resource<external>{%c16}, | |
ro %arg3[%c0 for %c16] : !stream.resource<external>{%c16}, | |
wo %arg4[%c0 for %c16] : !stream.resource<external>{%c16} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16} | |
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view | |
return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After PackAllocations //----- // | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
%c0 = arith.constant 0 : index | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16} | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) { | |
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1] { | |
ro %arg2[%c0 for %c16] : !stream.resource<external>{%c16}, | |
ro %arg3[%c0 for %c16] : !stream.resource<external>{%c16}, | |
wo %arg4[%c0 for %c16] : !stream.resource<external>{%c16} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16} | |
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view | |
return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After LayoutSlices //----- // | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
%c0 = arith.constant 0 : index | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16} | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) { | |
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1] { | |
ro %arg2[%c0 for %c16] : !stream.resource<external>{%c16}, | |
ro %arg3[%c0 for %c16] : !stream.resource<external>{%c16}, | |
wo %arg4[%c0 for %c16] : !stream.resource<external>{%c16} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16} | |
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view | |
return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After PropagateSubviews //----- // | |
#map = affine_map<(d0) -> (d0)> | |
module { | |
stream.executable private @simple_mul_dispatch_0 { | |
stream.executable.export public @simple_mul_dispatch_0 | |
builtin.module { | |
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%5 = linalg.init_tensor [4] : tensor<4xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg3: f32, %arg4: f32, %arg5: f32): | |
%7 = arith.mulf %arg3, %arg4 : f32 | |
linalg.yield %7 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
return | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
%c0 = arith.constant 0 : index | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16} | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) { | |
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1] { | |
ro %arg2[%c0 for %c16] : !stream.resource<external>{%c16}, | |
ro %arg3[%c0 for %c16] : !stream.resource<external>{%c16}, | |
wo %arg4[%c0 for %c16] : !stream.resource<external>{%c16} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16} | |
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view | |
return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After Canonicalizer //----- // | |
#map = affine_map<(d0) -> (d0)> | |
module { | |
stream.executable private @simple_mul_dispatch_0 { | |
stream.executable.export public @simple_mul_dispatch_0 | |
builtin.module { | |
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%5 = linalg.init_tensor [4] : tensor<4xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg3: f32, %arg4: f32, %arg5: f32): | |
%7 = arith.mulf %arg3, %arg4 : f32 | |
linalg.yield %7 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
return | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c0 = arith.constant 0 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16} | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) { | |
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1] { | |
ro %arg2[%c0 for %c16] : !stream.resource<external>{%c16}, | |
ro %arg3[%c0 for %c16] : !stream.resource<external>{%c16}, | |
wo %arg4[%c0 for %c16] : !stream.resource<external>{%c16} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16} | |
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view | |
return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After CSE //----- // | |
#map = affine_map<(d0) -> (d0)> | |
module { | |
stream.executable private @simple_mul_dispatch_0 { | |
stream.executable.export public @simple_mul_dispatch_0 | |
builtin.module { | |
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%5 = linalg.init_tensor [4] : tensor<4xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg3: f32, %arg4: f32, %arg5: f32): | |
%7 = arith.mulf %arg3, %arg4 : f32 | |
linalg.yield %7 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
return | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c0 = arith.constant 0 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16} | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) { | |
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1] { | |
ro %arg2[%c0 for %c16] : !stream.resource<external>{%c16}, | |
ro %arg3[%c0 for %c16] : !stream.resource<external>{%c16}, | |
wo %arg4[%c0 for %c16] : !stream.resource<external>{%c16} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16} | |
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view | |
return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::SimplifyGlobalAccessesPass //----- // | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c0 = arith.constant 0 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16} | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) { | |
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1] { | |
ro %arg2[%c0 for %c16] : !stream.resource<external>{%c16}, | |
ro %arg3[%c0 for %c16] : !stream.resource<external>{%c16}, | |
wo %arg4[%c0 for %c16] : !stream.resource<external>{%c16} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16} | |
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view | |
return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::Util::ApplyPatternsPass //----- // | |
#map = affine_map<(d0) -> (d0)> | |
module { | |
stream.executable private @simple_mul_dispatch_0 { | |
stream.executable.export public @simple_mul_dispatch_0 | |
builtin.module { | |
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%5 = linalg.init_tensor [4] : tensor<4xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg3: f32, %arg4: f32, %arg5: f32): | |
%7 = arith.mulf %arg3, %arg4 : f32 | |
linalg.yield %7 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
return | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c0 = arith.constant 0 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16} | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) { | |
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1] { | |
ro %arg2[%c0 for %c16] : !stream.resource<external>{%c16}, | |
ro %arg3[%c0 for %c16] : !stream.resource<external>{%c16}, | |
wo %arg4[%c0 for %c16] : !stream.resource<external>{%c16} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16} | |
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view | |
return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FoldGlobalsPass //----- // | |
#map = affine_map<(d0) -> (d0)> | |
module { | |
stream.executable private @simple_mul_dispatch_0 { | |
stream.executable.export public @simple_mul_dispatch_0 | |
builtin.module { | |
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%5 = linalg.init_tensor [4] : tensor<4xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg3: f32, %arg4: f32, %arg5: f32): | |
%7 = arith.mulf %arg3, %arg4 : f32 | |
linalg.yield %7 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
return | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c0 = arith.constant 0 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16} | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) { | |
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1] { | |
ro %arg2[%c0 for %c16] : !stream.resource<external>{%c16}, | |
ro %arg3[%c0 for %c16] : !stream.resource<external>{%c16}, | |
wo %arg4[%c0 for %c16] : !stream.resource<external>{%c16} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16} | |
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view | |
return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FuseGlobalsPass //----- // | |
#map = affine_map<(d0) -> (d0)> | |
module { | |
stream.executable private @simple_mul_dispatch_0 { | |
stream.executable.export public @simple_mul_dispatch_0 | |
builtin.module { | |
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%5 = linalg.init_tensor [4] : tensor<4xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg3: f32, %arg4: f32, %arg5: f32): | |
%7 = arith.mulf %arg3, %arg4 : f32 | |
linalg.yield %7 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
return | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c0 = arith.constant 0 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16} | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) { | |
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1] { | |
ro %arg2[%c0 for %c16] : !stream.resource<external>{%c16}, | |
ro %arg3[%c0 for %c16] : !stream.resource<external>{%c16}, | |
wo %arg4[%c0 for %c16] : !stream.resource<external>{%c16} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16} | |
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view | |
return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After VerifyLoweringToCmd //----- // | |
#map = affine_map<(d0) -> (d0)> | |
module { | |
stream.executable private @simple_mul_dispatch_0 { | |
stream.executable.export public @simple_mul_dispatch_0 | |
builtin.module { | |
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%5 = linalg.init_tensor [4] : tensor<4xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg3: f32, %arg4: f32, %arg5: f32): | |
%7 = arith.mulf %arg3, %arg4 : f32 | |
linalg.yield %7 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
return | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c0 = arith.constant 0 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16} | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) { | |
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1] { | |
ro %arg2[%c0 for %c16] : !stream.resource<external>{%c16}, | |
ro %arg3[%c0 for %c16] : !stream.resource<external>{%c16}, | |
wo %arg4[%c0 for %c16] : !stream.resource<external>{%c16} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16} | |
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view | |
return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After Canonicalizer //----- // | |
#map = affine_map<(d0) -> (d0)> | |
module { | |
stream.executable private @simple_mul_dispatch_0 { | |
stream.executable.export public @simple_mul_dispatch_0 | |
builtin.module { | |
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%5 = linalg.init_tensor [4] : tensor<4xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg3: f32, %arg4: f32, %arg5: f32): | |
%7 = arith.mulf %arg3, %arg4 : f32 | |
linalg.yield %7 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
return | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c0 = arith.constant 0 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16} | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) { | |
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1] { | |
ro %arg2[%c0 for %c16] : !stream.resource<external>{%c16}, | |
ro %arg3[%c0 for %c16] : !stream.resource<external>{%c16}, | |
wo %arg4[%c0 for %c16] : !stream.resource<external>{%c16} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16} | |
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view | |
return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After CSE //----- // | |
#map = affine_map<(d0) -> (d0)> | |
module { | |
stream.executable private @simple_mul_dispatch_0 { | |
stream.executable.export public @simple_mul_dispatch_0 | |
builtin.module { | |
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%5 = linalg.init_tensor [4] : tensor<4xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg3: f32, %arg4: f32, %arg5: f32): | |
%7 = arith.mulf %arg3, %arg4 : f32 | |
linalg.yield %7 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
return | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c0 = arith.constant 0 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16} | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) { | |
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1] { | |
ro %arg2[%c0 for %c16] : !stream.resource<external>{%c16}, | |
ro %arg3[%c0 for %c16] : !stream.resource<external>{%c16}, | |
wo %arg4[%c0 for %c16] : !stream.resource<external>{%c16} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16} | |
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view | |
return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::SimplifyGlobalAccessesPass //----- // | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c0 = arith.constant 0 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16} | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) { | |
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1] { | |
ro %arg2[%c0 for %c16] : !stream.resource<external>{%c16}, | |
ro %arg3[%c0 for %c16] : !stream.resource<external>{%c16}, | |
wo %arg4[%c0 for %c16] : !stream.resource<external>{%c16} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16} | |
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view | |
return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::Util::ApplyPatternsPass //----- // | |
#map = affine_map<(d0) -> (d0)> | |
module { | |
stream.executable private @simple_mul_dispatch_0 { | |
stream.executable.export public @simple_mul_dispatch_0 | |
builtin.module { | |
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%5 = linalg.init_tensor [4] : tensor<4xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg3: f32, %arg4: f32, %arg5: f32): | |
%7 = arith.mulf %arg3, %arg4 : f32 | |
linalg.yield %7 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
return | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c0 = arith.constant 0 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16} | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) { | |
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1] { | |
ro %arg2[%c0 for %c16] : !stream.resource<external>{%c16}, | |
ro %arg3[%c0 for %c16] : !stream.resource<external>{%c16}, | |
wo %arg4[%c0 for %c16] : !stream.resource<external>{%c16} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16} | |
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view | |
return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FoldGlobalsPass //----- // | |
#map = affine_map<(d0) -> (d0)> | |
module { | |
stream.executable private @simple_mul_dispatch_0 { | |
stream.executable.export public @simple_mul_dispatch_0 | |
builtin.module { | |
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%5 = linalg.init_tensor [4] : tensor<4xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg3: f32, %arg4: f32, %arg5: f32): | |
%7 = arith.mulf %arg3, %arg4 : f32 | |
linalg.yield %7 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
return | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c0 = arith.constant 0 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16} | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) { | |
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1] { | |
ro %arg2[%c0 for %c16] : !stream.resource<external>{%c16}, | |
ro %arg3[%c0 for %c16] : !stream.resource<external>{%c16}, | |
wo %arg4[%c0 for %c16] : !stream.resource<external>{%c16} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16} | |
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view | |
return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FuseGlobalsPass //----- // | |
#map = affine_map<(d0) -> (d0)> | |
module { | |
stream.executable private @simple_mul_dispatch_0 { | |
stream.executable.export public @simple_mul_dispatch_0 | |
builtin.module { | |
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%5 = linalg.init_tensor [4] : tensor<4xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg3: f32, %arg4: f32, %arg5: f32): | |
%7 = arith.mulf %arg3, %arg4 : f32 | |
linalg.yield %7 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
return | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c0 = arith.constant 0 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16} | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) { | |
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1] { | |
ro %arg2[%c0 for %c16] : !stream.resource<external>{%c16}, | |
ro %arg3[%c0 for %c16] : !stream.resource<external>{%c16}, | |
wo %arg4[%c0 for %c16] : !stream.resource<external>{%c16} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16} | |
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view | |
return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After FuseDispatchBindings //----- // | |
#map = affine_map<(d0) -> (d0)> | |
module { | |
stream.executable private @simple_mul_dispatch_0 { | |
stream.executable.export public @simple_mul_dispatch_0 | |
builtin.module { | |
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: index, %arg4: index, %arg5: index) { | |
%c0 = arith.constant 0 : index | |
%0 = arith.addi %c0, %arg3 : index | |
%1 = stream.binding.subspan %arg0[%0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%2 = arith.addi %c0, %arg4 : index | |
%3 = stream.binding.subspan %arg1[%2] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%4 = arith.addi %c0, %arg5 : index | |
%5 = stream.binding.subspan %arg2[%4] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32> | |
%6 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%7 = flow.dispatch.tensor.load %3, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%8 = linalg.init_tensor [4] : tensor<4xf32> | |
%9 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%6, %7 : tensor<4xf32>, tensor<4xf32>) outs(%8 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg6: f32, %arg7: f32, %arg8: f32): | |
%10 = arith.mulf %arg6, %arg7 : f32 | |
linalg.yield %10 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %9, %5, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
return | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c0 = arith.constant 0 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16} | |
%c0_0 = arith.constant 0 : index | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) { | |
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%c0, %c0, %c0 : index, index, index) { | |
ro %arg2[%c0_0 for %c16] : !stream.resource<external>{%c16}, | |
ro %arg3[%c0_0 for %c16] : !stream.resource<external>{%c16}, | |
wo %arg4[%c0_0 for %c16] : !stream.resource<external>{%c16} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16} | |
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view | |
return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After PackDispatchOperands //----- // | |
#map = affine_map<(d0) -> (d0)> | |
module { | |
stream.executable private @simple_mul_dispatch_0 { | |
stream.executable.export public @simple_mul_dispatch_0 | |
builtin.module { | |
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: i32, %arg4: i32, %arg5: i32) { | |
%0 = arith.index_cast %arg3 : i32 to index | |
%1 = arith.index_cast %arg4 : i32 to index | |
%2 = arith.index_cast %arg5 : i32 to index | |
%c0 = arith.constant 0 : index | |
%3 = arith.addi %c0, %0 : index | |
%4 = stream.binding.subspan %arg0[%3] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%5 = arith.addi %c0, %1 : index | |
%6 = stream.binding.subspan %arg1[%5] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%7 = arith.addi %c0, %2 : index | |
%8 = stream.binding.subspan %arg2[%7] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32> | |
%9 = flow.dispatch.tensor.load %4, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%10 = flow.dispatch.tensor.load %6, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%11 = linalg.init_tensor [4] : tensor<4xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%9, %10 : tensor<4xf32>, tensor<4xf32>) outs(%11 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg6: f32, %arg7: f32, %arg8: f32): | |
%13 = arith.mulf %arg6, %arg7 : f32 | |
linalg.yield %13 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %12, %8, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
return | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c0 = arith.constant 0 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16} | |
%c0_0 = arith.constant 0 : index | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i32_1 = arith.constant 0 : i32 | |
%c0_i32_2 = arith.constant 0 : i32 | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) { | |
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%c0_i32, %c0_i32_1, %c0_i32_2 : i32, i32, i32) { | |
ro %arg2[%c0_0 for %c16] : !stream.resource<external>{%c16}, | |
ro %arg3[%c0_0 for %c16] : !stream.resource<external>{%c16}, | |
wo %arg4[%c0_0 for %c16] : !stream.resource<external>{%c16} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16} | |
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view | |
return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After CSE //----- // | |
#map = affine_map<(d0) -> (d0)> | |
module { | |
stream.executable private @simple_mul_dispatch_0 { | |
stream.executable.export public @simple_mul_dispatch_0 | |
builtin.module { | |
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: i32, %arg4: i32, %arg5: i32) { | |
%0 = arith.index_cast %arg3 : i32 to index | |
%1 = arith.index_cast %arg4 : i32 to index | |
%2 = arith.index_cast %arg5 : i32 to index | |
%c0 = arith.constant 0 : index | |
%3 = arith.addi %c0, %0 : index | |
%4 = stream.binding.subspan %arg0[%3] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%5 = arith.addi %c0, %1 : index | |
%6 = stream.binding.subspan %arg1[%5] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%7 = arith.addi %c0, %2 : index | |
%8 = stream.binding.subspan %arg2[%7] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32> | |
%9 = flow.dispatch.tensor.load %4, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%10 = flow.dispatch.tensor.load %6, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%11 = linalg.init_tensor [4] : tensor<4xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%9, %10 : tensor<4xf32>, tensor<4xf32>) outs(%11 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg6: f32, %arg7: f32, %arg8: f32): | |
%13 = arith.mulf %arg6, %arg7 : f32 | |
linalg.yield %13 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %12, %8, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
return | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16} | |
%c0 = arith.constant 0 : index | |
%c0_i32 = arith.constant 0 : i32 | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) { | |
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1](%c0_i32, %c0_i32, %c0_i32 : i32, i32, i32) { | |
ro %arg2[%c0 for %c16] : !stream.resource<external>{%c16}, | |
ro %arg3[%c0 for %c16] : !stream.resource<external>{%c16}, | |
wo %arg4[%c0 for %c16] : !stream.resource<external>{%c16} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16} | |
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view | |
return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After FoldUniformOperands //----- // | |
#map = affine_map<(d0) -> (d0)> | |
module { | |
stream.executable private @simple_mul_dispatch_0 { | |
stream.executable.export public @simple_mul_dispatch_0 | |
builtin.module { | |
func.func @simple_mul_dispatch_0(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) { | |
%c0_i32 = arith.constant 0 : i32 | |
%0 = arith.index_cast %c0_i32 : i32 to index | |
%1 = arith.index_cast %c0_i32 : i32 to index | |
%2 = arith.index_cast %c0_i32 : i32 to index | |
%c0 = arith.constant 0 : index | |
%3 = arith.addi %c0, %0 : index | |
%4 = stream.binding.subspan %arg0[%3] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%5 = arith.addi %c0, %1 : index | |
%6 = stream.binding.subspan %arg1[%5] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%7 = arith.addi %c0, %2 : index | |
%8 = stream.binding.subspan %arg2[%7] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32> | |
%9 = flow.dispatch.tensor.load %4, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%10 = flow.dispatch.tensor.load %6, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%11 = linalg.init_tensor [4] : tensor<4xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%9, %10 : tensor<4xf32>, tensor<4xf32>) outs(%11 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg3: f32, %arg4: f32, %arg5: f32): | |
%13 = arith.mulf %arg3, %arg4 : f32 | |
linalg.yield %13 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %12, %8, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
return | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16} | |
%c0 = arith.constant 0 : index | |
%c0_i32 = arith.constant 0 : i32 | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) { | |
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1] { | |
ro %arg2[%c0 for %c16] : !stream.resource<external>{%c16}, | |
ro %arg3[%c0 for %c16] : !stream.resource<external>{%c16}, | |
wo %arg4[%c0 for %c16] : !stream.resource<external>{%c16} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16} | |
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view | |
return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After AnnotateDispatchArguments //----- // | |
#map = affine_map<(d0) -> (d0)> | |
module { | |
stream.executable private @simple_mul_dispatch_0 { | |
stream.executable.export public @simple_mul_dispatch_0 | |
builtin.module { | |
func.func @simple_mul_dispatch_0(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) { | |
%c0_i32 = arith.constant 0 : i32 | |
%0 = arith.index_cast %c0_i32 : i32 to index | |
%1 = arith.index_cast %c0_i32 : i32 to index | |
%2 = arith.index_cast %c0_i32 : i32 to index | |
%c0 = arith.constant 0 : index | |
%3 = arith.addi %c0, %0 : index | |
%4 = stream.binding.subspan %arg0[%3] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%5 = arith.addi %c0, %1 : index | |
%6 = stream.binding.subspan %arg1[%5] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%7 = arith.addi %c0, %2 : index | |
%8 = stream.binding.subspan %arg2[%7] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32> | |
%9 = flow.dispatch.tensor.load %4, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%10 = flow.dispatch.tensor.load %6, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%11 = linalg.init_tensor [4] : tensor<4xf32> | |
%12 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%9, %10 : tensor<4xf32>, tensor<4xf32>) outs(%11 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg3: f32, %arg4: f32, %arg5: f32): | |
%13 = arith.mulf %arg3, %arg4 : f32 | |
linalg.yield %13 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %12, %8, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
return | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16} | |
%c0 = arith.constant 0 : index | |
%c0_i32 = arith.constant 0 : i32 | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) { | |
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1] { | |
ro %arg2[%c0 for %c16] : !stream.resource<external>{%c16}, | |
ro %arg3[%c0 for %c16] : !stream.resource<external>{%c16}, | |
wo %arg4[%c0 for %c16] : !stream.resource<external>{%c16} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16} | |
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view | |
return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After Canonicalizer //----- // | |
#map = affine_map<(d0) -> (d0)> | |
module { | |
stream.executable private @simple_mul_dispatch_0 { | |
stream.executable.export public @simple_mul_dispatch_0 | |
builtin.module { | |
func.func @simple_mul_dispatch_0(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%5 = linalg.init_tensor [4] : tensor<4xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg3: f32, %arg4: f32, %arg5: f32): | |
%7 = arith.mulf %arg3, %arg4 : f32 | |
linalg.yield %7 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
return | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c0 = arith.constant 0 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16} | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) { | |
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1] { | |
ro %arg2[%c0 for %c16] : !stream.resource<external>{%c16}, | |
ro %arg3[%c0 for %c16] : !stream.resource<external>{%c16}, | |
wo %arg4[%c0 for %c16] : !stream.resource<external>{%c16} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16} | |
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view | |
return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After CSE //----- // | |
#map = affine_map<(d0) -> (d0)> | |
module { | |
stream.executable private @simple_mul_dispatch_0 { | |
stream.executable.export public @simple_mul_dispatch_0 | |
builtin.module { | |
func.func @simple_mul_dispatch_0(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%5 = linalg.init_tensor [4] : tensor<4xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg3: f32, %arg4: f32, %arg5: f32): | |
%7 = arith.mulf %arg3, %arg4 : f32 | |
linalg.yield %7 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
return | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c0 = arith.constant 0 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16} | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) { | |
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1] { | |
ro %arg2[%c0 for %c16] : !stream.resource<external>{%c16}, | |
ro %arg3[%c0 for %c16] : !stream.resource<external>{%c16}, | |
wo %arg4[%c0 for %c16] : !stream.resource<external>{%c16} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16} | |
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view | |
return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::SimplifyGlobalAccessesPass //----- // | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c0 = arith.constant 0 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16} | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) { | |
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1] { | |
ro %arg2[%c0 for %c16] : !stream.resource<external>{%c16}, | |
ro %arg3[%c0 for %c16] : !stream.resource<external>{%c16}, | |
wo %arg4[%c0 for %c16] : !stream.resource<external>{%c16} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16} | |
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view | |
return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::Util::ApplyPatternsPass //----- // | |
#map = affine_map<(d0) -> (d0)> | |
module { | |
stream.executable private @simple_mul_dispatch_0 { | |
stream.executable.export public @simple_mul_dispatch_0 | |
builtin.module { | |
func.func @simple_mul_dispatch_0(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%5 = linalg.init_tensor [4] : tensor<4xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg3: f32, %arg4: f32, %arg5: f32): | |
%7 = arith.mulf %arg3, %arg4 : f32 | |
linalg.yield %7 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
return | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c0 = arith.constant 0 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16} | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) { | |
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1] { | |
ro %arg2[%c0 for %c16] : !stream.resource<external>{%c16}, | |
ro %arg3[%c0 for %c16] : !stream.resource<external>{%c16}, | |
wo %arg4[%c0 for %c16] : !stream.resource<external>{%c16} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16} | |
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view | |
return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FoldGlobalsPass //----- // | |
#map = affine_map<(d0) -> (d0)> | |
module { | |
stream.executable private @simple_mul_dispatch_0 { | |
stream.executable.export public @simple_mul_dispatch_0 | |
builtin.module { | |
func.func @simple_mul_dispatch_0(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%5 = linalg.init_tensor [4] : tensor<4xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg3: f32, %arg4: f32, %arg5: f32): | |
%7 = arith.mulf %arg3, %arg4 : f32 | |
linalg.yield %7 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
return | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c0 = arith.constant 0 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16} | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) { | |
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1] { | |
ro %arg2[%c0 for %c16] : !stream.resource<external>{%c16}, | |
ro %arg3[%c0 for %c16] : !stream.resource<external>{%c16}, | |
wo %arg4[%c0 for %c16] : !stream.resource<external>{%c16} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16} | |
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view | |
return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FuseGlobalsPass //----- // | |
#map = affine_map<(d0) -> (d0)> | |
module { | |
stream.executable private @simple_mul_dispatch_0 { | |
stream.executable.export public @simple_mul_dispatch_0 | |
builtin.module { | |
func.func @simple_mul_dispatch_0(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%5 = linalg.init_tensor [4] : tensor<4xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg3: f32, %arg4: f32, %arg5: f32): | |
%7 = arith.mulf %arg3, %arg4 : f32 | |
linalg.yield %7 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
return | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c0 = arith.constant 0 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16} | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) { | |
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1] { | |
ro %arg2[%c0 for %c16] : !stream.resource<external>{%c16}, | |
ro %arg3[%c0 for %c16] : !stream.resource<external>{%c16}, | |
wo %arg4[%c0 for %c16] : !stream.resource<external>{%c16} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16} | |
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view | |
return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After SymbolDCE //----- // | |
#map = affine_map<(d0) -> (d0)> | |
module { | |
stream.executable private @simple_mul_dispatch_0 { | |
stream.executable.export public @simple_mul_dispatch_0 | |
builtin.module { | |
func.func @simple_mul_dispatch_0(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%5 = linalg.init_tensor [4] : tensor<4xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg3: f32, %arg4: f32, %arg5: f32): | |
%7 = arith.mulf %arg3, %arg4 : f32 | |
linalg.yield %7 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
return | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c0 = arith.constant 0 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16} | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) { | |
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1] { | |
ro %arg2[%c0 for %c16] : !stream.resource<external>{%c16}, | |
ro %arg3[%c0 for %c16] : !stream.resource<external>{%c16}, | |
wo %arg4[%c0 for %c16] : !stream.resource<external>{%c16} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16} | |
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view | |
return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After Canonicalizer //----- // | |
#map = affine_map<(d0) -> (d0)> | |
module { | |
stream.executable private @simple_mul_dispatch_0 { | |
stream.executable.export public @simple_mul_dispatch_0 | |
builtin.module { | |
func.func @simple_mul_dispatch_0(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%5 = linalg.init_tensor [4] : tensor<4xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg3: f32, %arg4: f32, %arg5: f32): | |
%7 = arith.mulf %arg3, %arg4 : f32 | |
linalg.yield %7 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
return | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c0 = arith.constant 0 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16} | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) { | |
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1] { | |
ro %arg2[%c0 for %c16] : !stream.resource<external>{%c16}, | |
ro %arg3[%c0 for %c16] : !stream.resource<external>{%c16}, | |
wo %arg4[%c0 for %c16] : !stream.resource<external>{%c16} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16} | |
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view | |
return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After CSE //----- // | |
#map = affine_map<(d0) -> (d0)> | |
module { | |
stream.executable private @simple_mul_dispatch_0 { | |
stream.executable.export public @simple_mul_dispatch_0 | |
builtin.module { | |
func.func @simple_mul_dispatch_0(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%5 = linalg.init_tensor [4] : tensor<4xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg3: f32, %arg4: f32, %arg5: f32): | |
%7 = arith.mulf %arg3, %arg4 : f32 | |
linalg.yield %7 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
return | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c0 = arith.constant 0 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16} | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) { | |
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1] { | |
ro %arg2[%c0 for %c16] : !stream.resource<external>{%c16}, | |
ro %arg3[%c0 for %c16] : !stream.resource<external>{%c16}, | |
wo %arg4[%c0 for %c16] : !stream.resource<external>{%c16} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16} | |
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view | |
return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::SimplifyGlobalAccessesPass //----- // | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c0 = arith.constant 0 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16} | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) { | |
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1] { | |
ro %arg2[%c0 for %c16] : !stream.resource<external>{%c16}, | |
ro %arg3[%c0 for %c16] : !stream.resource<external>{%c16}, | |
wo %arg4[%c0 for %c16] : !stream.resource<external>{%c16} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16} | |
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view | |
return %5 : !hal.buffer_view | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::Util::ApplyPatternsPass //----- // | |
#map = affine_map<(d0) -> (d0)> | |
module { | |
stream.executable private @simple_mul_dispatch_0 { | |
stream.executable.export public @simple_mul_dispatch_0 | |
builtin.module { | |
func.func @simple_mul_dispatch_0(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%5 = linalg.init_tensor [4] : tensor<4xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg3: f32, %arg4: f32, %arg5: f32): | |
%7 = arith.mulf %arg3, %arg4 : f32 | |
linalg.yield %7 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
return | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c0 = arith.constant 0 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16} | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) { | |
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1] { | |
ro %arg2[%c0 for %c16] : !stream.resource<external>{%c16}, | |
ro %arg3[%c0 for %c16] : !stream.resource<external>{%c16}, | |
wo %arg4[%c0 for %c16] : !stream.resource<external>{%c16} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16} | |
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view | |
return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FoldGlobalsPass //----- // | |
#map = affine_map<(d0) -> (d0)> | |
module { | |
stream.executable private @simple_mul_dispatch_0 { | |
stream.executable.export public @simple_mul_dispatch_0 | |
builtin.module { | |
func.func @simple_mul_dispatch_0(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%5 = linalg.init_tensor [4] : tensor<4xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg3: f32, %arg4: f32, %arg5: f32): | |
%7 = arith.mulf %arg3, %arg4 : f32 | |
linalg.yield %7 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
return | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c0 = arith.constant 0 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16} | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) { | |
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1] { | |
ro %arg2[%c0 for %c16] : !stream.resource<external>{%c16}, | |
ro %arg3[%c0 for %c16] : !stream.resource<external>{%c16}, | |
wo %arg4[%c0 for %c16] : !stream.resource<external>{%c16} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16} | |
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view | |
return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FuseGlobalsPass //----- // | |
#map = affine_map<(d0) -> (d0)> | |
module { | |
stream.executable private @simple_mul_dispatch_0 { | |
stream.executable.export public @simple_mul_dispatch_0 | |
builtin.module { | |
func.func @simple_mul_dispatch_0(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%5 = linalg.init_tensor [4] : tensor<4xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg3: f32, %arg4: f32, %arg5: f32): | |
%7 = arith.mulf %arg3, %arg4 : f32 | |
linalg.yield %7 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
return | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c0 = arith.constant 0 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16} | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) { | |
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1] { | |
ro %arg2[%c0 for %c16] : !stream.resource<external>{%c16}, | |
ro %arg3[%c0 for %c16] : !stream.resource<external>{%c16}, | |
wo %arg4[%c0 for %c16] : !stream.resource<external>{%c16} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16} | |
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view | |
return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::AssignTargetDevicesPass //----- // | |
#device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]}> | |
#map = affine_map<(d0) -> (d0)> | |
module attributes {hal.device.targets = [#device_target_cpu]} { | |
stream.executable private @simple_mul_dispatch_0 { | |
stream.executable.export public @simple_mul_dispatch_0 | |
builtin.module { | |
func.func @simple_mul_dispatch_0(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%5 = linalg.init_tensor [4] : tensor<4xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg3: f32, %arg4: f32, %arg5: f32): | |
%7 = arith.mulf %arg3, %arg4 : f32 | |
linalg.yield %7 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
return | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c0 = arith.constant 0 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16} | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) { | |
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1] { | |
ro %arg2[%c0 for %c16] : !stream.resource<external>{%c16}, | |
ro %arg3[%c0 for %c16] : !stream.resource<external>{%c16}, | |
wo %arg4[%c0 for %c16] : !stream.resource<external>{%c16} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16} | |
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view | |
return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::VerifyTargetEnvironmentPass //----- // | |
#device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]}> | |
#map = affine_map<(d0) -> (d0)> | |
module attributes {hal.device.targets = [#device_target_cpu]} { | |
stream.executable private @simple_mul_dispatch_0 { | |
stream.executable.export public @simple_mul_dispatch_0 | |
builtin.module { | |
func.func @simple_mul_dispatch_0(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) { | |
%c0 = arith.constant 0 : index | |
%0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:4xf32> | |
%2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:4xf32> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%5 = linalg.init_tensor [4] : tensor<4xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg3: f32, %arg4: f32, %arg5: f32): | |
%7 = arith.mulf %arg3, %arg4 : f32 | |
linalg.yield %7 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
return | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c0 = arith.constant 0 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16} | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) { | |
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1] { | |
ro %arg2[%c0 for %c16] : !stream.resource<external>{%c16}, | |
ro %arg3[%c0 for %c16] : !stream.resource<external>{%c16}, | |
wo %arg4[%c0 for %c16] : !stream.resource<external>{%c16} | |
} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16} | |
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view | |
return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::(anonymous namespace)::MaterializeInterfacesPass //----- // | |
#device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]}> | |
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [#hal.descriptor_set.layout<0, bindings = [#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]>]> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<(d0) -> (d0)> | |
module attributes {hal.device.targets = [#device_target_cpu]} { | |
hal.executable private @simple_mul_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
hal.executable.export public @simple_mul_dispatch_0 ordinal(0) layout(#executable_layout) | |
builtin.module { | |
func.func @simple_mul_dispatch_0() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:4xf32> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%5 = linalg.init_tensor [4] : tensor<4xf32> | |
%6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg0: f32, %arg1: f32, %arg2: f32): | |
%7 = arith.mulf %arg0, %arg1 : f32 | |
linalg.yield %7 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
return | |
} | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c0 = arith.constant 0 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%1 = stream.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16} | |
%2 = stream.resource.alloc uninitialized : !stream.resource<external>{%c16} | |
%3 = stream.cmd.execute with(%0 as %arg2: !stream.resource<external>{%c16}, %1 as %arg3: !stream.resource<external>{%c16}, %2 as %arg4: !stream.resource<external>{%c16}) { | |
stream.cmd.dispatch @simple_mul_dispatch_0::@simple_mul_dispatch_0[%c4, %c1, %c1] { | |
ro %arg2[%c0 for %c16] : !stream.resource<external>{%c16}, | |
ro %arg3[%c0 for %c16] : !stream.resource<external>{%c16}, | |
wo %arg4[%c0 for %c16] : !stream.resource<external>{%c16} | |
} attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>]} | |
} => !stream.timepoint | |
%4 = stream.timepoint.await %3 => %2 : !stream.resource<external>{%c16} | |
%5 = stream.tensor.export %4 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view | |
return %5 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After VerifyLinalgTransformLegality //----- // | |
module { | |
func.func @simple_mul_dispatch_0() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:4xf32> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%5 = linalg.init_tensor [4] : tensor<4xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg0: f32, %arg1: f32, %arg2: f32): | |
%7 = arith.mulf %arg0, %arg1 : f32 | |
linalg.yield %7 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
return | |
} | |
} | |
// -----// IR Dump After TypePropagation //----- // | |
func.func @simple_mul_dispatch_0() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:4xf32> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%5 = linalg.init_tensor [4] : tensor<4xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg0: f32, %arg1: f32, %arg2: f32): | |
%7 = arith.mulf %arg0, %arg1 : f32 | |
linalg.yield %7 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
return | |
} | |
// -----// IR Dump After BufferizeCopyOnlyDispatches //----- // | |
module { | |
func.func @simple_mul_dispatch_0() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:4xf32> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%5 = linalg.init_tensor [4] : tensor<4xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} { | |
^bb0(%arg0: f32, %arg1: f32, %arg2: f32): | |
%7 = arith.mulf %arg0, %arg1 : f32 | |
linalg.yield %7 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
return | |
} | |
} | |
// -----// IR Dump After InsertDistributionInfo //----- // | |
hal.executable.variant public @embedded_elf_x86_64, target = <"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> { | |
hal.executable.export public @simple_mul_dispatch_0 ordinal(0) layout(#hal.executable.layout<push_constants = 0, sets = [#hal.descriptor_set.layout<0, bindings = [#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]>]>) {translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert workload_per_wg = [4]>} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%0 = affine.apply affine_map<()[s0] -> (s0 ceildiv 4)>()[%arg1] | |
hal.return %0, %c1, %c1 : index, index, index | |
} | |
builtin.module { | |
func.func @simple_mul_dispatch_0() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:4xf32> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%5 = linalg.init_tensor [4] : tensor<4xf32> | |
%6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[4], [4], [0]]>, name = "mul.1"} { | |
^bb0(%arg0: f32, %arg1: f32, %arg2: f32): | |
%7 = arith.mulf %arg0, %arg1 : f32 | |
linalg.yield %7 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
return | |
} | |
} | |
} | |
// -----// IR Dump After TileAndDistributeToWorkgroups //----- // | |
func.func @simple_mul_dispatch_0() { | |
%c1 = arith.constant 1 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:4xf32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_x] | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_x] | |
scf.for %arg0 = %3 to %c4 step %4 { | |
%5 = flow.dispatch.tensor.load %0, offsets = [%arg0], sizes = [4], strides = [%c1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%6 = flow.dispatch.tensor.load %1, offsets = [%arg0], sizes = [4], strides = [%c1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%7 = linalg.init_tensor [4] : tensor<4xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%5, %6 : tensor<4xf32>, tensor<4xf32>) outs(%7 : tensor<4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[4], [4], [0]]>, name = "mul.1"} { | |
^bb0(%arg1: f32, %arg2: f32, %arg3: f32): | |
%9 = arith.mulf %arg1, %arg2 : f32 | |
linalg.yield %9 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %8, %2, offsets = [%arg0], sizes = [4], strides = [%c1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
} | |
return | |
} | |
// -----// IR Dump After ConvertToDestinationPassingStyle //----- // | |
func.func @simple_mul_dispatch_0() { | |
%c1 = arith.constant 1 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:4xf32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_x] | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_x] | |
scf.for %arg0 = %3 to %c4 step %4 { | |
%c1_0 = arith.constant 1 : index | |
%5 = flow.dispatch.tensor.load %2, offsets = [%arg0], sizes = [4], strides = [%c1_0] : !flow.dispatch.tensor<writeonly:4xf32> -> tensor<4xf32> | |
%6 = flow.dispatch.tensor.load %0, offsets = [%arg0], sizes = [4], strides = [%c1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%7 = flow.dispatch.tensor.load %1, offsets = [%arg0], sizes = [4], strides = [%c1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%8 = linalg.init_tensor [4] : tensor<4xf32> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6, %7 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[4], [4], [0]]>, name = "mul.1"} { | |
^bb0(%arg1: f32, %arg2: f32, %arg3: f32): | |
%10 = arith.mulf %arg1, %arg2 : f32 | |
linalg.yield %10 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %9, %2, offsets = [%arg0], sizes = [4], strides = [%c1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
} | |
return | |
} | |
// -----// IR Dump After FoldAffineMinInDistributedLoops //----- // | |
func.func @simple_mul_dispatch_0() { | |
%c1 = arith.constant 1 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:4xf32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_x] | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_x] | |
scf.for %arg0 = %3 to %c4 step %4 { | |
%5 = flow.dispatch.tensor.load %2, offsets = [%arg0], sizes = [4], strides = [%c1] : !flow.dispatch.tensor<writeonly:4xf32> -> tensor<4xf32> | |
%6 = flow.dispatch.tensor.load %0, offsets = [%arg0], sizes = [4], strides = [%c1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%7 = flow.dispatch.tensor.load %1, offsets = [%arg0], sizes = [4], strides = [%c1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6, %7 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[4], [4], [0]]>, name = "mul.1"} { | |
^bb0(%arg1: f32, %arg2: f32, %arg3: f32): | |
%9 = arith.mulf %arg1, %arg2 : f32 | |
linalg.yield %9 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %8, %2, offsets = [%arg0], sizes = [4], strides = [%c1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
} | |
return | |
} | |
// -----// IR Dump After Canonicalizer //----- // | |
module { | |
func.func @simple_mul_dispatch_0() { | |
%c1 = arith.constant 1 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:4xf32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_x] | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_x] | |
scf.for %arg0 = %3 to %c4 step %4 { | |
%5 = flow.dispatch.tensor.load %2, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<writeonly:4xf32> -> tensor<4xf32> | |
%6 = flow.dispatch.tensor.load %0, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%7 = flow.dispatch.tensor.load %1, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6, %7 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[4], [4], [0]]>, name = "mul.1"} { | |
^bb0(%arg1: f32, %arg2: f32, %arg3: f32): | |
%9 = arith.mulf %arg1, %arg2 : f32 | |
linalg.yield %9 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %8, %2, offsets = [%arg0], sizes = [4], strides = [%c1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
} | |
return | |
} | |
} | |
// -----// IR Dump After CSE //----- // | |
module { | |
func.func @simple_mul_dispatch_0() { | |
%c1 = arith.constant 1 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:4xf32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_x] | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_x] | |
scf.for %arg0 = %3 to %c4 step %4 { | |
%5 = flow.dispatch.tensor.load %2, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<writeonly:4xf32> -> tensor<4xf32> | |
%6 = flow.dispatch.tensor.load %0, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%7 = flow.dispatch.tensor.load %1, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6, %7 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[4], [4], [0]]>, name = "mul.1"} { | |
^bb0(%arg1: f32, %arg2: f32, %arg3: f32): | |
%9 = arith.mulf %arg1, %arg2 : f32 | |
linalg.yield %9 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %8, %2, offsets = [%arg0], sizes = [4], strides = [%c1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
} | |
return | |
} | |
} | |
// -----// IR Dump After LinalgStrategyTileAndFusePass //----- // | |
func.func @simple_mul_dispatch_0() { | |
%c1 = arith.constant 1 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:4xf32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_x] | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_x] | |
scf.for %arg0 = %3 to %c4 step %4 { | |
%5 = flow.dispatch.tensor.load %2, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<writeonly:4xf32> -> tensor<4xf32> | |
%6 = flow.dispatch.tensor.load %0, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%7 = flow.dispatch.tensor.load %1, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%8 = scf.for %arg1 = %c0 to %c4 step %c4 iter_args(%arg2 = %5) -> (tensor<4xf32>) { | |
%9 = tensor.extract_slice %6[%arg1] [4] [1] : tensor<4xf32> to tensor<4xf32> | |
%10 = tensor.extract_slice %7[%arg1] [4] [1] : tensor<4xf32> to tensor<4xf32> | |
%11 = tensor.extract_slice %arg2[%arg1] [4] [1] : tensor<4xf32> to tensor<4xf32> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%9, %10 : tensor<4xf32>, tensor<4xf32>) outs(%11 : tensor<4xf32>) attrs = {__internal_linalg_transform__ = "1", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[4], [4], [0]]>, name = "mul.1"} { | |
^bb0(%arg3: f32, %arg4: f32, %arg5: f32): | |
%14 = arith.mulf %arg3, %arg4 : f32 | |
linalg.yield %14 : f32 | |
} -> tensor<4xf32> | |
%13 = tensor.insert_slice %12 into %arg2[%arg1] [4] [1] : tensor<4xf32> into tensor<4xf32> | |
scf.yield %13 : tensor<4xf32> | |
} | |
flow.dispatch.tensor.store %8, %2, offsets = [%arg0], sizes = [4], strides = [%c1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
} | |
return | |
} | |
// -----// IR Dump After CSE //----- // | |
func.func @simple_mul_dispatch_0() { | |
%c1 = arith.constant 1 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:4xf32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_x] | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_x] | |
scf.for %arg0 = %3 to %c4 step %4 { | |
%5 = flow.dispatch.tensor.load %2, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<writeonly:4xf32> -> tensor<4xf32> | |
%6 = flow.dispatch.tensor.load %0, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%7 = flow.dispatch.tensor.load %1, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6, %7 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {__internal_linalg_transform__ = "1", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[4], [4], [0]]>, name = "mul.1"} { | |
^bb0(%arg1: f32, %arg2: f32, %arg3: f32): | |
%9 = arith.mulf %arg1, %arg2 : f32 | |
linalg.yield %9 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %8, %2, offsets = [%arg0], sizes = [4], strides = [%c1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
} | |
return | |
} | |
// -----// IR Dump After LinalgStrategyEnablePass //----- // | |
func.func @simple_mul_dispatch_0() { | |
%c1 = arith.constant 1 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:4xf32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_x] | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_x] | |
scf.for %arg0 = %3 to %c4 step %4 { | |
%5 = flow.dispatch.tensor.load %2, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<writeonly:4xf32> -> tensor<4xf32> | |
%6 = flow.dispatch.tensor.load %0, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%7 = flow.dispatch.tensor.load %1, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6, %7 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {__internal_linalg_transform__ = "1", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[4], [4], [0]]>, name = "mul.1"} { | |
^bb0(%arg1: f32, %arg2: f32, %arg3: f32): | |
%9 = arith.mulf %arg1, %arg2 : f32 | |
linalg.yield %9 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %8, %2, offsets = [%arg0], sizes = [4], strides = [%c1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
} | |
return | |
} | |
// -----// IR Dump After LinalgStrategyRemoveMarkersPass //----- // | |
func.func @simple_mul_dispatch_0() { | |
%c1 = arith.constant 1 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:4xf32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_x] | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_x] | |
scf.for %arg0 = %3 to %c4 step %4 { | |
%5 = flow.dispatch.tensor.load %2, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<writeonly:4xf32> -> tensor<4xf32> | |
%6 = flow.dispatch.tensor.load %0, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%7 = flow.dispatch.tensor.load %1, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6, %7 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[4], [4], [0]]>, name = "mul.1"} { | |
^bb0(%arg1: f32, %arg2: f32, %arg3: f32): | |
%9 = arith.mulf %arg1, %arg2 : f32 | |
linalg.yield %9 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %8, %2, offsets = [%arg0], sizes = [4], strides = [%c1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
} | |
return | |
} | |
// -----// IR Dump After LinalgFuse //----- // | |
func.func @simple_mul_dispatch_0() { | |
%c1 = arith.constant 1 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:4xf32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_x] | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_x] | |
scf.for %arg0 = %3 to %c4 step %4 { | |
%5 = flow.dispatch.tensor.load %2, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<writeonly:4xf32> -> tensor<4xf32> | |
%6 = flow.dispatch.tensor.load %0, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%7 = flow.dispatch.tensor.load %1, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6, %7 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[4], [4], [0]]>, name = "mul.1"} { | |
^bb0(%arg1: f32, %arg2: f32, %arg3: f32): | |
%9 = arith.mulf %arg1, %arg2 : f32 | |
linalg.yield %9 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %8, %2, offsets = [%arg0], sizes = [4], strides = [%c1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
} | |
return | |
} | |
// -----// IR Dump After Canonicalizer //----- // | |
func.func @simple_mul_dispatch_0() { | |
%c1 = arith.constant 1 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:4xf32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_x] | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_x] | |
scf.for %arg0 = %3 to %c4 step %4 { | |
%5 = flow.dispatch.tensor.load %2, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<writeonly:4xf32> -> tensor<4xf32> | |
%6 = flow.dispatch.tensor.load %0, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%7 = flow.dispatch.tensor.load %1, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6, %7 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[4], [4], [0]]>, name = "mul.1"} { | |
^bb0(%arg1: f32, %arg2: f32, %arg3: f32): | |
%9 = arith.mulf %arg1, %arg2 : f32 | |
linalg.yield %9 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %8, %2, offsets = [%arg0], sizes = [4], strides = [%c1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
} | |
return | |
} | |
// -----// IR Dump After CSE //----- // | |
func.func @simple_mul_dispatch_0() { | |
%c1 = arith.constant 1 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:4xf32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_x] | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_x] | |
scf.for %arg0 = %3 to %c4 step %4 { | |
%5 = flow.dispatch.tensor.load %2, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<writeonly:4xf32> -> tensor<4xf32> | |
%6 = flow.dispatch.tensor.load %0, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%7 = flow.dispatch.tensor.load %1, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6, %7 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[4], [4], [0]]>, name = "mul.1"} { | |
^bb0(%arg1: f32, %arg2: f32, %arg3: f32): | |
%9 = arith.mulf %arg1, %arg2 : f32 | |
linalg.yield %9 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %8, %2, offsets = [%arg0], sizes = [4], strides = [%c1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
} | |
return | |
} | |
// -----// IR Dump After LinalgStrategyTilePass //----- // | |
func.func @simple_mul_dispatch_0() { | |
%c1 = arith.constant 1 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:4xf32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_x] | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_x] | |
scf.for %arg0 = %3 to %c4 step %4 { | |
%5 = flow.dispatch.tensor.load %2, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<writeonly:4xf32> -> tensor<4xf32> | |
%6 = flow.dispatch.tensor.load %0, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%7 = flow.dispatch.tensor.load %1, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6, %7 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {__internal_linalg_transform__ = "1", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[4], [4], [0]]>, name = "mul.1"} { | |
^bb0(%arg1: f32, %arg2: f32, %arg3: f32): | |
%9 = arith.mulf %arg1, %arg2 : f32 | |
linalg.yield %9 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %8, %2, offsets = [%arg0], sizes = [4], strides = [%c1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
} | |
return | |
} | |
// -----// IR Dump After CSE //----- // | |
func.func @simple_mul_dispatch_0() { | |
%c1 = arith.constant 1 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:4xf32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_x] | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_x] | |
scf.for %arg0 = %3 to %c4 step %4 { | |
%5 = flow.dispatch.tensor.load %2, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<writeonly:4xf32> -> tensor<4xf32> | |
%6 = flow.dispatch.tensor.load %0, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%7 = flow.dispatch.tensor.load %1, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6, %7 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {__internal_linalg_transform__ = "1", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[4], [4], [0]]>, name = "mul.1"} { | |
^bb0(%arg1: f32, %arg2: f32, %arg3: f32): | |
%9 = arith.mulf %arg1, %arg2 : f32 | |
linalg.yield %9 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %8, %2, offsets = [%arg0], sizes = [4], strides = [%c1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
} | |
return | |
} | |
// -----// IR Dump After LinalgStrategyEnablePass //----- // | |
func.func @simple_mul_dispatch_0() { | |
%c1 = arith.constant 1 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:4xf32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_x] | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_x] | |
scf.for %arg0 = %3 to %c4 step %4 { | |
%5 = flow.dispatch.tensor.load %2, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<writeonly:4xf32> -> tensor<4xf32> | |
%6 = flow.dispatch.tensor.load %0, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%7 = flow.dispatch.tensor.load %1, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6, %7 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {__internal_linalg_transform__ = "1", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[4], [4], [0]]>, name = "mul.1"} { | |
^bb0(%arg1: f32, %arg2: f32, %arg3: f32): | |
%9 = arith.mulf %arg1, %arg2 : f32 | |
linalg.yield %9 : f32 | |
} -> tensor<4xf32> | |
flow.dispatch.tensor.store %8, %2, offsets = [%arg0], sizes = [4], strides = [%c1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
} | |
return | |
} | |
// -----// IR Dump After LinalgStrategyVectorizePass //----- // | |
func.func @simple_mul_dispatch_0() { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c1 = arith.constant 1 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:4xf32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_x] | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_x] | |
scf.for %arg0 = %3 to %c4 step %4 { | |
%5 = flow.dispatch.tensor.load %2, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<writeonly:4xf32> -> tensor<4xf32> | |
%6 = flow.dispatch.tensor.load %0, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%7 = flow.dispatch.tensor.load %1, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%8 = vector.transfer_read %6[%c0], %cst {in_bounds = [true]} : tensor<4xf32>, vector<4xf32> | |
%9 = vector.transfer_read %7[%c0], %cst {in_bounds = [true]} : tensor<4xf32>, vector<4xf32> | |
%10 = arith.mulf %8, %9 : vector<4xf32> | |
%11 = vector.transfer_write %10, %5[%c0] {in_bounds = [true]} : vector<4xf32>, tensor<4xf32> | |
flow.dispatch.tensor.store %11, %2, offsets = [%arg0], sizes = [4], strides = [%c1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
} | |
return | |
} | |
// -----// IR Dump After CSE //----- // | |
func.func @simple_mul_dispatch_0() { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c1 = arith.constant 1 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:4xf32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_x] | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_x] | |
scf.for %arg0 = %3 to %c4 step %4 { | |
%5 = flow.dispatch.tensor.load %2, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<writeonly:4xf32> -> tensor<4xf32> | |
%6 = flow.dispatch.tensor.load %0, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%7 = flow.dispatch.tensor.load %1, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%8 = vector.transfer_read %6[%c0], %cst {in_bounds = [true]} : tensor<4xf32>, vector<4xf32> | |
%9 = vector.transfer_read %7[%c0], %cst {in_bounds = [true]} : tensor<4xf32>, vector<4xf32> | |
%10 = arith.mulf %8, %9 : vector<4xf32> | |
%11 = vector.transfer_write %10, %5[%c0] {in_bounds = [true]} : vector<4xf32>, tensor<4xf32> | |
flow.dispatch.tensor.store %11, %2, offsets = [%arg0], sizes = [4], strides = [%c1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
} | |
return | |
} | |
// -----// IR Dump After LinalgStrategyEnablePass //----- // | |
func.func @simple_mul_dispatch_0() { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c1 = arith.constant 1 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:4xf32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_x] | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_x] | |
scf.for %arg0 = %3 to %c4 step %4 { | |
%5 = flow.dispatch.tensor.load %2, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<writeonly:4xf32> -> tensor<4xf32> | |
%6 = flow.dispatch.tensor.load %0, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%7 = flow.dispatch.tensor.load %1, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%8 = vector.transfer_read %6[%c0], %cst {in_bounds = [true]} : tensor<4xf32>, vector<4xf32> | |
%9 = vector.transfer_read %7[%c0], %cst {in_bounds = [true]} : tensor<4xf32>, vector<4xf32> | |
%10 = arith.mulf %8, %9 : vector<4xf32> | |
%11 = vector.transfer_write %10, %5[%c0] {in_bounds = [true]} : vector<4xf32>, tensor<4xf32> | |
flow.dispatch.tensor.store %11, %2, offsets = [%arg0], sizes = [4], strides = [%c1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
} | |
return | |
} | |
// -----// IR Dump After LinalgStrategyRemoveMarkersPass //----- // | |
func.func @simple_mul_dispatch_0() { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c1 = arith.constant 1 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:4xf32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_x] | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_x] | |
scf.for %arg0 = %3 to %c4 step %4 { | |
%5 = flow.dispatch.tensor.load %2, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<writeonly:4xf32> -> tensor<4xf32> | |
%6 = flow.dispatch.tensor.load %0, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%7 = flow.dispatch.tensor.load %1, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%8 = vector.transfer_read %6[%c0], %cst {in_bounds = [true]} : tensor<4xf32>, vector<4xf32> | |
%9 = vector.transfer_read %7[%c0], %cst {in_bounds = [true]} : tensor<4xf32>, vector<4xf32> | |
%10 = arith.mulf %8, %9 : vector<4xf32> | |
%11 = vector.transfer_write %10, %5[%c0] {in_bounds = [true]} : vector<4xf32>, tensor<4xf32> | |
flow.dispatch.tensor.store %11, %2, offsets = [%arg0], sizes = [4], strides = [%c1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
} | |
return | |
} | |
// -----// IR Dump After LinalgSingleTilingExpert //----- // | |
func.func @simple_mul_dispatch_0() { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c1 = arith.constant 1 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:4xf32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_x] | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_x] | |
scf.for %arg0 = %3 to %c4 step %4 { | |
%5 = flow.dispatch.tensor.load %2, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<writeonly:4xf32> -> tensor<4xf32> | |
%6 = flow.dispatch.tensor.load %0, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%7 = flow.dispatch.tensor.load %1, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%8 = vector.transfer_read %6[%c0], %cst {in_bounds = [true]} : tensor<4xf32>, vector<4xf32> | |
%9 = vector.transfer_read %7[%c0], %cst {in_bounds = [true]} : tensor<4xf32>, vector<4xf32> | |
%10 = arith.mulf %8, %9 : vector<4xf32> | |
%11 = vector.transfer_write %10, %5[%c0] {in_bounds = [true]} : vector<4xf32>, tensor<4xf32> | |
flow.dispatch.tensor.store %11, %2, offsets = [%arg0], sizes = [4], strides = [%c1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
} | |
return | |
} | |
// -----// IR Dump After Canonicalizer //----- // | |
func.func @simple_mul_dispatch_0() { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c1 = arith.constant 1 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:4xf32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_x] | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_x] | |
scf.for %arg0 = %3 to %c4 step %4 { | |
%5 = flow.dispatch.tensor.load %2, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<writeonly:4xf32> -> tensor<4xf32> | |
%6 = flow.dispatch.tensor.load %0, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%7 = flow.dispatch.tensor.load %1, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%8 = vector.transfer_read %6[%c0], %cst {in_bounds = [true]} : tensor<4xf32>, vector<4xf32> | |
%9 = vector.transfer_read %7[%c0], %cst {in_bounds = [true]} : tensor<4xf32>, vector<4xf32> | |
%10 = arith.mulf %8, %9 : vector<4xf32> | |
%11 = vector.transfer_write %10, %5[%c0] {in_bounds = [true]} : vector<4xf32>, tensor<4xf32> | |
flow.dispatch.tensor.store %11, %2, offsets = [%arg0], sizes = [4], strides = [%c1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
} | |
return | |
} | |
// -----// IR Dump After CSE //----- // | |
func.func @simple_mul_dispatch_0() { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c1 = arith.constant 1 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:4xf32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_x] | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_x] | |
scf.for %arg0 = %3 to %c4 step %4 { | |
%5 = flow.dispatch.tensor.load %2, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<writeonly:4xf32> -> tensor<4xf32> | |
%6 = flow.dispatch.tensor.load %0, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%7 = flow.dispatch.tensor.load %1, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%8 = vector.transfer_read %6[%c0], %cst {in_bounds = [true]} : tensor<4xf32>, vector<4xf32> | |
%9 = vector.transfer_read %7[%c0], %cst {in_bounds = [true]} : tensor<4xf32>, vector<4xf32> | |
%10 = arith.mulf %8, %9 : vector<4xf32> | |
%11 = vector.transfer_write %10, %5[%c0] {in_bounds = [true]} : vector<4xf32>, tensor<4xf32> | |
flow.dispatch.tensor.store %11, %2, offsets = [%arg0], sizes = [4], strides = [%c1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
} | |
return | |
} | |
// -----// IR Dump After LinalgInitTensorToAllocTensor //----- // | |
module { | |
func.func @simple_mul_dispatch_0() { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c1 = arith.constant 1 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:4xf32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_x] | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_x] | |
scf.for %arg0 = %3 to %c4 step %4 { | |
%5 = flow.dispatch.tensor.load %2, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<writeonly:4xf32> -> tensor<4xf32> | |
%6 = flow.dispatch.tensor.load %0, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%7 = flow.dispatch.tensor.load %1, offsets = [%arg0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<4xf32> | |
%8 = vector.transfer_read %6[%c0], %cst {in_bounds = [true]} : tensor<4xf32>, vector<4xf32> | |
%9 = vector.transfer_read %7[%c0], %cst {in_bounds = [true]} : tensor<4xf32>, vector<4xf32> | |
%10 = arith.mulf %8, %9 : vector<4xf32> | |
%11 = vector.transfer_write %10, %5[%c0] {in_bounds = [true]} : vector<4xf32>, tensor<4xf32> | |
flow.dispatch.tensor.store %11, %2, offsets = [%arg0], sizes = [4], strides = [%c1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:4xf32> | |
} | |
return | |
} | |
} | |
// -----// IR Dump After IREEComprehensiveBufferize //----- // | |
module { | |
func.func @simple_mul_dispatch_0() { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c1 = arith.constant 1 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %0, 64 : memref<4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %2, 64 : memref<4xf32> | |
%3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32> | |
%4 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %4, 64 : memref<4xf32> | |
%5 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:4xf32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%6 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_x] | |
%7 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_x] | |
scf.for %arg0 = %6 to %c4 step %7 { | |
%8 = bufferization.to_tensor %4 : memref<4xf32> | |
%9 = bufferization.to_tensor %0 : memref<4xf32> | |
%10 = bufferization.to_tensor %2 : memref<4xf32> | |
%11 = vector.transfer_read %0[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32> | |
%12 = vector.transfer_read %2[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32> | |
%13 = arith.mulf %11, %12 : vector<4xf32> | |
vector.transfer_write %13, %4[%c0] {in_bounds = [true]} : vector<4xf32>, memref<4xf32> | |
%14 = bufferization.to_tensor %4 : memref<4xf32> | |
linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%4 : memref<4xf32>) outs(%4 : memref<4xf32>) { | |
^bb0(%arg1: f32, %arg2: f32): | |
linalg.yield %arg1 : f32 | |
} | |
} | |
return | |
} | |
} | |
// -----// IR Dump After ResolveShapedTypeResultDims //----- // | |
module { | |
func.func @simple_mul_dispatch_0() { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %0, 64 : memref<4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %2, 64 : memref<4xf32> | |
%3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32> | |
%4 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %4, 64 : memref<4xf32> | |
%5 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:4xf32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%6 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_x] | |
%7 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_x] | |
scf.for %arg0 = %6 to %c4 step %7 { | |
%8 = vector.transfer_read %0[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32> | |
%9 = vector.transfer_read %2[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32> | |
%10 = arith.mulf %8, %9 : vector<4xf32> | |
vector.transfer_write %10, %4[%c0] {in_bounds = [true]} : vector<4xf32>, memref<4xf32> | |
linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%4 : memref<4xf32>) outs(%4 : memref<4xf32>) { | |
^bb0(%arg1: f32, %arg2: f32): | |
linalg.yield %arg1 : f32 | |
} | |
} | |
return | |
} | |
} | |
// -----// IR Dump After Canonicalizer //----- // | |
func.func @simple_mul_dispatch_0() { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %0, 64 : memref<4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %2, 64 : memref<4xf32> | |
%3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32> | |
%4 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %4, 64 : memref<4xf32> | |
%5 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:4xf32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%6 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_x] | |
%7 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_x] | |
scf.for %arg0 = %6 to %c4 step %7 { | |
%8 = vector.transfer_read %0[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32> | |
%9 = vector.transfer_read %2[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32> | |
%10 = arith.mulf %8, %9 : vector<4xf32> | |
vector.transfer_write %10, %4[%c0] {in_bounds = [true]} : vector<4xf32>, memref<4xf32> | |
} | |
return | |
} | |
// -----// IR Dump After CSE //----- // | |
func.func @simple_mul_dispatch_0() { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %0, 64 : memref<4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %2, 64 : memref<4xf32> | |
%3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32> | |
%4 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %4, 64 : memref<4xf32> | |
%5 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:4xf32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%6 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_x] | |
%7 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_x] | |
scf.for %arg0 = %6 to %c4 step %7 { | |
%8 = vector.transfer_read %0[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32> | |
%9 = vector.transfer_read %2[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32> | |
%10 = arith.mulf %8, %9 : vector<4xf32> | |
vector.transfer_write %10, %4[%c0] {in_bounds = [true]} : vector<4xf32>, memref<4xf32> | |
} | |
return | |
} | |
// -----// IR Dump After Canonicalizer //----- // | |
func.func @simple_mul_dispatch_0() { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %0, 64 : memref<4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %2, 64 : memref<4xf32> | |
%3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:4xf32> | |
%4 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %4, 64 : memref<4xf32> | |
%5 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:4xf32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%6 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_x] | |
%7 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_x] | |
scf.for %arg0 = %6 to %c4 step %7 { | |
%8 = vector.transfer_read %0[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32> | |
%9 = vector.transfer_read %2[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32> | |
%10 = arith.mulf %8, %9 : vector<4xf32> | |
vector.transfer_write %10, %4[%c0] {in_bounds = [true]} : vector<4xf32>, memref<4xf32> | |
} | |
return | |
} | |
// -----// IR Dump After CleanupBufferAllocView //----- // | |
func.func @simple_mul_dispatch_0() { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %0, 64 : memref<4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %1, 64 : memref<4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %2, 64 : memref<4xf32> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_count_x = hal.interface.workgroup.count[0] : index | |
%3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_x] | |
%4 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_x] | |
scf.for %arg0 = %3 to %c4 step %4 { | |
%5 = vector.transfer_read %0[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32> | |
%6 = vector.transfer_read %1[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32> | |
%7 = arith.mulf %5, %6 : vector<4xf32> | |
vector.transfer_write %7, %2[%c0] {in_bounds = [true]} : vector<4xf32>, memref<4xf32> | |
} | |
return | |
} | |
// -----// IR Dump After RemoveSingleIterationLoop //----- // | |
func.func @simple_mul_dispatch_0() { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %0, 64 : memref<4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %1, 64 : memref<4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %2, 64 : memref<4xf32> | |
%3 = vector.transfer_read %0[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32> | |
%4 = vector.transfer_read %1[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32> | |
%5 = arith.mulf %3, %4 : vector<4xf32> | |
vector.transfer_write %5, %2[%c0] {in_bounds = [true]} : vector<4xf32>, memref<4xf32> | |
return | |
} | |
// -----// IR Dump After LinalgStrategyLowerVectorsPass //----- // | |
func.func @simple_mul_dispatch_0() { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %0, 64 : memref<4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %1, 64 : memref<4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %2, 64 : memref<4xf32> | |
%3 = vector.transfer_read %0[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32> | |
%4 = vector.transfer_read %1[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32> | |
%5 = arith.mulf %3, %4 : vector<4xf32> | |
vector.transfer_write %5, %2[%c0] {in_bounds = [true]} : vector<4xf32>, memref<4xf32> | |
return | |
} | |
// -----// IR Dump After CSE //----- // | |
func.func @simple_mul_dispatch_0() { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %0, 64 : memref<4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %1, 64 : memref<4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %2, 64 : memref<4xf32> | |
%3 = vector.transfer_read %0[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32> | |
%4 = vector.transfer_read %1[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32> | |
%5 = arith.mulf %3, %4 : vector<4xf32> | |
vector.transfer_write %5, %2[%c0] {in_bounds = [true]} : vector<4xf32>, memref<4xf32> | |
return | |
} | |
// -----// IR Dump After LinalgStrategyEnablePass //----- // | |
func.func @simple_mul_dispatch_0() { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %0, 64 : memref<4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %1, 64 : memref<4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %2, 64 : memref<4xf32> | |
%3 = vector.transfer_read %0[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32> | |
%4 = vector.transfer_read %1[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32> | |
%5 = arith.mulf %3, %4 : vector<4xf32> | |
vector.transfer_write %5, %2[%c0] {in_bounds = [true]} : vector<4xf32>, memref<4xf32> | |
return | |
} | |
// -----// IR Dump After LinalgStrategyRemoveMarkersPass //----- // | |
func.func @simple_mul_dispatch_0() { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %0, 64 : memref<4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %1, 64 : memref<4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %2, 64 : memref<4xf32> | |
%3 = vector.transfer_read %0[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32> | |
%4 = vector.transfer_read %1[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32> | |
%5 = arith.mulf %3, %4 : vector<4xf32> | |
vector.transfer_write %5, %2[%c0] {in_bounds = [true]} : vector<4xf32>, memref<4xf32> | |
return | |
} | |
// -----// IR Dump After LinalgVectorLowering //----- // | |
func.func @simple_mul_dispatch_0() { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %0, 64 : memref<4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %1, 64 : memref<4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %2, 64 : memref<4xf32> | |
%3 = vector.transfer_read %0[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32> | |
%4 = vector.transfer_read %1[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32> | |
%5 = arith.mulf %3, %4 : vector<4xf32> | |
vector.transfer_write %5, %2[%c0] {in_bounds = [true]} : vector<4xf32>, memref<4xf32> | |
return | |
} | |
// -----// IR Dump After Canonicalizer //----- // | |
func.func @simple_mul_dispatch_0() { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %0, 64 : memref<4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %1, 64 : memref<4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %2, 64 : memref<4xf32> | |
%3 = vector.transfer_read %0[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32> | |
%4 = vector.transfer_read %1[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32> | |
%5 = arith.mulf %3, %4 : vector<4xf32> | |
vector.transfer_write %5, %2[%c0] {in_bounds = [true]} : vector<4xf32>, memref<4xf32> | |
return | |
} | |
// -----// IR Dump After CSE //----- // | |
func.func @simple_mul_dispatch_0() { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %0, 64 : memref<4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %1, 64 : memref<4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %2, 64 : memref<4xf32> | |
%3 = vector.transfer_read %0[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32> | |
%4 = vector.transfer_read %1[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32> | |
%5 = arith.mulf %3, %4 : vector<4xf32> | |
vector.transfer_write %5, %2[%c0] {in_bounds = [true]} : vector<4xf32>, memref<4xf32> | |
return | |
} | |
// -----// IR Dump After LinalgStrategyLowerVectorsPass //----- // | |
func.func @simple_mul_dispatch_0() { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %0, 64 : memref<4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %1, 64 : memref<4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %2, 64 : memref<4xf32> | |
%3 = vector.transfer_read %0[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32> | |
%4 = vector.transfer_read %1[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32> | |
%5 = arith.mulf %3, %4 : vector<4xf32> | |
vector.transfer_write %5, %2[%c0] {in_bounds = [true]} : vector<4xf32>, memref<4xf32> | |
return | |
} | |
// -----// IR Dump After CSE //----- // | |
func.func @simple_mul_dispatch_0() { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %0, 64 : memref<4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %1, 64 : memref<4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %2, 64 : memref<4xf32> | |
%3 = vector.transfer_read %0[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32> | |
%4 = vector.transfer_read %1[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32> | |
%5 = arith.mulf %3, %4 : vector<4xf32> | |
vector.transfer_write %5, %2[%c0] {in_bounds = [true]} : vector<4xf32>, memref<4xf32> | |
return | |
} | |
// -----// IR Dump After LinalgStrategyEnablePass //----- // | |
func.func @simple_mul_dispatch_0() { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %0, 64 : memref<4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %1, 64 : memref<4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %2, 64 : memref<4xf32> | |
%3 = vector.transfer_read %0[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32> | |
%4 = vector.transfer_read %1[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32> | |
%5 = arith.mulf %3, %4 : vector<4xf32> | |
vector.transfer_write %5, %2[%c0] {in_bounds = [true]} : vector<4xf32>, memref<4xf32> | |
return | |
} | |
// -----// IR Dump After LinalgStrategyRemoveMarkersPass //----- // | |
func.func @simple_mul_dispatch_0() { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %0, 64 : memref<4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %1, 64 : memref<4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %2, 64 : memref<4xf32> | |
%3 = vector.transfer_read %0[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32> | |
%4 = vector.transfer_read %1[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32> | |
%5 = arith.mulf %3, %4 : vector<4xf32> | |
vector.transfer_write %5, %2[%c0] {in_bounds = [true]} : vector<4xf32>, memref<4xf32> | |
return | |
} | |
// -----// IR Dump After LinalgVectorLowering //----- // | |
func.func @simple_mul_dispatch_0() { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %0, 64 : memref<4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %1, 64 : memref<4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %2, 64 : memref<4xf32> | |
%3 = vector.transfer_read %0[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32> | |
%4 = vector.transfer_read %1[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32> | |
%5 = arith.mulf %3, %4 : vector<4xf32> | |
vector.transfer_write %5, %2[%c0] {in_bounds = [true]} : vector<4xf32>, memref<4xf32> | |
return | |
} | |
// -----// IR Dump After Canonicalizer //----- // | |
func.func @simple_mul_dispatch_0() { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %0, 64 : memref<4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %1, 64 : memref<4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %2, 64 : memref<4xf32> | |
%3 = vector.transfer_read %0[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32> | |
%4 = vector.transfer_read %1[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32> | |
%5 = arith.mulf %3, %4 : vector<4xf32> | |
vector.transfer_write %5, %2[%c0] {in_bounds = [true]} : vector<4xf32>, memref<4xf32> | |
return | |
} | |
// -----// IR Dump After CSE //----- // | |
func.func @simple_mul_dispatch_0() { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %0, 64 : memref<4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %1, 64 : memref<4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %2, 64 : memref<4xf32> | |
%3 = vector.transfer_read %0[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32> | |
%4 = vector.transfer_read %1[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32> | |
%5 = arith.mulf %3, %4 : vector<4xf32> | |
vector.transfer_write %5, %2[%c0] {in_bounds = [true]} : vector<4xf32>, memref<4xf32> | |
return | |
} | |
// -----// IR Dump After LinalgStrategyLowerVectorsPass //----- // | |
func.func @simple_mul_dispatch_0() { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %0, 64 : memref<4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %1, 64 : memref<4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %2, 64 : memref<4xf32> | |
%3 = vector.transfer_read %0[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32> | |
%4 = vector.transfer_read %1[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32> | |
%5 = arith.mulf %3, %4 : vector<4xf32> | |
vector.transfer_write %5, %2[%c0] {in_bounds = [true]} : vector<4xf32>, memref<4xf32> | |
return | |
} | |
// -----// IR Dump After CSE //----- // | |
func.func @simple_mul_dispatch_0() { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %0, 64 : memref<4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %1, 64 : memref<4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %2, 64 : memref<4xf32> | |
%3 = vector.transfer_read %0[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32> | |
%4 = vector.transfer_read %1[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32> | |
%5 = arith.mulf %3, %4 : vector<4xf32> | |
vector.transfer_write %5, %2[%c0] {in_bounds = [true]} : vector<4xf32>, memref<4xf32> | |
return | |
} | |
// -----// IR Dump After LinalgStrategyEnablePass //----- // | |
func.func @simple_mul_dispatch_0() { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %0, 64 : memref<4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %1, 64 : memref<4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %2, 64 : memref<4xf32> | |
%3 = vector.transfer_read %0[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32> | |
%4 = vector.transfer_read %1[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32> | |
%5 = arith.mulf %3, %4 : vector<4xf32> | |
vector.transfer_write %5, %2[%c0] {in_bounds = [true]} : vector<4xf32>, memref<4xf32> | |
return | |
} | |
// -----// IR Dump After LinalgStrategyRemoveMarkersPass //----- // | |
func.func @simple_mul_dispatch_0() { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %0, 64 : memref<4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %1, 64 : memref<4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %2, 64 : memref<4xf32> | |
%3 = vector.transfer_read %0[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32> | |
%4 = vector.transfer_read %1[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32> | |
%5 = arith.mulf %3, %4 : vector<4xf32> | |
vector.transfer_write %5, %2[%c0] {in_bounds = [true]} : vector<4xf32>, memref<4xf32> | |
return | |
} | |
// -----// IR Dump After LinalgVectorLowering //----- // | |
func.func @simple_mul_dispatch_0() { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %0, 64 : memref<4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %1, 64 : memref<4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %2, 64 : memref<4xf32> | |
%3 = vector.transfer_read %0[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32> | |
%4 = vector.transfer_read %1[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32> | |
%5 = arith.mulf %3, %4 : vector<4xf32> | |
vector.transfer_write %5, %2[%c0] {in_bounds = [true]} : vector<4xf32>, memref<4xf32> | |
return | |
} | |
// -----// IR Dump After Canonicalizer //----- // | |
func.func @simple_mul_dispatch_0() { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %0, 64 : memref<4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %1, 64 : memref<4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %2, 64 : memref<4xf32> | |
%3 = vector.transfer_read %0[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32> | |
%4 = vector.transfer_read %1[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32> | |
%5 = arith.mulf %3, %4 : vector<4xf32> | |
vector.transfer_write %5, %2[%c0] {in_bounds = [true]} : vector<4xf32>, memref<4xf32> | |
return | |
} | |
// -----// IR Dump After CSE //----- // | |
func.func @simple_mul_dispatch_0() { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %0, 64 : memref<4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %1, 64 : memref<4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %2, 64 : memref<4xf32> | |
%3 = vector.transfer_read %0[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32> | |
%4 = vector.transfer_read %1[%c0], %cst {in_bounds = [true]} : memref<4xf32>, vector<4xf32> | |
%5 = arith.mulf %3, %4 : vector<4xf32> | |
vector.transfer_write %5, %2[%c0] {in_bounds = [true]} : vector<4xf32>, memref<4xf32> | |
return | |
} | |
// -----// IR Dump After LinalgStrategyLowerVectorsPass //----- // | |
func.func @simple_mul_dispatch_0() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %0, 64 : memref<4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %1, 64 : memref<4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %2, 64 : memref<4xf32> | |
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32> | |
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32> | |
%5 = arith.mulf %3, %4 : vector<4xf32> | |
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32> | |
return | |
} | |
// -----// IR Dump After CSE //----- // | |
func.func @simple_mul_dispatch_0() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %0, 64 : memref<4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %1, 64 : memref<4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %2, 64 : memref<4xf32> | |
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32> | |
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32> | |
%5 = arith.mulf %3, %4 : vector<4xf32> | |
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32> | |
return | |
} | |
// -----// IR Dump After LinalgStrategyEnablePass //----- // | |
func.func @simple_mul_dispatch_0() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %0, 64 : memref<4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %1, 64 : memref<4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %2, 64 : memref<4xf32> | |
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32> | |
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32> | |
%5 = arith.mulf %3, %4 : vector<4xf32> | |
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32> | |
return | |
} | |
// -----// IR Dump After LinalgStrategyRemoveMarkersPass //----- // | |
func.func @simple_mul_dispatch_0() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %0, 64 : memref<4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %1, 64 : memref<4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %2, 64 : memref<4xf32> | |
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32> | |
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32> | |
%5 = arith.mulf %3, %4 : vector<4xf32> | |
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32> | |
return | |
} | |
// -----// IR Dump After LinalgVectorLowering //----- // | |
func.func @simple_mul_dispatch_0() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %0, 64 : memref<4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %1, 64 : memref<4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %2, 64 : memref<4xf32> | |
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32> | |
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32> | |
%5 = arith.mulf %3, %4 : vector<4xf32> | |
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32> | |
return | |
} | |
// -----// IR Dump After Canonicalizer //----- // | |
func.func @simple_mul_dispatch_0() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %0, 64 : memref<4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %1, 64 : memref<4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %2, 64 : memref<4xf32> | |
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32> | |
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32> | |
%5 = arith.mulf %3, %4 : vector<4xf32> | |
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32> | |
return | |
} | |
// -----// IR Dump After CSE //----- // | |
func.func @simple_mul_dispatch_0() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %0, 64 : memref<4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %1, 64 : memref<4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %2, 64 : memref<4xf32> | |
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32> | |
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32> | |
%5 = arith.mulf %3, %4 : vector<4xf32> | |
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32> | |
return | |
} | |
// -----// IR Dump After LinalgStrategyLowerVectorsPass //----- // | |
func.func @simple_mul_dispatch_0() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %0, 64 : memref<4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %1, 64 : memref<4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %2, 64 : memref<4xf32> | |
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32> | |
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32> | |
%5 = arith.mulf %3, %4 : vector<4xf32> | |
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32> | |
return | |
} | |
// -----// IR Dump After CSE //----- // | |
func.func @simple_mul_dispatch_0() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %0, 64 : memref<4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %1, 64 : memref<4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %2, 64 : memref<4xf32> | |
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32> | |
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32> | |
%5 = arith.mulf %3, %4 : vector<4xf32> | |
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32> | |
return | |
} | |
// -----// IR Dump After LinalgStrategyEnablePass //----- // | |
func.func @simple_mul_dispatch_0() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %0, 64 : memref<4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %1, 64 : memref<4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %2, 64 : memref<4xf32> | |
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32> | |
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32> | |
%5 = arith.mulf %3, %4 : vector<4xf32> | |
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32> | |
return | |
} | |
// -----// IR Dump After LinalgStrategyRemoveMarkersPass //----- // | |
func.func @simple_mul_dispatch_0() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %0, 64 : memref<4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %1, 64 : memref<4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %2, 64 : memref<4xf32> | |
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32> | |
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32> | |
%5 = arith.mulf %3, %4 : vector<4xf32> | |
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32> | |
return | |
} | |
// -----// IR Dump After LinalgVectorLowering //----- // | |
func.func @simple_mul_dispatch_0() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %0, 64 : memref<4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %1, 64 : memref<4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %2, 64 : memref<4xf32> | |
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32> | |
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32> | |
%5 = arith.mulf %3, %4 : vector<4xf32> | |
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32> | |
return | |
} | |
// -----// IR Dump After Canonicalizer //----- // | |
func.func @simple_mul_dispatch_0() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %0, 64 : memref<4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %1, 64 : memref<4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %2, 64 : memref<4xf32> | |
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32> | |
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32> | |
%5 = arith.mulf %3, %4 : vector<4xf32> | |
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32> | |
return | |
} | |
// -----// IR Dump After CSE //----- // | |
func.func @simple_mul_dispatch_0() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %0, 64 : memref<4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %1, 64 : memref<4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %2, 64 : memref<4xf32> | |
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32> | |
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32> | |
%5 = arith.mulf %3, %4 : vector<4xf32> | |
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32> | |
return | |
} | |
// -----// IR Dump After LinalgStrategyLowerVectorsPass //----- // | |
func.func @simple_mul_dispatch_0() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %0, 64 : memref<4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %1, 64 : memref<4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %2, 64 : memref<4xf32> | |
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32> | |
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32> | |
%5 = arith.mulf %3, %4 : vector<4xf32> | |
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32> | |
return | |
} | |
// -----// IR Dump After CSE //----- // | |
func.func @simple_mul_dispatch_0() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %0, 64 : memref<4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %1, 64 : memref<4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %2, 64 : memref<4xf32> | |
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32> | |
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32> | |
%5 = arith.mulf %3, %4 : vector<4xf32> | |
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32> | |
return | |
} | |
// -----// IR Dump After LinalgStrategyEnablePass //----- // | |
func.func @simple_mul_dispatch_0() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %0, 64 : memref<4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %1, 64 : memref<4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %2, 64 : memref<4xf32> | |
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32> | |
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32> | |
%5 = arith.mulf %3, %4 : vector<4xf32> | |
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32> | |
return | |
} | |
// -----// IR Dump After LinalgStrategyRemoveMarkersPass //----- // | |
func.func @simple_mul_dispatch_0() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %0, 64 : memref<4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %1, 64 : memref<4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %2, 64 : memref<4xf32> | |
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32> | |
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32> | |
%5 = arith.mulf %3, %4 : vector<4xf32> | |
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32> | |
return | |
} | |
// -----// IR Dump After LinalgVectorLowering //----- // | |
func.func @simple_mul_dispatch_0() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %0, 64 : memref<4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %1, 64 : memref<4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %2, 64 : memref<4xf32> | |
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32> | |
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32> | |
%5 = arith.mulf %3, %4 : vector<4xf32> | |
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32> | |
return | |
} | |
// -----// IR Dump After Canonicalizer //----- // | |
func.func @simple_mul_dispatch_0() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %0, 64 : memref<4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %1, 64 : memref<4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %2, 64 : memref<4xf32> | |
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32> | |
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32> | |
%5 = arith.mulf %3, %4 : vector<4xf32> | |
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32> | |
return | |
} | |
// -----// IR Dump After CSE //----- // | |
func.func @simple_mul_dispatch_0() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %0, 64 : memref<4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %1, 64 : memref<4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %2, 64 : memref<4xf32> | |
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32> | |
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32> | |
%5 = arith.mulf %3, %4 : vector<4xf32> | |
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32> | |
return | |
} | |
// -----// IR Dump After LinalgStrategyLowerVectorsPass //----- // | |
func.func @simple_mul_dispatch_0() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %0, 64 : memref<4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %1, 64 : memref<4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %2, 64 : memref<4xf32> | |
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32> | |
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32> | |
%5 = arith.mulf %3, %4 : vector<4xf32> | |
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32> | |
return | |
} | |
// -----// IR Dump After CSE //----- // | |
func.func @simple_mul_dispatch_0() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %0, 64 : memref<4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %1, 64 : memref<4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %2, 64 : memref<4xf32> | |
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32> | |
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32> | |
%5 = arith.mulf %3, %4 : vector<4xf32> | |
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32> | |
return | |
} | |
// -----// IR Dump After LinalgStrategyEnablePass //----- // | |
func.func @simple_mul_dispatch_0() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %0, 64 : memref<4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %1, 64 : memref<4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %2, 64 : memref<4xf32> | |
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32> | |
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32> | |
%5 = arith.mulf %3, %4 : vector<4xf32> | |
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32> | |
return | |
} | |
// -----// IR Dump After LinalgStrategyRemoveMarkersPass //----- // | |
func.func @simple_mul_dispatch_0() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %0, 64 : memref<4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %1, 64 : memref<4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %2, 64 : memref<4xf32> | |
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32> | |
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32> | |
%5 = arith.mulf %3, %4 : vector<4xf32> | |
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32> | |
return | |
} | |
// -----// IR Dump After LinalgVectorLowering //----- // | |
func.func @simple_mul_dispatch_0() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %0, 64 : memref<4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %1, 64 : memref<4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %2, 64 : memref<4xf32> | |
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32> | |
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32> | |
%5 = arith.mulf %3, %4 : vector<4xf32> | |
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32> | |
return | |
} | |
// -----// IR Dump After Canonicalizer //----- // | |
func.func @simple_mul_dispatch_0() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %0, 64 : memref<4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %1, 64 : memref<4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %2, 64 : memref<4xf32> | |
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32> | |
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32> | |
%5 = arith.mulf %3, %4 : vector<4xf32> | |
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32> | |
return | |
} | |
// -----// IR Dump After CSE //----- // | |
func.func @simple_mul_dispatch_0() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %0, 64 : memref<4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %1, 64 : memref<4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %2, 64 : memref<4xf32> | |
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32> | |
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32> | |
%5 = arith.mulf %3, %4 : vector<4xf32> | |
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32> | |
return | |
} | |
// -----// IR Dump After LLVMCPULowerExecutableTarget //----- // | |
hal.executable.variant public @embedded_elf_x86_64, target = <"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> { | |
hal.executable.export public @simple_mul_dispatch_0 ordinal(0) layout(#hal.executable.layout<push_constants = 0, sets = [#hal.descriptor_set.layout<0, bindings = [#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]>]>) {translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert workload_per_wg = [4]>} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%0 = affine.apply affine_map<()[s0] -> (s0 ceildiv 4)>()[%arg1] | |
hal.return %0, %c1, %c1 : index, index, index | |
} | |
builtin.module { | |
func.func @simple_mul_dispatch_0() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %0, 64 : memref<4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %1, 64 : memref<4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %2, 64 : memref<4xf32> | |
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32> | |
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32> | |
%5 = arith.mulf %3, %4 : vector<4xf32> | |
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32> | |
return | |
} | |
} | |
} | |
// -----// IR Dump After LinalgExtToLoops //----- // | |
func.func @simple_mul_dispatch_0() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %0, 64 : memref<4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %1, 64 : memref<4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %2, 64 : memref<4xf32> | |
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32> | |
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32> | |
%5 = arith.mulf %3, %4 : vector<4xf32> | |
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32> | |
return | |
} | |
// -----// IR Dump After MemrefCopyToLinalgPass //----- // | |
func.func @simple_mul_dispatch_0() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %0, 64 : memref<4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %1, 64 : memref<4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %2, 64 : memref<4xf32> | |
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32> | |
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32> | |
%5 = arith.mulf %3, %4 : vector<4xf32> | |
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32> | |
return | |
} | |
// -----// IR Dump After LinalgLowerToLoops //----- // | |
func.func @simple_mul_dispatch_0() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %0, 64 : memref<4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %1, 64 : memref<4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %2, 64 : memref<4xf32> | |
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32> | |
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32> | |
%5 = arith.mulf %3, %4 : vector<4xf32> | |
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32> | |
return | |
} | |
// -----// IR Dump After Canonicalizer //----- // | |
func.func @simple_mul_dispatch_0() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %0, 64 : memref<4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %1, 64 : memref<4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %2, 64 : memref<4xf32> | |
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32> | |
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32> | |
%5 = arith.mulf %3, %4 : vector<4xf32> | |
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32> | |
return | |
} | |
// -----// IR Dump After CSE //----- // | |
func.func @simple_mul_dispatch_0() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %0, 64 : memref<4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %1, 64 : memref<4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %2, 64 : memref<4xf32> | |
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32> | |
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32> | |
%5 = arith.mulf %3, %4 : vector<4xf32> | |
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32> | |
return | |
} | |
// -----// IR Dump After ArithmeticBufferize //----- // | |
module { | |
func.func @simple_mul_dispatch_0() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %0, 64 : memref<4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %1, 64 : memref<4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %2, 64 : memref<4xf32> | |
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32> | |
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32> | |
%5 = arith.mulf %3, %4 : vector<4xf32> | |
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32> | |
return | |
} | |
} | |
// -----// IR Dump After FoldTensorExtractOp //----- // | |
module { | |
func.func @simple_mul_dispatch_0() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %0, 64 : memref<4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %1, 64 : memref<4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %2, 64 : memref<4xf32> | |
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32> | |
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32> | |
%5 = arith.mulf %3, %4 : vector<4xf32> | |
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32> | |
return | |
} | |
} | |
// -----// IR Dump After PolynomialApproximationPass //----- // | |
func.func @simple_mul_dispatch_0() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %0, 64 : memref<4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %1, 64 : memref<4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %2, 64 : memref<4xf32> | |
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32> | |
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32> | |
%5 = arith.mulf %3, %4 : vector<4xf32> | |
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32> | |
return | |
} | |
// -----// IR Dump After LLVMCPUCheckIRBeforeLLVMConversion //----- // | |
module { | |
func.func @simple_mul_dispatch_0() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %0, 64 : memref<4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %1, 64 : memref<4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %2, 64 : memref<4xf32> | |
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32> | |
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32> | |
%5 = arith.mulf %3, %4 : vector<4xf32> | |
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32> | |
return | |
} | |
} | |
// -----// IR Dump After SCFToControlFlow //----- // | |
func.func @simple_mul_dispatch_0() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %0, 64 : memref<4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %1, 64 : memref<4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %2, 64 : memref<4xf32> | |
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32> | |
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32> | |
%5 = arith.mulf %3, %4 : vector<4xf32> | |
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32> | |
return | |
} | |
// -----// IR Dump After Canonicalizer //----- // | |
func.func @simple_mul_dispatch_0() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %0, 64 : memref<4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %1, 64 : memref<4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %2, 64 : memref<4xf32> | |
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32> | |
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32> | |
%5 = arith.mulf %3, %4 : vector<4xf32> | |
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32> | |
return | |
} | |
// -----// IR Dump After CSE //----- // | |
func.func @simple_mul_dispatch_0() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %0, 64 : memref<4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %1, 64 : memref<4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %2, 64 : memref<4xf32> | |
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32> | |
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32> | |
%5 = arith.mulf %3, %4 : vector<4xf32> | |
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32> | |
return | |
} | |
// -----// IR Dump After ArithmeticExpandOps //----- // | |
func.func @simple_mul_dispatch_0() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %0, 64 : memref<4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %1, 64 : memref<4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %2, 64 : memref<4xf32> | |
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32> | |
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32> | |
%5 = arith.mulf %3, %4 : vector<4xf32> | |
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32> | |
return | |
} | |
// -----// IR Dump After ExpandOps //----- // | |
func.func @simple_mul_dispatch_0() { | |
%c0 = arith.constant 0 : index | |
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %0, 64 : memref<4xf32> | |
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %1, 64 : memref<4xf32> | |
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32> | |
memref.assume_alignment %2, 64 : memref<4xf32> | |
%3 = vector.load %0[%c0] : memref<4xf32>, vector<4xf32> | |
%4 = vector.load %1[%c0] : memref<4xf32>, vector<4xf32> | |
%5 = arith.mulf %3, %4 : vector<4xf32> | |
vector.store %5, %2[%c0] : memref<4xf32>, vector<4xf32> | |
return | |
} | |
// -----// IR Dump After ConvertToLLVM //----- // | |
module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { | |
llvm.func internal @simple_mul_dispatch_0(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 { | |
%0 = llvm.mlir.constant(0 : index) : i64 | |
%1 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%2 = llvm.extractvalue %1[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%3 = llvm.mlir.constant(0 : i64) : i64 | |
%4 = llvm.load %2 : !llvm.ptr<ptr<i8>> | |
%5 = llvm.bitcast %4 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%6 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)> | |
%7 = llvm.insertvalue %5, %6[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)> | |
%8 = llvm.insertvalue %5, %7[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)> | |
%9 = llvm.mlir.constant(0 : index) : i64 | |
%10 = llvm.insertvalue %9, %8[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)> | |
%11 = llvm.mlir.constant(4 : index) : i64 | |
%12 = llvm.insertvalue %11, %10[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)> | |
%13 = llvm.mlir.constant(1 : index) : i64 | |
%14 = llvm.insertvalue %13, %12[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)> | |
%15 = llvm.extractvalue %14[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)> | |
%16 = llvm.mlir.constant(0 : index) : i64 | |
%17 = llvm.mlir.constant(63 : index) : i64 | |
%18 = llvm.ptrtoint %15 : !llvm.ptr<f32> to i64 | |
%19 = llvm.and %18, %17 : i64 | |
%20 = llvm.icmp "eq" %19, %16 : i64 | |
"llvm.intr.assume"(%20) : (i1) -> () | |
%21 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%22 = llvm.extractvalue %21[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%23 = llvm.mlir.constant(1 : i64) : i64 | |
%24 = llvm.getelementptr %22[%23] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
%25 = llvm.load %24 : !llvm.ptr<ptr<i8>> | |
%26 = llvm.bitcast %25 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%27 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)> | |
%28 = llvm.insertvalue %26, %27[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)> | |
%29 = llvm.insertvalue %26, %28[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)> | |
%30 = llvm.mlir.constant(0 : index) : i64 | |
%31 = llvm.insertvalue %30, %29[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)> | |
%32 = llvm.mlir.constant(4 : index) : i64 | |
%33 = llvm.insertvalue %32, %31[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)> | |
%34 = llvm.mlir.constant(1 : index) : i64 | |
%35 = llvm.insertvalue %34, %33[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)> | |
%36 = llvm.extractvalue %35[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)> | |
%37 = llvm.mlir.constant(0 : index) : i64 | |
%38 = llvm.mlir.constant(63 : index) : i64 | |
%39 = llvm.ptrtoint %36 : !llvm.ptr<f32> to i64 | |
%40 = llvm.and %39, %38 : i64 | |
%41 = llvm.icmp "eq" %40, %37 : i64 | |
"llvm.intr.assume"(%41) : (i1) -> () | |
%42 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%43 = llvm.extractvalue %42[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%44 = llvm.mlir.constant(2 : i64) : i64 | |
%45 = llvm.getelementptr %43[%44] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
%46 = llvm.load %45 : !llvm.ptr<ptr<i8>> | |
%47 = llvm.bitcast %46 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%48 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)> | |
%49 = llvm.insertvalue %47, %48[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)> | |
%50 = llvm.insertvalue %47, %49[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)> | |
%51 = llvm.mlir.constant(0 : index) : i64 | |
%52 = llvm.insertvalue %51, %50[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)> | |
%53 = llvm.mlir.constant(4 : index) : i64 | |
%54 = llvm.insertvalue %53, %52[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)> | |
%55 = llvm.mlir.constant(1 : index) : i64 | |
%56 = llvm.insertvalue %55, %54[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)> | |
%57 = llvm.extractvalue %56[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)> | |
%58 = llvm.mlir.constant(0 : index) : i64 | |
%59 = llvm.mlir.constant(63 : index) : i64 | |
%60 = llvm.ptrtoint %57 : !llvm.ptr<f32> to i64 | |
%61 = llvm.and %60, %59 : i64 | |
%62 = llvm.icmp "eq" %61, %58 : i64 | |
"llvm.intr.assume"(%62) : (i1) -> () | |
%63 = llvm.extractvalue %14[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)> | |
%64 = llvm.getelementptr %63[%0] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32> | |
%65 = llvm.bitcast %64 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
%66 = llvm.load %65 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
%67 = llvm.extractvalue %35[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)> | |
%68 = llvm.getelementptr %67[%0] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32> | |
%69 = llvm.bitcast %68 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
%70 = llvm.load %69 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
%71 = llvm.fmul %66, %70 : vector<4xf32> | |
%72 = llvm.extractvalue %56[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)> | |
%73 = llvm.getelementptr %72[%0] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32> | |
%74 = llvm.bitcast %73 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
llvm.store %71, %74 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
%75 = llvm.mlir.constant(0 : i32) : i32 | |
llvm.return %75 : i32 | |
} | |
} | |
// -----// IR Dump After ReconcileUnrealizedCasts //----- // | |
module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { | |
llvm.func internal @simple_mul_dispatch_0(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 { | |
%0 = llvm.mlir.constant(0 : index) : i64 | |
%1 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%2 = llvm.extractvalue %1[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%3 = llvm.mlir.constant(0 : i64) : i64 | |
%4 = llvm.load %2 : !llvm.ptr<ptr<i8>> | |
%5 = llvm.bitcast %4 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%6 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)> | |
%7 = llvm.insertvalue %5, %6[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)> | |
%8 = llvm.insertvalue %5, %7[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)> | |
%9 = llvm.mlir.constant(0 : index) : i64 | |
%10 = llvm.insertvalue %9, %8[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)> | |
%11 = llvm.mlir.constant(4 : index) : i64 | |
%12 = llvm.insertvalue %11, %10[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)> | |
%13 = llvm.mlir.constant(1 : index) : i64 | |
%14 = llvm.insertvalue %13, %12[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)> | |
%15 = llvm.mlir.constant(0 : index) : i64 | |
%16 = llvm.mlir.constant(63 : index) : i64 | |
%17 = llvm.ptrtoint %5 : !llvm.ptr<f32> to i64 | |
%18 = llvm.and %17, %16 : i64 | |
%19 = llvm.icmp "eq" %18, %15 : i64 | |
"llvm.intr.assume"(%19) : (i1) -> () | |
%20 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%21 = llvm.extractvalue %20[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%22 = llvm.mlir.constant(1 : i64) : i64 | |
%23 = llvm.getelementptr %21[%22] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
%24 = llvm.load %23 : !llvm.ptr<ptr<i8>> | |
%25 = llvm.bitcast %24 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%26 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)> | |
%27 = llvm.insertvalue %25, %26[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)> | |
%28 = llvm.insertvalue %25, %27[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)> | |
%29 = llvm.mlir.constant(0 : index) : i64 | |
%30 = llvm.insertvalue %29, %28[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)> | |
%31 = llvm.mlir.constant(4 : index) : i64 | |
%32 = llvm.insertvalue %31, %30[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)> | |
%33 = llvm.mlir.constant(1 : index) : i64 | |
%34 = llvm.insertvalue %33, %32[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)> | |
%35 = llvm.mlir.constant(0 : index) : i64 | |
%36 = llvm.mlir.constant(63 : index) : i64 | |
%37 = llvm.ptrtoint %25 : !llvm.ptr<f32> to i64 | |
%38 = llvm.and %37, %36 : i64 | |
%39 = llvm.icmp "eq" %38, %35 : i64 | |
"llvm.intr.assume"(%39) : (i1) -> () | |
%40 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%41 = llvm.extractvalue %40[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%42 = llvm.mlir.constant(2 : i64) : i64 | |
%43 = llvm.getelementptr %41[%42] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
%44 = llvm.load %43 : !llvm.ptr<ptr<i8>> | |
%45 = llvm.bitcast %44 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%46 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)> | |
%47 = llvm.insertvalue %45, %46[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)> | |
%48 = llvm.insertvalue %45, %47[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)> | |
%49 = llvm.mlir.constant(0 : index) : i64 | |
%50 = llvm.insertvalue %49, %48[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)> | |
%51 = llvm.mlir.constant(4 : index) : i64 | |
%52 = llvm.insertvalue %51, %50[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)> | |
%53 = llvm.mlir.constant(1 : index) : i64 | |
%54 = llvm.insertvalue %53, %52[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)> | |
%55 = llvm.mlir.constant(0 : index) : i64 | |
%56 = llvm.mlir.constant(63 : index) : i64 | |
%57 = llvm.ptrtoint %45 : !llvm.ptr<f32> to i64 | |
%58 = llvm.and %57, %56 : i64 | |
%59 = llvm.icmp "eq" %58, %55 : i64 | |
"llvm.intr.assume"(%59) : (i1) -> () | |
%60 = llvm.bitcast %5 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
%61 = llvm.load %60 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
%62 = llvm.bitcast %25 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
%63 = llvm.load %62 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
%64 = llvm.fmul %61, %63 : vector<4xf32> | |
%65 = llvm.bitcast %45 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
llvm.store %64, %65 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
%66 = llvm.mlir.constant(0 : i32) : i32 | |
llvm.return %66 : i32 | |
} | |
} | |
// -----// IR Dump After LLVMCPUSynchronizeSymbolVisibility //----- // | |
module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { | |
llvm.func internal @simple_mul_dispatch_0(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} { | |
%0 = llvm.mlir.constant(0 : index) : i64 | |
%1 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%2 = llvm.extractvalue %1[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%3 = llvm.mlir.constant(0 : i64) : i64 | |
%4 = llvm.load %2 : !llvm.ptr<ptr<i8>> | |
%5 = llvm.bitcast %4 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%6 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)> | |
%7 = llvm.insertvalue %5, %6[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)> | |
%8 = llvm.insertvalue %5, %7[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)> | |
%9 = llvm.mlir.constant(0 : index) : i64 | |
%10 = llvm.insertvalue %9, %8[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)> | |
%11 = llvm.mlir.constant(4 : index) : i64 | |
%12 = llvm.insertvalue %11, %10[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)> | |
%13 = llvm.mlir.constant(1 : index) : i64 | |
%14 = llvm.insertvalue %13, %12[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)> | |
%15 = llvm.mlir.constant(0 : index) : i64 | |
%16 = llvm.mlir.constant(63 : index) : i64 | |
%17 = llvm.ptrtoint %5 : !llvm.ptr<f32> to i64 | |
%18 = llvm.and %17, %16 : i64 | |
%19 = llvm.icmp "eq" %18, %15 : i64 | |
"llvm.intr.assume"(%19) : (i1) -> () | |
%20 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%21 = llvm.extractvalue %20[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%22 = llvm.mlir.constant(1 : i64) : i64 | |
%23 = llvm.getelementptr %21[%22] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
%24 = llvm.load %23 : !llvm.ptr<ptr<i8>> | |
%25 = llvm.bitcast %24 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%26 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)> | |
%27 = llvm.insertvalue %25, %26[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)> | |
%28 = llvm.insertvalue %25, %27[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)> | |
%29 = llvm.mlir.constant(0 : index) : i64 | |
%30 = llvm.insertvalue %29, %28[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)> | |
%31 = llvm.mlir.constant(4 : index) : i64 | |
%32 = llvm.insertvalue %31, %30[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)> | |
%33 = llvm.mlir.constant(1 : index) : i64 | |
%34 = llvm.insertvalue %33, %32[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)> | |
%35 = llvm.mlir.constant(0 : index) : i64 | |
%36 = llvm.mlir.constant(63 : index) : i64 | |
%37 = llvm.ptrtoint %25 : !llvm.ptr<f32> to i64 | |
%38 = llvm.and %37, %36 : i64 | |
%39 = llvm.icmp "eq" %38, %35 : i64 | |
"llvm.intr.assume"(%39) : (i1) -> () | |
%40 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%41 = llvm.extractvalue %40[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%42 = llvm.mlir.constant(2 : i64) : i64 | |
%43 = llvm.getelementptr %41[%42] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
%44 = llvm.load %43 : !llvm.ptr<ptr<i8>> | |
%45 = llvm.bitcast %44 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%46 = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)> | |
%47 = llvm.insertvalue %45, %46[0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)> | |
%48 = llvm.insertvalue %45, %47[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)> | |
%49 = llvm.mlir.constant(0 : index) : i64 | |
%50 = llvm.insertvalue %49, %48[2] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)> | |
%51 = llvm.mlir.constant(4 : index) : i64 | |
%52 = llvm.insertvalue %51, %50[3, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)> | |
%53 = llvm.mlir.constant(1 : index) : i64 | |
%54 = llvm.insertvalue %53, %52[4, 0] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)> | |
%55 = llvm.mlir.constant(0 : index) : i64 | |
%56 = llvm.mlir.constant(63 : index) : i64 | |
%57 = llvm.ptrtoint %45 : !llvm.ptr<f32> to i64 | |
%58 = llvm.and %57, %56 : i64 | |
%59 = llvm.icmp "eq" %58, %55 : i64 | |
"llvm.intr.assume"(%59) : (i1) -> () | |
%60 = llvm.bitcast %5 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
%61 = llvm.load %60 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
%62 = llvm.bitcast %25 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
%63 = llvm.load %62 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
%64 = llvm.fmul %61, %63 : vector<4xf32> | |
%65 = llvm.bitcast %45 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
llvm.store %64, %65 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
%66 = llvm.mlir.constant(0 : i32) : i32 | |
llvm.return %66 : i32 | |
} | |
} | |
// -----// IR Dump After Canonicalizer //----- // | |
module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { | |
llvm.func internal @simple_mul_dispatch_0(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} { | |
%0 = llvm.mlir.constant(0 : i32) : i32 | |
%1 = llvm.mlir.constant(2 : i64) : i64 | |
%2 = llvm.mlir.constant(1 : i64) : i64 | |
%3 = llvm.mlir.constant(63 : index) : i64 | |
%4 = llvm.mlir.constant(0 : index) : i64 | |
%5 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%6 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%7 = llvm.load %6 : !llvm.ptr<ptr<i8>> | |
%8 = llvm.bitcast %7 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%9 = llvm.ptrtoint %8 : !llvm.ptr<f32> to i64 | |
%10 = llvm.and %9, %3 : i64 | |
%11 = llvm.icmp "eq" %10, %4 : i64 | |
"llvm.intr.assume"(%11) : (i1) -> () | |
%12 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%13 = llvm.extractvalue %12[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%14 = llvm.getelementptr %13[%2] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
%15 = llvm.load %14 : !llvm.ptr<ptr<i8>> | |
%16 = llvm.bitcast %15 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%17 = llvm.ptrtoint %16 : !llvm.ptr<f32> to i64 | |
%18 = llvm.and %17, %3 : i64 | |
%19 = llvm.icmp "eq" %18, %4 : i64 | |
"llvm.intr.assume"(%19) : (i1) -> () | |
%20 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%21 = llvm.extractvalue %20[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%22 = llvm.getelementptr %21[%1] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>> | |
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%25 = llvm.ptrtoint %24 : !llvm.ptr<f32> to i64 | |
%26 = llvm.and %25, %3 : i64 | |
%27 = llvm.icmp "eq" %26, %4 : i64 | |
"llvm.intr.assume"(%27) : (i1) -> () | |
%28 = llvm.bitcast %8 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
%29 = llvm.load %28 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
%30 = llvm.bitcast %16 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
%31 = llvm.load %30 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
%32 = llvm.fmul %29, %31 : vector<4xf32> | |
%33 = llvm.bitcast %24 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
llvm.store %32, %33 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
llvm.return %0 : i32 | |
} | |
} | |
// -----// IR Dump After CSE //----- // | |
module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { | |
llvm.func internal @simple_mul_dispatch_0(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} { | |
%0 = llvm.mlir.constant(0 : i32) : i32 | |
%1 = llvm.mlir.constant(2 : i64) : i64 | |
%2 = llvm.mlir.constant(1 : i64) : i64 | |
%3 = llvm.mlir.constant(63 : index) : i64 | |
%4 = llvm.mlir.constant(0 : index) : i64 | |
%5 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%6 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%7 = llvm.load %6 : !llvm.ptr<ptr<i8>> | |
%8 = llvm.bitcast %7 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%9 = llvm.ptrtoint %8 : !llvm.ptr<f32> to i64 | |
%10 = llvm.and %9, %3 : i64 | |
%11 = llvm.icmp "eq" %10, %4 : i64 | |
"llvm.intr.assume"(%11) : (i1) -> () | |
%12 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%13 = llvm.extractvalue %12[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%14 = llvm.getelementptr %13[%2] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
%15 = llvm.load %14 : !llvm.ptr<ptr<i8>> | |
%16 = llvm.bitcast %15 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%17 = llvm.ptrtoint %16 : !llvm.ptr<f32> to i64 | |
%18 = llvm.and %17, %3 : i64 | |
%19 = llvm.icmp "eq" %18, %4 : i64 | |
"llvm.intr.assume"(%19) : (i1) -> () | |
%20 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%21 = llvm.extractvalue %20[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%22 = llvm.getelementptr %21[%1] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>> | |
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%25 = llvm.ptrtoint %24 : !llvm.ptr<f32> to i64 | |
%26 = llvm.and %25, %3 : i64 | |
%27 = llvm.icmp "eq" %26, %4 : i64 | |
"llvm.intr.assume"(%27) : (i1) -> () | |
%28 = llvm.bitcast %8 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
%29 = llvm.load %28 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
%30 = llvm.bitcast %16 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
%31 = llvm.load %30 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
%32 = llvm.fmul %29, %31 : vector<4xf32> | |
%33 = llvm.bitcast %24 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
llvm.store %32, %33 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
llvm.return %0 : i32 | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::TranslateTargetExecutableVariantsPass //----- // | |
hal.executable.variant public @embedded_elf_x86_64, target = <"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> { | |
hal.executable.export public @simple_mul_dispatch_0 ordinal(0) layout(#hal.executable.layout<push_constants = 0, sets = [#hal.descriptor_set.layout<0, bindings = [#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]>]>) {translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert workload_per_wg = [4]>} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%0 = affine.apply affine_map<()[s0] -> (s0 ceildiv 4)>()[%arg1] | |
hal.return %0, %c1, %c1 : index, index, index | |
} | |
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { | |
llvm.func internal @simple_mul_dispatch_0(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} { | |
%0 = llvm.mlir.constant(0 : i32) : i32 | |
%1 = llvm.mlir.constant(2 : i64) : i64 | |
%2 = llvm.mlir.constant(1 : i64) : i64 | |
%3 = llvm.mlir.constant(63 : index) : i64 | |
%4 = llvm.mlir.constant(0 : index) : i64 | |
%5 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%6 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%7 = llvm.load %6 : !llvm.ptr<ptr<i8>> | |
%8 = llvm.bitcast %7 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%9 = llvm.ptrtoint %8 : !llvm.ptr<f32> to i64 | |
%10 = llvm.and %9, %3 : i64 | |
%11 = llvm.icmp "eq" %10, %4 : i64 | |
"llvm.intr.assume"(%11) : (i1) -> () | |
%12 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%13 = llvm.extractvalue %12[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%14 = llvm.getelementptr %13[%2] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
%15 = llvm.load %14 : !llvm.ptr<ptr<i8>> | |
%16 = llvm.bitcast %15 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%17 = llvm.ptrtoint %16 : !llvm.ptr<f32> to i64 | |
%18 = llvm.and %17, %3 : i64 | |
%19 = llvm.icmp "eq" %18, %4 : i64 | |
"llvm.intr.assume"(%19) : (i1) -> () | |
%20 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%21 = llvm.extractvalue %20[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%22 = llvm.getelementptr %21[%1] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>> | |
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%25 = llvm.ptrtoint %24 : !llvm.ptr<f32> to i64 | |
%26 = llvm.and %25, %3 : i64 | |
%27 = llvm.icmp "eq" %26, %4 : i64 | |
"llvm.intr.assume"(%27) : (i1) -> () | |
%28 = llvm.bitcast %8 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
%29 = llvm.load %28 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
%30 = llvm.bitcast %16 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
%31 = llvm.load %30 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
%32 = llvm.fmul %29, %31 : vector<4xf32> | |
%33 = llvm.bitcast %24 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
llvm.store %32, %33 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
llvm.return %0 : i32 | |
} | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::TranslateExecutablesPass //----- // | |
hal.executable private @simple_mul_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = <"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> { | |
hal.executable.export public @simple_mul_dispatch_0 ordinal(0) layout(#hal.executable.layout<push_constants = 0, sets = [#hal.descriptor_set.layout<0, bindings = [#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]>]>) {translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert workload_per_wg = [4]>} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%0 = affine.apply affine_map<()[s0] -> (s0 ceildiv 4)>()[%arg1] | |
hal.return %0, %c1, %c1 : index, index, index | |
} | |
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { | |
llvm.func internal @simple_mul_dispatch_0(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} { | |
%0 = llvm.mlir.constant(0 : i32) : i32 | |
%1 = llvm.mlir.constant(2 : i64) : i64 | |
%2 = llvm.mlir.constant(1 : i64) : i64 | |
%3 = llvm.mlir.constant(63 : index) : i64 | |
%4 = llvm.mlir.constant(0 : index) : i64 | |
%5 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%6 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%7 = llvm.load %6 : !llvm.ptr<ptr<i8>> | |
%8 = llvm.bitcast %7 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%9 = llvm.ptrtoint %8 : !llvm.ptr<f32> to i64 | |
%10 = llvm.and %9, %3 : i64 | |
%11 = llvm.icmp "eq" %10, %4 : i64 | |
"llvm.intr.assume"(%11) : (i1) -> () | |
%12 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%13 = llvm.extractvalue %12[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%14 = llvm.getelementptr %13[%2] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
%15 = llvm.load %14 : !llvm.ptr<ptr<i8>> | |
%16 = llvm.bitcast %15 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%17 = llvm.ptrtoint %16 : !llvm.ptr<f32> to i64 | |
%18 = llvm.and %17, %3 : i64 | |
%19 = llvm.icmp "eq" %18, %4 : i64 | |
"llvm.intr.assume"(%19) : (i1) -> () | |
%20 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%21 = llvm.extractvalue %20[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%22 = llvm.getelementptr %21[%1] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>> | |
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%25 = llvm.ptrtoint %24 : !llvm.ptr<f32> to i64 | |
%26 = llvm.and %25, %3 : i64 | |
%27 = llvm.icmp "eq" %26, %4 : i64 | |
"llvm.intr.assume"(%27) : (i1) -> () | |
%28 = llvm.bitcast %8 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
%29 = llvm.load %28 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
%30 = llvm.bitcast %16 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
%31 = llvm.load %30 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
%32 = llvm.fmul %29, %31 : vector<4xf32> | |
%33 = llvm.bitcast %24 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
llvm.store %32, %33 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
llvm.return %0 : i32 | |
} | |
} | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::(anonymous namespace)::ConvertToHALPass //----- // | |
#device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]}> | |
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [#hal.descriptor_set.layout<0, bindings = [#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]>]> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<()[s0] -> (s0 ceildiv 4)> | |
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert workload_per_wg = [4]> | |
module attributes {hal.device.targets = [#device_target_cpu]} { | |
hal.executable private @simple_mul_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
hal.executable.export public @simple_mul_dispatch_0 ordinal(0) layout(#executable_layout) {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%0 = affine.apply #map()[%arg1] | |
hal.return %0, %c1, %c1 : index, index, index | |
} | |
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { | |
llvm.func internal @simple_mul_dispatch_0(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} { | |
%0 = llvm.mlir.constant(0 : i32) : i32 | |
%1 = llvm.mlir.constant(2 : i64) : i64 | |
%2 = llvm.mlir.constant(1 : i64) : i64 | |
%3 = llvm.mlir.constant(63 : index) : i64 | |
%4 = llvm.mlir.constant(0 : index) : i64 | |
%5 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%6 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%7 = llvm.load %6 : !llvm.ptr<ptr<i8>> | |
%8 = llvm.bitcast %7 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%9 = llvm.ptrtoint %8 : !llvm.ptr<f32> to i64 | |
%10 = llvm.and %9, %3 : i64 | |
%11 = llvm.icmp "eq" %10, %4 : i64 | |
"llvm.intr.assume"(%11) : (i1) -> () | |
%12 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%13 = llvm.extractvalue %12[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%14 = llvm.getelementptr %13[%2] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
%15 = llvm.load %14 : !llvm.ptr<ptr<i8>> | |
%16 = llvm.bitcast %15 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%17 = llvm.ptrtoint %16 : !llvm.ptr<f32> to i64 | |
%18 = llvm.and %17, %3 : i64 | |
%19 = llvm.icmp "eq" %18, %4 : i64 | |
"llvm.intr.assume"(%19) : (i1) -> () | |
%20 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%21 = llvm.extractvalue %20[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%22 = llvm.getelementptr %21[%1] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>> | |
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%25 = llvm.ptrtoint %24 : !llvm.ptr<f32> to i64 | |
%26 = llvm.and %25, %3 : i64 | |
%27 = llvm.icmp "eq" %26, %4 : i64 | |
"llvm.intr.assume"(%27) : (i1) -> () | |
%28 = llvm.bitcast %8 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
%29 = llvm.load %28 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
%30 = llvm.bitcast %16 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
%31 = llvm.load %30 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
%32 = llvm.fmul %29, %31 : vector<4xf32> | |
%33 = llvm.bitcast %24 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
llvm.store %32, %33 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
llvm.return %0 : i32 | |
} | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c0 = arith.constant 0 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer | |
%device = hal.ex.shared_device : !hal.device | |
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator | |
hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer | |
%device_1 = hal.ex.shared_device : !hal.device | |
%allocator_2 = hal.device.allocator<%device_1 : !hal.device> : !hal.allocator | |
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator_2 : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") | |
%device_3 = hal.ex.shared_device : !hal.device | |
%allocator_4 = hal.device.allocator<%device_3 : !hal.device> : !hal.allocator | |
%buffer_5 = hal.allocator.allocate<%allocator_4 : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c16} | |
%device_6 = hal.ex.shared_device : !hal.device | |
%cmd = hal.command_buffer.create device(%device_6 : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer | |
hal.command_buffer.begin<%cmd : !hal.command_buffer> | |
%0 = hal.command_buffer.device<%cmd : !hal.command_buffer> : !hal.device | |
hal.device.switch<%0 : !hal.device> | |
#hal.device.match.executable.format<"embedded-elf-x86_64"> { | |
%executable_layout = hal.executable_layout.lookup device(%0 : !hal.device) layout(#executable_layout) : !hal.executable_layout | |
%c0_11 = arith.constant 0 : index | |
%c1_12 = arith.constant 1 : index | |
%c2 = arith.constant 2 : index | |
%c0_13 = arith.constant 0 : index | |
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%executable_layout : !hal.executable_layout)[%c0_13] bindings([ | |
%c0_11 = (%buffer : !hal.buffer)[%c0, %c16], | |
%c1_12 = (%buffer_0 : !hal.buffer)[%c0, %c16], | |
%c2 = (%buffer_5 : !hal.buffer)[%c0, %c16] | |
]) | |
%c1_14 = arith.constant 1 : index | |
%1 = affine.apply #map()[%c4] | |
hal.command_buffer.dispatch.symbol<%cmd : !hal.command_buffer> target(@simple_mul_dispatch_0::@embedded_elf_x86_64::@simple_mul_dispatch_0) workgroups([%1, %c1_14, %c1_14]) | |
hal.return | |
} | |
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") | |
hal.command_buffer.end<%cmd : !hal.command_buffer> | |
hal.ex.submit_and_wait %device_6, %cmd | |
%c0_7 = arith.constant 0 : index | |
%c4_8 = arith.constant 4 : index | |
%c553648160_i32_9 = arith.constant 553648160 : i32 | |
%c1_i32_10 = arith.constant 1 : i32 | |
%view = hal.buffer_view.create buffer(%buffer_5 : !hal.buffer) shape([%c4_8]) type(%c553648160_i32_9) encoding(%c1_i32_10) : !hal.buffer_view | |
return %view : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After Canonicalizer //----- // | |
#device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]}> | |
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [#hal.descriptor_set.layout<0, bindings = [#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]>]> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<()[s0] -> (s0 ceildiv 4)> | |
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert workload_per_wg = [4]> | |
module attributes {hal.device.targets = [#device_target_cpu]} { | |
hal.executable private @simple_mul_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
hal.executable.export public @simple_mul_dispatch_0 ordinal(0) layout(#executable_layout) {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%0 = affine.apply #map()[%arg1] | |
hal.return %0, %c1, %c1 : index, index, index | |
} | |
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { | |
llvm.func internal @simple_mul_dispatch_0(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} { | |
%0 = llvm.mlir.constant(0 : i32) : i32 | |
%1 = llvm.mlir.constant(2 : i64) : i64 | |
%2 = llvm.mlir.constant(1 : i64) : i64 | |
%3 = llvm.mlir.constant(63 : index) : i64 | |
%4 = llvm.mlir.constant(0 : index) : i64 | |
%5 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%6 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%7 = llvm.load %6 : !llvm.ptr<ptr<i8>> | |
%8 = llvm.bitcast %7 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%9 = llvm.ptrtoint %8 : !llvm.ptr<f32> to i64 | |
%10 = llvm.and %9, %3 : i64 | |
%11 = llvm.icmp "eq" %10, %4 : i64 | |
"llvm.intr.assume"(%11) : (i1) -> () | |
%12 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%13 = llvm.extractvalue %12[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%14 = llvm.getelementptr %13[%2] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
%15 = llvm.load %14 : !llvm.ptr<ptr<i8>> | |
%16 = llvm.bitcast %15 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%17 = llvm.ptrtoint %16 : !llvm.ptr<f32> to i64 | |
%18 = llvm.and %17, %3 : i64 | |
%19 = llvm.icmp "eq" %18, %4 : i64 | |
"llvm.intr.assume"(%19) : (i1) -> () | |
%20 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%21 = llvm.extractvalue %20[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%22 = llvm.getelementptr %21[%1] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>> | |
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%25 = llvm.ptrtoint %24 : !llvm.ptr<f32> to i64 | |
%26 = llvm.and %25, %3 : i64 | |
%27 = llvm.icmp "eq" %26, %4 : i64 | |
"llvm.intr.assume"(%27) : (i1) -> () | |
%28 = llvm.bitcast %8 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
%29 = llvm.load %28 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
%30 = llvm.bitcast %16 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
%31 = llvm.load %30 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
%32 = llvm.fmul %29, %31 : vector<4xf32> | |
%33 = llvm.bitcast %24 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
llvm.store %32, %33 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
llvm.return %0 : i32 | |
} | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c2 = arith.constant 2 : index | |
%c0 = arith.constant 0 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer | |
%device = hal.ex.shared_device : !hal.device | |
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator | |
hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer | |
%device_1 = hal.ex.shared_device : !hal.device | |
%allocator_2 = hal.device.allocator<%device_1 : !hal.device> : !hal.allocator | |
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator_2 : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") | |
%device_3 = hal.ex.shared_device : !hal.device | |
%allocator_4 = hal.device.allocator<%device_3 : !hal.device> : !hal.allocator | |
%buffer_5 = hal.allocator.allocate<%allocator_4 : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c16} | |
%device_6 = hal.ex.shared_device : !hal.device | |
%cmd = hal.command_buffer.create device(%device_6 : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer | |
hal.command_buffer.begin<%cmd : !hal.command_buffer> | |
hal.device.switch<%device_6 : !hal.device> | |
#hal.device.match.executable.format<"embedded-elf-x86_64"> { | |
%executable_layout = hal.executable_layout.lookup device(%device_6 : !hal.device) layout(#executable_layout) : !hal.executable_layout | |
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%executable_layout : !hal.executable_layout)[%c0] bindings([ | |
%c0 = (%buffer : !hal.buffer)[%c0, %c16], | |
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c16], | |
%c2 = (%buffer_5 : !hal.buffer)[%c0, %c16] | |
]) | |
hal.command_buffer.dispatch.symbol<%cmd : !hal.command_buffer> target(@simple_mul_dispatch_0::@embedded_elf_x86_64::@simple_mul_dispatch_0) workgroups([%c1, %c1, %c1]) | |
hal.return | |
} | |
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") | |
hal.command_buffer.end<%cmd : !hal.command_buffer> | |
hal.ex.submit_and_wait %device_6, %cmd | |
%view = hal.buffer_view.create buffer(%buffer_5 : !hal.buffer) shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) : !hal.buffer_view | |
return %view : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After CSE //----- // | |
#device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]}> | |
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [#hal.descriptor_set.layout<0, bindings = [#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]>]> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<()[s0] -> (s0 ceildiv 4)> | |
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert workload_per_wg = [4]> | |
module attributes {hal.device.targets = [#device_target_cpu]} { | |
hal.executable private @simple_mul_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
hal.executable.export public @simple_mul_dispatch_0 ordinal(0) layout(#executable_layout) {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%0 = affine.apply #map()[%arg1] | |
hal.return %0, %c1, %c1 : index, index, index | |
} | |
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { | |
llvm.func internal @simple_mul_dispatch_0(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} { | |
%0 = llvm.mlir.constant(0 : i32) : i32 | |
%1 = llvm.mlir.constant(2 : i64) : i64 | |
%2 = llvm.mlir.constant(1 : i64) : i64 | |
%3 = llvm.mlir.constant(63 : index) : i64 | |
%4 = llvm.mlir.constant(0 : index) : i64 | |
%5 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%6 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%7 = llvm.load %6 : !llvm.ptr<ptr<i8>> | |
%8 = llvm.bitcast %7 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%9 = llvm.ptrtoint %8 : !llvm.ptr<f32> to i64 | |
%10 = llvm.and %9, %3 : i64 | |
%11 = llvm.icmp "eq" %10, %4 : i64 | |
"llvm.intr.assume"(%11) : (i1) -> () | |
%12 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%13 = llvm.extractvalue %12[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%14 = llvm.getelementptr %13[%2] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
%15 = llvm.load %14 : !llvm.ptr<ptr<i8>> | |
%16 = llvm.bitcast %15 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%17 = llvm.ptrtoint %16 : !llvm.ptr<f32> to i64 | |
%18 = llvm.and %17, %3 : i64 | |
%19 = llvm.icmp "eq" %18, %4 : i64 | |
"llvm.intr.assume"(%19) : (i1) -> () | |
%20 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%21 = llvm.extractvalue %20[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%22 = llvm.getelementptr %21[%1] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>> | |
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%25 = llvm.ptrtoint %24 : !llvm.ptr<f32> to i64 | |
%26 = llvm.and %25, %3 : i64 | |
%27 = llvm.icmp "eq" %26, %4 : i64 | |
"llvm.intr.assume"(%27) : (i1) -> () | |
%28 = llvm.bitcast %8 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
%29 = llvm.load %28 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
%30 = llvm.bitcast %16 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
%31 = llvm.load %30 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
%32 = llvm.fmul %29, %31 : vector<4xf32> | |
%33 = llvm.bitcast %24 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
llvm.store %32, %33 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
llvm.return %0 : i32 | |
} | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c2 = arith.constant 2 : index | |
%c0 = arith.constant 0 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer | |
%device = hal.ex.shared_device : !hal.device | |
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator | |
hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer | |
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") | |
%buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c16} | |
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer | |
hal.command_buffer.begin<%cmd : !hal.command_buffer> | |
hal.device.switch<%device : !hal.device> | |
#hal.device.match.executable.format<"embedded-elf-x86_64"> { | |
%executable_layout = hal.executable_layout.lookup device(%device : !hal.device) layout(#executable_layout) : !hal.executable_layout | |
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%executable_layout : !hal.executable_layout)[%c0] bindings([ | |
%c0 = (%buffer : !hal.buffer)[%c0, %c16], | |
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c16], | |
%c2 = (%buffer_1 : !hal.buffer)[%c0, %c16] | |
]) | |
hal.command_buffer.dispatch.symbol<%cmd : !hal.command_buffer> target(@simple_mul_dispatch_0::@embedded_elf_x86_64::@simple_mul_dispatch_0) workgroups([%c1, %c1, %c1]) | |
hal.return | |
} | |
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") | |
hal.command_buffer.end<%cmd : !hal.command_buffer> | |
hal.ex.submit_and_wait %device, %cmd | |
%view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) : !hal.buffer_view | |
return %view : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::SimplifyGlobalAccessesPass //----- // | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c2 = arith.constant 2 : index | |
%c0 = arith.constant 0 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer | |
%device = hal.ex.shared_device : !hal.device | |
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator | |
hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer | |
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") | |
%buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c16} | |
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer | |
hal.command_buffer.begin<%cmd : !hal.command_buffer> | |
hal.device.switch<%device : !hal.device> | |
#hal.device.match.executable.format<"embedded-elf-x86_64"> { | |
%executable_layout = hal.executable_layout.lookup device(%device : !hal.device) layout(<push_constants = 0, sets = [#hal.descriptor_set.layout<0, bindings = [#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]>]>) : !hal.executable_layout | |
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%executable_layout : !hal.executable_layout)[%c0] bindings([ | |
%c0 = (%buffer : !hal.buffer)[%c0, %c16], | |
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c16], | |
%c2 = (%buffer_1 : !hal.buffer)[%c0, %c16] | |
]) | |
hal.command_buffer.dispatch.symbol<%cmd : !hal.command_buffer> target(@simple_mul_dispatch_0::@embedded_elf_x86_64::@simple_mul_dispatch_0) workgroups([%c1, %c1, %c1]) | |
hal.return | |
} | |
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") | |
hal.command_buffer.end<%cmd : !hal.command_buffer> | |
hal.ex.submit_and_wait %device, %cmd | |
%view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) : !hal.buffer_view | |
return %view : !hal.buffer_view | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::Util::ApplyPatternsPass //----- // | |
#device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]}> | |
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [#hal.descriptor_set.layout<0, bindings = [#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]>]> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<()[s0] -> (s0 ceildiv 4)> | |
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert workload_per_wg = [4]> | |
module attributes {hal.device.targets = [#device_target_cpu]} { | |
hal.executable private @simple_mul_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
hal.executable.export public @simple_mul_dispatch_0 ordinal(0) layout(#executable_layout) {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%0 = affine.apply #map()[%arg1] | |
hal.return %0, %c1, %c1 : index, index, index | |
} | |
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { | |
llvm.func internal @simple_mul_dispatch_0(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} { | |
%0 = llvm.mlir.constant(0 : i32) : i32 | |
%1 = llvm.mlir.constant(2 : i64) : i64 | |
%2 = llvm.mlir.constant(1 : i64) : i64 | |
%3 = llvm.mlir.constant(63 : index) : i64 | |
%4 = llvm.mlir.constant(0 : index) : i64 | |
%5 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%6 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%7 = llvm.load %6 : !llvm.ptr<ptr<i8>> | |
%8 = llvm.bitcast %7 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%9 = llvm.ptrtoint %8 : !llvm.ptr<f32> to i64 | |
%10 = llvm.and %9, %3 : i64 | |
%11 = llvm.icmp "eq" %10, %4 : i64 | |
"llvm.intr.assume"(%11) : (i1) -> () | |
%12 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%13 = llvm.extractvalue %12[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%14 = llvm.getelementptr %13[%2] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
%15 = llvm.load %14 : !llvm.ptr<ptr<i8>> | |
%16 = llvm.bitcast %15 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%17 = llvm.ptrtoint %16 : !llvm.ptr<f32> to i64 | |
%18 = llvm.and %17, %3 : i64 | |
%19 = llvm.icmp "eq" %18, %4 : i64 | |
"llvm.intr.assume"(%19) : (i1) -> () | |
%20 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%21 = llvm.extractvalue %20[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%22 = llvm.getelementptr %21[%1] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>> | |
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%25 = llvm.ptrtoint %24 : !llvm.ptr<f32> to i64 | |
%26 = llvm.and %25, %3 : i64 | |
%27 = llvm.icmp "eq" %26, %4 : i64 | |
"llvm.intr.assume"(%27) : (i1) -> () | |
%28 = llvm.bitcast %8 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
%29 = llvm.load %28 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
%30 = llvm.bitcast %16 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
%31 = llvm.load %30 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
%32 = llvm.fmul %29, %31 : vector<4xf32> | |
%33 = llvm.bitcast %24 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
llvm.store %32, %33 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
llvm.return %0 : i32 | |
} | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c2 = arith.constant 2 : index | |
%c0 = arith.constant 0 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer | |
%device = hal.ex.shared_device : !hal.device | |
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator | |
hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer | |
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") | |
%buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c16} | |
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer | |
hal.command_buffer.begin<%cmd : !hal.command_buffer> | |
hal.device.switch<%device : !hal.device> | |
#hal.device.match.executable.format<"embedded-elf-x86_64"> { | |
%executable_layout = hal.executable_layout.lookup device(%device : !hal.device) layout(#executable_layout) : !hal.executable_layout | |
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%executable_layout : !hal.executable_layout)[%c0] bindings([ | |
%c0 = (%buffer : !hal.buffer)[%c0, %c16], | |
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c16], | |
%c2 = (%buffer_1 : !hal.buffer)[%c0, %c16] | |
]) | |
hal.command_buffer.dispatch.symbol<%cmd : !hal.command_buffer> target(@simple_mul_dispatch_0::@embedded_elf_x86_64::@simple_mul_dispatch_0) workgroups([%c1, %c1, %c1]) | |
hal.return | |
} | |
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") | |
hal.command_buffer.end<%cmd : !hal.command_buffer> | |
hal.ex.submit_and_wait %device, %cmd | |
%view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) : !hal.buffer_view | |
return %view : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FoldGlobalsPass //----- // | |
#device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]}> | |
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [#hal.descriptor_set.layout<0, bindings = [#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]>]> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<()[s0] -> (s0 ceildiv 4)> | |
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert workload_per_wg = [4]> | |
module attributes {hal.device.targets = [#device_target_cpu]} { | |
hal.executable private @simple_mul_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
hal.executable.export public @simple_mul_dispatch_0 ordinal(0) layout(#executable_layout) {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%0 = affine.apply #map()[%arg1] | |
hal.return %0, %c1, %c1 : index, index, index | |
} | |
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { | |
llvm.func internal @simple_mul_dispatch_0(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} { | |
%0 = llvm.mlir.constant(0 : i32) : i32 | |
%1 = llvm.mlir.constant(2 : i64) : i64 | |
%2 = llvm.mlir.constant(1 : i64) : i64 | |
%3 = llvm.mlir.constant(63 : index) : i64 | |
%4 = llvm.mlir.constant(0 : index) : i64 | |
%5 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%6 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%7 = llvm.load %6 : !llvm.ptr<ptr<i8>> | |
%8 = llvm.bitcast %7 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%9 = llvm.ptrtoint %8 : !llvm.ptr<f32> to i64 | |
%10 = llvm.and %9, %3 : i64 | |
%11 = llvm.icmp "eq" %10, %4 : i64 | |
"llvm.intr.assume"(%11) : (i1) -> () | |
%12 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%13 = llvm.extractvalue %12[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%14 = llvm.getelementptr %13[%2] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
%15 = llvm.load %14 : !llvm.ptr<ptr<i8>> | |
%16 = llvm.bitcast %15 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%17 = llvm.ptrtoint %16 : !llvm.ptr<f32> to i64 | |
%18 = llvm.and %17, %3 : i64 | |
%19 = llvm.icmp "eq" %18, %4 : i64 | |
"llvm.intr.assume"(%19) : (i1) -> () | |
%20 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%21 = llvm.extractvalue %20[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%22 = llvm.getelementptr %21[%1] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>> | |
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%25 = llvm.ptrtoint %24 : !llvm.ptr<f32> to i64 | |
%26 = llvm.and %25, %3 : i64 | |
%27 = llvm.icmp "eq" %26, %4 : i64 | |
"llvm.intr.assume"(%27) : (i1) -> () | |
%28 = llvm.bitcast %8 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
%29 = llvm.load %28 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
%30 = llvm.bitcast %16 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
%31 = llvm.load %30 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
%32 = llvm.fmul %29, %31 : vector<4xf32> | |
%33 = llvm.bitcast %24 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
llvm.store %32, %33 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
llvm.return %0 : i32 | |
} | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c2 = arith.constant 2 : index | |
%c0 = arith.constant 0 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer | |
%device = hal.ex.shared_device : !hal.device | |
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator | |
hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer | |
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") | |
%buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c16} | |
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer | |
hal.command_buffer.begin<%cmd : !hal.command_buffer> | |
hal.device.switch<%device : !hal.device> | |
#hal.device.match.executable.format<"embedded-elf-x86_64"> { | |
%executable_layout = hal.executable_layout.lookup device(%device : !hal.device) layout(#executable_layout) : !hal.executable_layout | |
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%executable_layout : !hal.executable_layout)[%c0] bindings([ | |
%c0 = (%buffer : !hal.buffer)[%c0, %c16], | |
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c16], | |
%c2 = (%buffer_1 : !hal.buffer)[%c0, %c16] | |
]) | |
hal.command_buffer.dispatch.symbol<%cmd : !hal.command_buffer> target(@simple_mul_dispatch_0::@embedded_elf_x86_64::@simple_mul_dispatch_0) workgroups([%c1, %c1, %c1]) | |
hal.return | |
} | |
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") | |
hal.command_buffer.end<%cmd : !hal.command_buffer> | |
hal.ex.submit_and_wait %device, %cmd | |
%view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) : !hal.buffer_view | |
return %view : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FuseGlobalsPass //----- // | |
#device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]}> | |
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [#hal.descriptor_set.layout<0, bindings = [#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]>]> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<()[s0] -> (s0 ceildiv 4)> | |
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert workload_per_wg = [4]> | |
module attributes {hal.device.targets = [#device_target_cpu]} { | |
hal.executable private @simple_mul_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
hal.executable.export public @simple_mul_dispatch_0 ordinal(0) layout(#executable_layout) {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%0 = affine.apply #map()[%arg1] | |
hal.return %0, %c1, %c1 : index, index, index | |
} | |
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { | |
llvm.func internal @simple_mul_dispatch_0(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} { | |
%0 = llvm.mlir.constant(0 : i32) : i32 | |
%1 = llvm.mlir.constant(2 : i64) : i64 | |
%2 = llvm.mlir.constant(1 : i64) : i64 | |
%3 = llvm.mlir.constant(63 : index) : i64 | |
%4 = llvm.mlir.constant(0 : index) : i64 | |
%5 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%6 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%7 = llvm.load %6 : !llvm.ptr<ptr<i8>> | |
%8 = llvm.bitcast %7 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%9 = llvm.ptrtoint %8 : !llvm.ptr<f32> to i64 | |
%10 = llvm.and %9, %3 : i64 | |
%11 = llvm.icmp "eq" %10, %4 : i64 | |
"llvm.intr.assume"(%11) : (i1) -> () | |
%12 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%13 = llvm.extractvalue %12[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%14 = llvm.getelementptr %13[%2] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
%15 = llvm.load %14 : !llvm.ptr<ptr<i8>> | |
%16 = llvm.bitcast %15 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%17 = llvm.ptrtoint %16 : !llvm.ptr<f32> to i64 | |
%18 = llvm.and %17, %3 : i64 | |
%19 = llvm.icmp "eq" %18, %4 : i64 | |
"llvm.intr.assume"(%19) : (i1) -> () | |
%20 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%21 = llvm.extractvalue %20[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%22 = llvm.getelementptr %21[%1] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>> | |
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%25 = llvm.ptrtoint %24 : !llvm.ptr<f32> to i64 | |
%26 = llvm.and %25, %3 : i64 | |
%27 = llvm.icmp "eq" %26, %4 : i64 | |
"llvm.intr.assume"(%27) : (i1) -> () | |
%28 = llvm.bitcast %8 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
%29 = llvm.load %28 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
%30 = llvm.bitcast %16 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
%31 = llvm.load %30 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
%32 = llvm.fmul %29, %31 : vector<4xf32> | |
%33 = llvm.bitcast %24 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
llvm.store %32, %33 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
llvm.return %0 : i32 | |
} | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c2 = arith.constant 2 : index | |
%c0 = arith.constant 0 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer | |
%device = hal.ex.shared_device : !hal.device | |
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator | |
hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer | |
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") | |
%buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c16} | |
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer | |
hal.command_buffer.begin<%cmd : !hal.command_buffer> | |
hal.device.switch<%device : !hal.device> | |
#hal.device.match.executable.format<"embedded-elf-x86_64"> { | |
%executable_layout = hal.executable_layout.lookup device(%device : !hal.device) layout(#executable_layout) : !hal.executable_layout | |
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%executable_layout : !hal.executable_layout)[%c0] bindings([ | |
%c0 = (%buffer : !hal.buffer)[%c0, %c16], | |
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c16], | |
%c2 = (%buffer_1 : !hal.buffer)[%c0, %c16] | |
]) | |
hal.command_buffer.dispatch.symbol<%cmd : !hal.command_buffer> target(@simple_mul_dispatch_0::@embedded_elf_x86_64::@simple_mul_dispatch_0) workgroups([%c1, %c1, %c1]) | |
hal.return | |
} | |
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") | |
hal.command_buffer.end<%cmd : !hal.command_buffer> | |
hal.ex.submit_and_wait %device, %cmd | |
%view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) : !hal.buffer_view | |
return %view : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::LinkTargetExecutablesPass //----- // | |
#device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]}> | |
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [#hal.descriptor_set.layout<0, bindings = [#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]>]> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<()[s0] -> (s0 ceildiv 4)> | |
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert workload_per_wg = [4]> | |
module attributes {hal.device.targets = [#device_target_cpu]} { | |
hal.executable private @simple_mul_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
hal.executable.export public @simple_mul_dispatch_0 ordinal(0) layout(#executable_layout) {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%0 = affine.apply #map()[%arg1] | |
hal.return %0, %c1, %c1 : index, index, index | |
} | |
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { | |
llvm.func internal @simple_mul_dispatch_0(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} { | |
%0 = llvm.mlir.constant(0 : i32) : i32 | |
%1 = llvm.mlir.constant(2 : i64) : i64 | |
%2 = llvm.mlir.constant(1 : i64) : i64 | |
%3 = llvm.mlir.constant(63 : index) : i64 | |
%4 = llvm.mlir.constant(0 : index) : i64 | |
%5 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%6 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%7 = llvm.load %6 : !llvm.ptr<ptr<i8>> | |
%8 = llvm.bitcast %7 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%9 = llvm.ptrtoint %8 : !llvm.ptr<f32> to i64 | |
%10 = llvm.and %9, %3 : i64 | |
%11 = llvm.icmp "eq" %10, %4 : i64 | |
"llvm.intr.assume"(%11) : (i1) -> () | |
%12 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%13 = llvm.extractvalue %12[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%14 = llvm.getelementptr %13[%2] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
%15 = llvm.load %14 : !llvm.ptr<ptr<i8>> | |
%16 = llvm.bitcast %15 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%17 = llvm.ptrtoint %16 : !llvm.ptr<f32> to i64 | |
%18 = llvm.and %17, %3 : i64 | |
%19 = llvm.icmp "eq" %18, %4 : i64 | |
"llvm.intr.assume"(%19) : (i1) -> () | |
%20 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%21 = llvm.extractvalue %20[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%22 = llvm.getelementptr %21[%1] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>> | |
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%25 = llvm.ptrtoint %24 : !llvm.ptr<f32> to i64 | |
%26 = llvm.and %25, %3 : i64 | |
%27 = llvm.icmp "eq" %26, %4 : i64 | |
"llvm.intr.assume"(%27) : (i1) -> () | |
%28 = llvm.bitcast %8 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
%29 = llvm.load %28 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
%30 = llvm.bitcast %16 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
%31 = llvm.load %30 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
%32 = llvm.fmul %29, %31 : vector<4xf32> | |
%33 = llvm.bitcast %24 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
llvm.store %32, %33 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
llvm.return %0 : i32 | |
} | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c2 = arith.constant 2 : index | |
%c0 = arith.constant 0 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer | |
%device = hal.ex.shared_device : !hal.device | |
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator | |
hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer | |
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") | |
%buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c16} | |
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer | |
hal.command_buffer.begin<%cmd : !hal.command_buffer> | |
hal.device.switch<%device : !hal.device> | |
#hal.device.match.executable.format<"embedded-elf-x86_64"> { | |
%executable_layout = hal.executable_layout.lookup device(%device : !hal.device) layout(#executable_layout) : !hal.executable_layout | |
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%executable_layout : !hal.executable_layout)[%c0] bindings([ | |
%c0 = (%buffer : !hal.buffer)[%c0, %c16], | |
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c16], | |
%c2 = (%buffer_1 : !hal.buffer)[%c0, %c16] | |
]) | |
hal.command_buffer.dispatch.symbol<%cmd : !hal.command_buffer> target(@simple_mul_dispatch_0::@embedded_elf_x86_64::@simple_mul_dispatch_0) workgroups([%c1, %c1, %c1]) | |
hal.return | |
} | |
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") | |
hal.command_buffer.end<%cmd : !hal.command_buffer> | |
hal.ex.submit_and_wait %device, %cmd | |
%view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) : !hal.buffer_view | |
return %view : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::LinkExecutablesPass //----- // | |
#device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]}> | |
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [#hal.descriptor_set.layout<0, bindings = [#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]>]> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<()[s0] -> (s0 ceildiv 4)> | |
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert workload_per_wg = [4]> | |
module attributes {hal.device.targets = [#device_target_cpu]} { | |
hal.executable private @simple_mul_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
hal.executable.export public @simple_mul_dispatch_0 ordinal(0) layout(#executable_layout) {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%0 = affine.apply #map()[%arg1] | |
hal.return %0, %c1, %c1 : index, index, index | |
} | |
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { | |
llvm.func internal @simple_mul_dispatch_0(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} { | |
%0 = llvm.mlir.constant(0 : i32) : i32 | |
%1 = llvm.mlir.constant(2 : i64) : i64 | |
%2 = llvm.mlir.constant(1 : i64) : i64 | |
%3 = llvm.mlir.constant(63 : index) : i64 | |
%4 = llvm.mlir.constant(0 : index) : i64 | |
%5 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%6 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%7 = llvm.load %6 : !llvm.ptr<ptr<i8>> | |
%8 = llvm.bitcast %7 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%9 = llvm.ptrtoint %8 : !llvm.ptr<f32> to i64 | |
%10 = llvm.and %9, %3 : i64 | |
%11 = llvm.icmp "eq" %10, %4 : i64 | |
"llvm.intr.assume"(%11) : (i1) -> () | |
%12 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%13 = llvm.extractvalue %12[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%14 = llvm.getelementptr %13[%2] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
%15 = llvm.load %14 : !llvm.ptr<ptr<i8>> | |
%16 = llvm.bitcast %15 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%17 = llvm.ptrtoint %16 : !llvm.ptr<f32> to i64 | |
%18 = llvm.and %17, %3 : i64 | |
%19 = llvm.icmp "eq" %18, %4 : i64 | |
"llvm.intr.assume"(%19) : (i1) -> () | |
%20 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%21 = llvm.extractvalue %20[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%22 = llvm.getelementptr %21[%1] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>> | |
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%25 = llvm.ptrtoint %24 : !llvm.ptr<f32> to i64 | |
%26 = llvm.and %25, %3 : i64 | |
%27 = llvm.icmp "eq" %26, %4 : i64 | |
"llvm.intr.assume"(%27) : (i1) -> () | |
%28 = llvm.bitcast %8 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
%29 = llvm.load %28 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
%30 = llvm.bitcast %16 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
%31 = llvm.load %30 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
%32 = llvm.fmul %29, %31 : vector<4xf32> | |
%33 = llvm.bitcast %24 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
llvm.store %32, %33 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
llvm.return %0 : i32 | |
} | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c2 = arith.constant 2 : index | |
%c0 = arith.constant 0 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer | |
%device = hal.ex.shared_device : !hal.device | |
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator | |
hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer | |
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") | |
%buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c16} | |
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer | |
hal.command_buffer.begin<%cmd : !hal.command_buffer> | |
hal.device.switch<%device : !hal.device> | |
#hal.device.match.executable.format<"embedded-elf-x86_64"> { | |
%executable_layout = hal.executable_layout.lookup device(%device : !hal.device) layout(#executable_layout) : !hal.executable_layout | |
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%executable_layout : !hal.executable_layout)[%c0] bindings([ | |
%c0 = (%buffer : !hal.buffer)[%c0, %c16], | |
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c16], | |
%c2 = (%buffer_1 : !hal.buffer)[%c0, %c16] | |
]) | |
hal.command_buffer.dispatch.symbol<%cmd : !hal.command_buffer> target(@simple_mul_dispatch_0::@embedded_elf_x86_64::@simple_mul_dispatch_0) workgroups([%c1, %c1, %c1]) | |
hal.return | |
} | |
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") | |
hal.command_buffer.end<%cmd : !hal.command_buffer> | |
hal.ex.submit_and_wait %device, %cmd | |
%view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) : !hal.buffer_view | |
return %view : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::ResolveExportOrdinalsPass //----- // | |
#device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]}> | |
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [#hal.descriptor_set.layout<0, bindings = [#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]>]> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<()[s0] -> (s0 ceildiv 4)> | |
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert workload_per_wg = [4]> | |
module attributes {hal.device.targets = [#device_target_cpu]} { | |
hal.executable private @simple_mul_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
hal.executable.export public @simple_mul_dispatch_0 ordinal(0) layout(#executable_layout) {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%0 = affine.apply #map()[%arg1] | |
hal.return %0, %c1, %c1 : index, index, index | |
} | |
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { | |
llvm.func internal @simple_mul_dispatch_0(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} { | |
%0 = llvm.mlir.constant(0 : i32) : i32 | |
%1 = llvm.mlir.constant(2 : i64) : i64 | |
%2 = llvm.mlir.constant(1 : i64) : i64 | |
%3 = llvm.mlir.constant(63 : index) : i64 | |
%4 = llvm.mlir.constant(0 : index) : i64 | |
%5 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%6 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%7 = llvm.load %6 : !llvm.ptr<ptr<i8>> | |
%8 = llvm.bitcast %7 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%9 = llvm.ptrtoint %8 : !llvm.ptr<f32> to i64 | |
%10 = llvm.and %9, %3 : i64 | |
%11 = llvm.icmp "eq" %10, %4 : i64 | |
"llvm.intr.assume"(%11) : (i1) -> () | |
%12 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%13 = llvm.extractvalue %12[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%14 = llvm.getelementptr %13[%2] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
%15 = llvm.load %14 : !llvm.ptr<ptr<i8>> | |
%16 = llvm.bitcast %15 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%17 = llvm.ptrtoint %16 : !llvm.ptr<f32> to i64 | |
%18 = llvm.and %17, %3 : i64 | |
%19 = llvm.icmp "eq" %18, %4 : i64 | |
"llvm.intr.assume"(%19) : (i1) -> () | |
%20 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%21 = llvm.extractvalue %20[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%22 = llvm.getelementptr %21[%1] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>> | |
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%25 = llvm.ptrtoint %24 : !llvm.ptr<f32> to i64 | |
%26 = llvm.and %25, %3 : i64 | |
%27 = llvm.icmp "eq" %26, %4 : i64 | |
"llvm.intr.assume"(%27) : (i1) -> () | |
%28 = llvm.bitcast %8 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
%29 = llvm.load %28 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
%30 = llvm.bitcast %16 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
%31 = llvm.load %30 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
%32 = llvm.fmul %29, %31 : vector<4xf32> | |
%33 = llvm.bitcast %24 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
llvm.store %32, %33 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
llvm.return %0 : i32 | |
} | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c2 = arith.constant 2 : index | |
%c0 = arith.constant 0 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer | |
%device = hal.ex.shared_device : !hal.device | |
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator | |
hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer | |
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") | |
%buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c16} | |
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer | |
hal.command_buffer.begin<%cmd : !hal.command_buffer> | |
hal.device.switch<%device : !hal.device> | |
#hal.device.match.executable.format<"embedded-elf-x86_64"> { | |
%executable_layout = hal.executable_layout.lookup device(%device : !hal.device) layout(#executable_layout) : !hal.executable_layout | |
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%executable_layout : !hal.executable_layout)[%c0] bindings([ | |
%c0 = (%buffer : !hal.buffer)[%c0, %c16], | |
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c16], | |
%c2 = (%buffer_1 : !hal.buffer)[%c0, %c16] | |
]) | |
%0 = hal.command_buffer.device<%cmd : !hal.command_buffer> : !hal.device | |
%exe = hal.executable.lookup device(%0 : !hal.device) executable(@simple_mul_dispatch_0) : !hal.executable | |
hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%exe : !hal.executable)[0] workgroups([%c1, %c1, %c1]) | |
hal.return | |
} | |
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") | |
hal.command_buffer.end<%cmd : !hal.command_buffer> | |
hal.ex.submit_and_wait %device, %cmd | |
%view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) : !hal.buffer_view | |
return %view : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::MaterializeResourceCachesPass //----- // | |
#device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]}> | |
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [#hal.descriptor_set.layout<0, bindings = [#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]>]> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<()[s0] -> (s0 ceildiv 4)> | |
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert workload_per_wg = [4]> | |
module attributes {hal.device.targets = [#device_target_cpu]} { | |
util.global private @_descriptor_set_layout_0 : !hal.descriptor_set_layout | |
util.initializer { | |
%device = hal.ex.shared_device : !hal.device | |
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) usage(push_only) bindings([#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]) : !hal.descriptor_set_layout | |
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout | |
util.initializer.return | |
} | |
util.global private @_executable_layout_0 : !hal.executable_layout | |
util.initializer { | |
%_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout | |
%device = hal.ex.shared_device : !hal.device | |
%executable_layout = hal.executable_layout.create device(%device : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.executable_layout | |
util.global.store %executable_layout, @_executable_layout_0 : !hal.executable_layout | |
util.initializer.return | |
} | |
util.global private @_executable_simple_mul_dispatch_0 : !hal.executable | |
util.initializer { | |
%device = hal.ex.shared_device : !hal.device | |
%0 = hal.device.switch<%device : !hal.device> -> !hal.executable | |
#hal.device.match.executable.format<"embedded-elf-x86_64"> { | |
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout | |
%exe = hal.executable.create device(%device : !hal.device) target(@simple_mul_dispatch_0::@embedded_elf_x86_64) layouts([%_executable_layout_0]) : !hal.executable | |
hal.return %exe : !hal.executable | |
}, | |
#hal.match.always { | |
%1 = util.null : !hal.executable | |
hal.return %1 : !hal.executable | |
} | |
util.global.store %0, @_executable_simple_mul_dispatch_0 : !hal.executable | |
util.initializer.return | |
} | |
hal.executable private @simple_mul_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
hal.executable.export public @simple_mul_dispatch_0 ordinal(0) layout(#executable_layout) {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%0 = affine.apply #map()[%arg1] | |
hal.return %0, %c1, %c1 : index, index, index | |
} | |
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { | |
llvm.func internal @simple_mul_dispatch_0(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} { | |
%0 = llvm.mlir.constant(0 : i32) : i32 | |
%1 = llvm.mlir.constant(2 : i64) : i64 | |
%2 = llvm.mlir.constant(1 : i64) : i64 | |
%3 = llvm.mlir.constant(63 : index) : i64 | |
%4 = llvm.mlir.constant(0 : index) : i64 | |
%5 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%6 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%7 = llvm.load %6 : !llvm.ptr<ptr<i8>> | |
%8 = llvm.bitcast %7 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%9 = llvm.ptrtoint %8 : !llvm.ptr<f32> to i64 | |
%10 = llvm.and %9, %3 : i64 | |
%11 = llvm.icmp "eq" %10, %4 : i64 | |
"llvm.intr.assume"(%11) : (i1) -> () | |
%12 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%13 = llvm.extractvalue %12[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%14 = llvm.getelementptr %13[%2] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
%15 = llvm.load %14 : !llvm.ptr<ptr<i8>> | |
%16 = llvm.bitcast %15 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%17 = llvm.ptrtoint %16 : !llvm.ptr<f32> to i64 | |
%18 = llvm.and %17, %3 : i64 | |
%19 = llvm.icmp "eq" %18, %4 : i64 | |
"llvm.intr.assume"(%19) : (i1) -> () | |
%20 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%21 = llvm.extractvalue %20[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%22 = llvm.getelementptr %21[%1] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>> | |
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%25 = llvm.ptrtoint %24 : !llvm.ptr<f32> to i64 | |
%26 = llvm.and %25, %3 : i64 | |
%27 = llvm.icmp "eq" %26, %4 : i64 | |
"llvm.intr.assume"(%27) : (i1) -> () | |
%28 = llvm.bitcast %8 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
%29 = llvm.load %28 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
%30 = llvm.bitcast %16 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
%31 = llvm.load %30 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
%32 = llvm.fmul %29, %31 : vector<4xf32> | |
%33 = llvm.bitcast %24 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
llvm.store %32, %33 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
llvm.return %0 : i32 | |
} | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c2 = arith.constant 2 : index | |
%c0 = arith.constant 0 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer | |
%device = hal.ex.shared_device : !hal.device | |
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator | |
hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer | |
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") | |
%buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c16} | |
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer | |
hal.command_buffer.begin<%cmd : !hal.command_buffer> | |
hal.device.switch<%device : !hal.device> | |
#hal.device.match.executable.format<"embedded-elf-x86_64"> { | |
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout | |
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%_executable_layout_0 : !hal.executable_layout)[%c0] bindings([ | |
%c0 = (%buffer : !hal.buffer)[%c0, %c16], | |
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c16], | |
%c2 = (%buffer_1 : !hal.buffer)[%c0, %c16] | |
]) | |
%0 = hal.command_buffer.device<%cmd : !hal.command_buffer> : !hal.device | |
%_executable_simple_mul_dispatch_0 = util.global.load @_executable_simple_mul_dispatch_0 : !hal.executable | |
hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%_executable_simple_mul_dispatch_0 : !hal.executable)[0] workgroups([%c1, %c1, %c1]) | |
hal.return | |
} | |
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") | |
hal.command_buffer.end<%cmd : !hal.command_buffer> | |
hal.ex.submit_and_wait %device, %cmd | |
%view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) : !hal.buffer_view | |
return %view : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::InlineDeviceSwitchesPass //----- // | |
util.initializer { | |
%device = hal.ex.shared_device : !hal.device | |
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) usage(push_only) bindings([#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]) : !hal.descriptor_set_layout | |
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout | |
util.initializer.return | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::InlineDeviceSwitchesPass //----- // | |
util.initializer { | |
%_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout | |
%device = hal.ex.shared_device : !hal.device | |
%executable_layout = hal.executable_layout.create device(%device : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.executable_layout | |
util.global.store %executable_layout, @_executable_layout_0 : !hal.executable_layout | |
util.initializer.return | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::InlineDeviceSwitchesPass //----- // | |
util.initializer { | |
%device = hal.ex.shared_device : !hal.device | |
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false | |
cf.cond_br %value, ^bb1, ^bb2 | |
^bb1: // pred: ^bb0 | |
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout | |
%exe = hal.executable.create device(%device : !hal.device) target(@simple_mul_dispatch_0::@embedded_elf_x86_64) layouts([%_executable_layout_0]) : !hal.executable | |
cf.br ^bb5(%exe : !hal.executable) | |
^bb2: // pred: ^bb0 | |
%true = arith.constant true | |
cf.cond_br %true, ^bb3, ^bb4 | |
^bb3: // pred: ^bb2 | |
%0 = util.null : !hal.executable | |
cf.br ^bb5(%0 : !hal.executable) | |
^bb4: // pred: ^bb2 | |
util.unreachable "device not supported in the compiled configuration" | |
^bb5(%1: !hal.executable): // 2 preds: ^bb1, ^bb3 | |
util.global.store %1, @_executable_simple_mul_dispatch_0 : !hal.executable | |
util.initializer.return | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::InlineDeviceSwitchesPass //----- // | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c2 = arith.constant 2 : index | |
%c0 = arith.constant 0 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer | |
%device = hal.ex.shared_device : !hal.device | |
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator | |
hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer | |
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") | |
%buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c16} | |
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer | |
hal.command_buffer.begin<%cmd : !hal.command_buffer> | |
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false | |
cf.cond_br %value, ^bb1, ^bb2 | |
^bb1: // pred: ^bb0 | |
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout | |
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%_executable_layout_0 : !hal.executable_layout)[%c0] bindings([ | |
%c0 = (%buffer : !hal.buffer)[%c0, %c16], | |
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c16], | |
%c2 = (%buffer_1 : !hal.buffer)[%c0, %c16] | |
]) | |
%0 = hal.command_buffer.device<%cmd : !hal.command_buffer> : !hal.device | |
%_executable_simple_mul_dispatch_0 = util.global.load @_executable_simple_mul_dispatch_0 : !hal.executable | |
hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%_executable_simple_mul_dispatch_0 : !hal.executable)[0] workgroups([%c1, %c1, %c1]) | |
cf.br ^bb3 | |
^bb2: // pred: ^bb0 | |
util.unreachable "device not supported in the compiled configuration" | |
^bb3: // pred: ^bb1 | |
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") | |
hal.command_buffer.end<%cmd : !hal.command_buffer> | |
hal.ex.submit_and_wait %device, %cmd | |
%view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) : !hal.buffer_view | |
return %view : !hal.buffer_view | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::MemoizeDeviceQueriesPass //----- // | |
#device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]}> | |
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [#hal.descriptor_set.layout<0, bindings = [#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]>]> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<()[s0] -> (s0 ceildiv 4)> | |
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert workload_per_wg = [4]> | |
module attributes {hal.device.targets = [#device_target_cpu]} { | |
util.global private @_device_query_0 : i1 | |
util.global private @_device_query_0_ok : i1 | |
util.initializer { | |
%device = hal.ex.shared_device : !hal.device | |
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false | |
util.global.store %ok, @_device_query_0_ok : i1 | |
util.global.store %value, @_device_query_0 : i1 | |
util.initializer.return | |
} | |
util.global private @_descriptor_set_layout_0 : !hal.descriptor_set_layout | |
util.initializer { | |
%device = hal.ex.shared_device : !hal.device | |
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) usage(push_only) bindings([#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]) : !hal.descriptor_set_layout | |
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout | |
util.initializer.return | |
} | |
util.global private @_executable_layout_0 : !hal.executable_layout | |
util.initializer { | |
%_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout | |
%device = hal.ex.shared_device : !hal.device | |
%executable_layout = hal.executable_layout.create device(%device : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.executable_layout | |
util.global.store %executable_layout, @_executable_layout_0 : !hal.executable_layout | |
util.initializer.return | |
} | |
util.global private @_executable_simple_mul_dispatch_0 : !hal.executable | |
util.initializer { | |
%device = hal.ex.shared_device : !hal.device | |
%_device_query_0_ok = util.global.load @_device_query_0_ok : i1 | |
%_device_query_0 = util.global.load @_device_query_0 : i1 | |
cf.cond_br %_device_query_0, ^bb1, ^bb2 | |
^bb1: // pred: ^bb0 | |
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout | |
%exe = hal.executable.create device(%device : !hal.device) target(@simple_mul_dispatch_0::@embedded_elf_x86_64) layouts([%_executable_layout_0]) : !hal.executable | |
cf.br ^bb5(%exe : !hal.executable) | |
^bb2: // pred: ^bb0 | |
%true = arith.constant true | |
cf.cond_br %true, ^bb3, ^bb4 | |
^bb3: // pred: ^bb2 | |
%0 = util.null : !hal.executable | |
cf.br ^bb5(%0 : !hal.executable) | |
^bb4: // pred: ^bb2 | |
util.unreachable "device not supported in the compiled configuration" | |
^bb5(%1: !hal.executable): // 2 preds: ^bb1, ^bb3 | |
util.global.store %1, @_executable_simple_mul_dispatch_0 : !hal.executable | |
util.initializer.return | |
} | |
hal.executable private @simple_mul_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
hal.executable.export public @simple_mul_dispatch_0 ordinal(0) layout(#executable_layout) {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%0 = affine.apply #map()[%arg1] | |
hal.return %0, %c1, %c1 : index, index, index | |
} | |
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { | |
llvm.func internal @simple_mul_dispatch_0(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} { | |
%0 = llvm.mlir.constant(0 : i32) : i32 | |
%1 = llvm.mlir.constant(2 : i64) : i64 | |
%2 = llvm.mlir.constant(1 : i64) : i64 | |
%3 = llvm.mlir.constant(63 : index) : i64 | |
%4 = llvm.mlir.constant(0 : index) : i64 | |
%5 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%6 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%7 = llvm.load %6 : !llvm.ptr<ptr<i8>> | |
%8 = llvm.bitcast %7 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%9 = llvm.ptrtoint %8 : !llvm.ptr<f32> to i64 | |
%10 = llvm.and %9, %3 : i64 | |
%11 = llvm.icmp "eq" %10, %4 : i64 | |
"llvm.intr.assume"(%11) : (i1) -> () | |
%12 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%13 = llvm.extractvalue %12[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%14 = llvm.getelementptr %13[%2] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
%15 = llvm.load %14 : !llvm.ptr<ptr<i8>> | |
%16 = llvm.bitcast %15 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%17 = llvm.ptrtoint %16 : !llvm.ptr<f32> to i64 | |
%18 = llvm.and %17, %3 : i64 | |
%19 = llvm.icmp "eq" %18, %4 : i64 | |
"llvm.intr.assume"(%19) : (i1) -> () | |
%20 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%21 = llvm.extractvalue %20[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%22 = llvm.getelementptr %21[%1] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>> | |
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%25 = llvm.ptrtoint %24 : !llvm.ptr<f32> to i64 | |
%26 = llvm.and %25, %3 : i64 | |
%27 = llvm.icmp "eq" %26, %4 : i64 | |
"llvm.intr.assume"(%27) : (i1) -> () | |
%28 = llvm.bitcast %8 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
%29 = llvm.load %28 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
%30 = llvm.bitcast %16 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
%31 = llvm.load %30 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
%32 = llvm.fmul %29, %31 : vector<4xf32> | |
%33 = llvm.bitcast %24 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
llvm.store %32, %33 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
llvm.return %0 : i32 | |
} | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c2 = arith.constant 2 : index | |
%c0 = arith.constant 0 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer | |
%device = hal.ex.shared_device : !hal.device | |
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator | |
hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer | |
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") | |
%buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c16} | |
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer | |
hal.command_buffer.begin<%cmd : !hal.command_buffer> | |
%_device_query_0_ok = util.global.load @_device_query_0_ok : i1 | |
%_device_query_0 = util.global.load @_device_query_0 : i1 | |
cf.cond_br %_device_query_0, ^bb1, ^bb2 | |
^bb1: // pred: ^bb0 | |
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout | |
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%_executable_layout_0 : !hal.executable_layout)[%c0] bindings([ | |
%c0 = (%buffer : !hal.buffer)[%c0, %c16], | |
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c16], | |
%c2 = (%buffer_1 : !hal.buffer)[%c0, %c16] | |
]) | |
%0 = hal.command_buffer.device<%cmd : !hal.command_buffer> : !hal.device | |
%_executable_simple_mul_dispatch_0 = util.global.load @_executable_simple_mul_dispatch_0 : !hal.executable | |
hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%_executable_simple_mul_dispatch_0 : !hal.executable)[0] workgroups([%c1, %c1, %c1]) | |
cf.br ^bb3 | |
^bb2: // pred: ^bb0 | |
util.unreachable "device not supported in the compiled configuration" | |
^bb3: // pred: ^bb1 | |
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") | |
hal.command_buffer.end<%cmd : !hal.command_buffer> | |
hal.ex.submit_and_wait %device, %cmd | |
%view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) : !hal.buffer_view | |
return %view : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After Canonicalizer //----- // | |
#device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]}> | |
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [#hal.descriptor_set.layout<0, bindings = [#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]>]> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<()[s0] -> (s0 ceildiv 4)> | |
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert workload_per_wg = [4]> | |
module attributes {hal.device.targets = [#device_target_cpu]} { | |
util.global private @_device_query_0 : i1 | |
util.global private @_device_query_0_ok : i1 | |
util.initializer { | |
%device = hal.ex.shared_device : !hal.device | |
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false | |
util.global.store %ok, @_device_query_0_ok : i1 | |
util.global.store %value, @_device_query_0 : i1 | |
util.initializer.return | |
} | |
util.global private @_descriptor_set_layout_0 : !hal.descriptor_set_layout | |
util.initializer { | |
%device = hal.ex.shared_device : !hal.device | |
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) usage(push_only) bindings([#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]) : !hal.descriptor_set_layout | |
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout | |
util.initializer.return | |
} | |
util.global private @_executable_layout_0 : !hal.executable_layout | |
util.initializer { | |
%_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout | |
%device = hal.ex.shared_device : !hal.device | |
%executable_layout = hal.executable_layout.create device(%device : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.executable_layout | |
util.global.store %executable_layout, @_executable_layout_0 : !hal.executable_layout | |
util.initializer.return | |
} | |
util.global private @_executable_simple_mul_dispatch_0 : !hal.executable | |
util.initializer { | |
%device = hal.ex.shared_device : !hal.device | |
%_device_query_0 = util.global.load @_device_query_0 : i1 | |
cf.cond_br %_device_query_0, ^bb1, ^bb2 | |
^bb1: // pred: ^bb0 | |
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout | |
%exe = hal.executable.create device(%device : !hal.device) target(@simple_mul_dispatch_0::@embedded_elf_x86_64) layouts([%_executable_layout_0]) : !hal.executable | |
cf.br ^bb3(%exe : !hal.executable) | |
^bb2: // pred: ^bb0 | |
%0 = util.null : !hal.executable | |
cf.br ^bb3(%0 : !hal.executable) | |
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2 | |
util.global.store %1, @_executable_simple_mul_dispatch_0 : !hal.executable | |
util.initializer.return | |
} | |
hal.executable private @simple_mul_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
hal.executable.export public @simple_mul_dispatch_0 ordinal(0) layout(#executable_layout) {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%0 = affine.apply #map()[%arg1] | |
hal.return %0, %c1, %c1 : index, index, index | |
} | |
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { | |
llvm.func internal @simple_mul_dispatch_0(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} { | |
%0 = llvm.mlir.constant(0 : i32) : i32 | |
%1 = llvm.mlir.constant(2 : i64) : i64 | |
%2 = llvm.mlir.constant(1 : i64) : i64 | |
%3 = llvm.mlir.constant(63 : index) : i64 | |
%4 = llvm.mlir.constant(0 : index) : i64 | |
%5 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%6 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%7 = llvm.load %6 : !llvm.ptr<ptr<i8>> | |
%8 = llvm.bitcast %7 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%9 = llvm.ptrtoint %8 : !llvm.ptr<f32> to i64 | |
%10 = llvm.and %9, %3 : i64 | |
%11 = llvm.icmp "eq" %10, %4 : i64 | |
"llvm.intr.assume"(%11) : (i1) -> () | |
%12 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%13 = llvm.extractvalue %12[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%14 = llvm.getelementptr %13[%2] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
%15 = llvm.load %14 : !llvm.ptr<ptr<i8>> | |
%16 = llvm.bitcast %15 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%17 = llvm.ptrtoint %16 : !llvm.ptr<f32> to i64 | |
%18 = llvm.and %17, %3 : i64 | |
%19 = llvm.icmp "eq" %18, %4 : i64 | |
"llvm.intr.assume"(%19) : (i1) -> () | |
%20 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%21 = llvm.extractvalue %20[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%22 = llvm.getelementptr %21[%1] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>> | |
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%25 = llvm.ptrtoint %24 : !llvm.ptr<f32> to i64 | |
%26 = llvm.and %25, %3 : i64 | |
%27 = llvm.icmp "eq" %26, %4 : i64 | |
"llvm.intr.assume"(%27) : (i1) -> () | |
%28 = llvm.bitcast %8 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
%29 = llvm.load %28 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
%30 = llvm.bitcast %16 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
%31 = llvm.load %30 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
%32 = llvm.fmul %29, %31 : vector<4xf32> | |
%33 = llvm.bitcast %24 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
llvm.store %32, %33 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
llvm.return %0 : i32 | |
} | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c2 = arith.constant 2 : index | |
%c0 = arith.constant 0 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer | |
%device = hal.ex.shared_device : !hal.device | |
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator | |
hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer | |
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") | |
%buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c16} | |
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer | |
hal.command_buffer.begin<%cmd : !hal.command_buffer> | |
%_device_query_0 = util.global.load @_device_query_0 : i1 | |
cf.cond_br %_device_query_0, ^bb1, ^bb2 | |
^bb1: // pred: ^bb0 | |
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout | |
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%_executable_layout_0 : !hal.executable_layout)[%c0] bindings([ | |
%c0 = (%buffer : !hal.buffer)[%c0, %c16], | |
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c16], | |
%c2 = (%buffer_1 : !hal.buffer)[%c0, %c16] | |
]) | |
%_executable_simple_mul_dispatch_0 = util.global.load @_executable_simple_mul_dispatch_0 : !hal.executable | |
hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%_executable_simple_mul_dispatch_0 : !hal.executable)[0] workgroups([%c1, %c1, %c1]) | |
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") | |
hal.command_buffer.end<%cmd : !hal.command_buffer> | |
hal.ex.submit_and_wait %device, %cmd | |
%view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) : !hal.buffer_view | |
return %view : !hal.buffer_view | |
^bb2: // pred: ^bb0 | |
util.unreachable "device not supported in the compiled configuration" | |
} | |
} | |
// -----// IR Dump After CSE //----- // | |
#device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]}> | |
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [#hal.descriptor_set.layout<0, bindings = [#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]>]> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<()[s0] -> (s0 ceildiv 4)> | |
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert workload_per_wg = [4]> | |
module attributes {hal.device.targets = [#device_target_cpu]} { | |
util.global private @_device_query_0 : i1 | |
util.global private @_device_query_0_ok : i1 | |
util.initializer { | |
%device = hal.ex.shared_device : !hal.device | |
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false | |
util.global.store %ok, @_device_query_0_ok : i1 | |
util.global.store %value, @_device_query_0 : i1 | |
util.initializer.return | |
} | |
util.global private @_descriptor_set_layout_0 : !hal.descriptor_set_layout | |
util.initializer { | |
%device = hal.ex.shared_device : !hal.device | |
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) usage(push_only) bindings([#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]) : !hal.descriptor_set_layout | |
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout | |
util.initializer.return | |
} | |
util.global private @_executable_layout_0 : !hal.executable_layout | |
util.initializer { | |
%_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout | |
%device = hal.ex.shared_device : !hal.device | |
%executable_layout = hal.executable_layout.create device(%device : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.executable_layout | |
util.global.store %executable_layout, @_executable_layout_0 : !hal.executable_layout | |
util.initializer.return | |
} | |
util.global private @_executable_simple_mul_dispatch_0 : !hal.executable | |
util.initializer { | |
%device = hal.ex.shared_device : !hal.device | |
%_device_query_0 = util.global.load @_device_query_0 : i1 | |
cf.cond_br %_device_query_0, ^bb1, ^bb2 | |
^bb1: // pred: ^bb0 | |
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout | |
%exe = hal.executable.create device(%device : !hal.device) target(@simple_mul_dispatch_0::@embedded_elf_x86_64) layouts([%_executable_layout_0]) : !hal.executable | |
cf.br ^bb3(%exe : !hal.executable) | |
^bb2: // pred: ^bb0 | |
%0 = util.null : !hal.executable | |
cf.br ^bb3(%0 : !hal.executable) | |
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2 | |
util.global.store %1, @_executable_simple_mul_dispatch_0 : !hal.executable | |
util.initializer.return | |
} | |
hal.executable private @simple_mul_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
hal.executable.export public @simple_mul_dispatch_0 ordinal(0) layout(#executable_layout) {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%0 = affine.apply #map()[%arg1] | |
hal.return %0, %c1, %c1 : index, index, index | |
} | |
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { | |
llvm.func internal @simple_mul_dispatch_0(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} { | |
%0 = llvm.mlir.constant(0 : i32) : i32 | |
%1 = llvm.mlir.constant(2 : i64) : i64 | |
%2 = llvm.mlir.constant(1 : i64) : i64 | |
%3 = llvm.mlir.constant(63 : index) : i64 | |
%4 = llvm.mlir.constant(0 : index) : i64 | |
%5 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%6 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%7 = llvm.load %6 : !llvm.ptr<ptr<i8>> | |
%8 = llvm.bitcast %7 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%9 = llvm.ptrtoint %8 : !llvm.ptr<f32> to i64 | |
%10 = llvm.and %9, %3 : i64 | |
%11 = llvm.icmp "eq" %10, %4 : i64 | |
"llvm.intr.assume"(%11) : (i1) -> () | |
%12 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%13 = llvm.extractvalue %12[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%14 = llvm.getelementptr %13[%2] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
%15 = llvm.load %14 : !llvm.ptr<ptr<i8>> | |
%16 = llvm.bitcast %15 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%17 = llvm.ptrtoint %16 : !llvm.ptr<f32> to i64 | |
%18 = llvm.and %17, %3 : i64 | |
%19 = llvm.icmp "eq" %18, %4 : i64 | |
"llvm.intr.assume"(%19) : (i1) -> () | |
%20 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%21 = llvm.extractvalue %20[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%22 = llvm.getelementptr %21[%1] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>> | |
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%25 = llvm.ptrtoint %24 : !llvm.ptr<f32> to i64 | |
%26 = llvm.and %25, %3 : i64 | |
%27 = llvm.icmp "eq" %26, %4 : i64 | |
"llvm.intr.assume"(%27) : (i1) -> () | |
%28 = llvm.bitcast %8 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
%29 = llvm.load %28 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
%30 = llvm.bitcast %16 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
%31 = llvm.load %30 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
%32 = llvm.fmul %29, %31 : vector<4xf32> | |
%33 = llvm.bitcast %24 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
llvm.store %32, %33 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
llvm.return %0 : i32 | |
} | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c2 = arith.constant 2 : index | |
%c0 = arith.constant 0 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer | |
%device = hal.ex.shared_device : !hal.device | |
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator | |
hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer | |
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") | |
%buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c16} | |
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer | |
hal.command_buffer.begin<%cmd : !hal.command_buffer> | |
%_device_query_0 = util.global.load @_device_query_0 : i1 | |
cf.cond_br %_device_query_0, ^bb1, ^bb2 | |
^bb1: // pred: ^bb0 | |
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout | |
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%_executable_layout_0 : !hal.executable_layout)[%c0] bindings([ | |
%c0 = (%buffer : !hal.buffer)[%c0, %c16], | |
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c16], | |
%c2 = (%buffer_1 : !hal.buffer)[%c0, %c16] | |
]) | |
%_executable_simple_mul_dispatch_0 = util.global.load @_executable_simple_mul_dispatch_0 : !hal.executable | |
hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%_executable_simple_mul_dispatch_0 : !hal.executable)[0] workgroups([%c1, %c1, %c1]) | |
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") | |
hal.command_buffer.end<%cmd : !hal.command_buffer> | |
hal.ex.submit_and_wait %device, %cmd | |
%view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) : !hal.buffer_view | |
return %view : !hal.buffer_view | |
^bb2: // pred: ^bb0 | |
util.unreachable "device not supported in the compiled configuration" | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::SimplifyGlobalAccessesPass //----- // | |
util.initializer { | |
%device = hal.ex.shared_device : !hal.device | |
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false | |
util.global.store %value, @_device_query_0 : i1 | |
util.global.store %ok, @_device_query_0_ok : i1 | |
util.initializer.return | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::SimplifyGlobalAccessesPass //----- // | |
util.initializer { | |
%device = hal.ex.shared_device : !hal.device | |
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) usage(push_only) bindings([#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]) : !hal.descriptor_set_layout | |
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout | |
util.initializer.return | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::SimplifyGlobalAccessesPass //----- // | |
util.initializer { | |
%_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout | |
%device = hal.ex.shared_device : !hal.device | |
%executable_layout = hal.executable_layout.create device(%device : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.executable_layout | |
util.global.store %executable_layout, @_executable_layout_0 : !hal.executable_layout | |
util.initializer.return | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::SimplifyGlobalAccessesPass //----- // | |
util.initializer { | |
%_device_query_0 = util.global.load @_device_query_0 : i1 | |
%device = hal.ex.shared_device : !hal.device | |
cf.cond_br %_device_query_0, ^bb1, ^bb2 | |
^bb1: // pred: ^bb0 | |
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout | |
%exe = hal.executable.create device(%device : !hal.device) target(@simple_mul_dispatch_0::@embedded_elf_x86_64) layouts([%_executable_layout_0]) : !hal.executable | |
cf.br ^bb3(%exe : !hal.executable) | |
^bb2: // pred: ^bb0 | |
%0 = util.null : !hal.executable | |
cf.br ^bb3(%0 : !hal.executable) | |
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2 | |
util.global.store %1, @_executable_simple_mul_dispatch_0 : !hal.executable | |
util.initializer.return | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::SimplifyGlobalAccessesPass //----- // | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%_device_query_0 = util.global.load @_device_query_0 : i1 | |
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout | |
%_executable_simple_mul_dispatch_0 = util.global.load @_executable_simple_mul_dispatch_0 : !hal.executable | |
%c2 = arith.constant 2 : index | |
%c0 = arith.constant 0 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer | |
%device = hal.ex.shared_device : !hal.device | |
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator | |
hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer | |
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") | |
%buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c16} | |
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer | |
hal.command_buffer.begin<%cmd : !hal.command_buffer> | |
cf.cond_br %_device_query_0, ^bb1, ^bb2 | |
^bb1: // pred: ^bb0 | |
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%_executable_layout_0 : !hal.executable_layout)[%c0] bindings([ | |
%c0 = (%buffer : !hal.buffer)[%c0, %c16], | |
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c16], | |
%c2 = (%buffer_1 : !hal.buffer)[%c0, %c16] | |
]) | |
hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%_executable_simple_mul_dispatch_0 : !hal.executable)[0] workgroups([%c1, %c1, %c1]) | |
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") | |
hal.command_buffer.end<%cmd : !hal.command_buffer> | |
hal.ex.submit_and_wait %device, %cmd | |
%view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) : !hal.buffer_view | |
return %view : !hal.buffer_view | |
^bb2: // pred: ^bb0 | |
util.unreachable "device not supported in the compiled configuration" | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::Util::ApplyPatternsPass //----- // | |
#device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]}> | |
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [#hal.descriptor_set.layout<0, bindings = [#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]>]> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<()[s0] -> (s0 ceildiv 4)> | |
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert workload_per_wg = [4]> | |
// NOTE(review): auto-generated compiler IR dump (iree-compile); the comments below
// are review annotations only — the IR itself is unchanged. Per the dump marker
// preceding this module, this is the program state after Util::ApplyPatternsPass.
module attributes {hal.device.targets = [#device_target_cpu]} {
  // Cached device capability: does the shared device accept "embedded-elf-x86_64"
  // executables? %value is the query answer, %ok whether the query itself succeeded.
  util.global private @_device_query_0 : i1
  util.global private @_device_query_0_ok : i1
  util.initializer {
    %device = hal.ex.shared_device : !hal.device
    %ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false
    util.global.store %value, @_device_query_0 : i1
    util.global.store %ok, @_device_query_0_ok : i1
    util.initializer.return
  }
  // Descriptor set layout: three storage buffers (bindings 0..2), push-only usage.
  util.global private @_descriptor_set_layout_0 : !hal.descriptor_set_layout
  util.initializer {
    %device = hal.ex.shared_device : !hal.device
    %descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) usage(push_only) bindings([#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]) : !hal.descriptor_set_layout
    util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout
    util.initializer.return
  }
  // Executable layout: the single descriptor set above, zero push constants.
  util.global private @_executable_layout_0 : !hal.executable_layout
  util.initializer {
    %_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout
    %device = hal.ex.shared_device : !hal.device
    %executable_layout = hal.executable_layout.create device(%device : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.executable_layout
    util.global.store %executable_layout, @_executable_layout_0 : !hal.executable_layout
    util.initializer.return
  }
  // Executable global: created only when the format query above returned true;
  // otherwise a util.null executable is stored (the unsupported case is trapped
  // again at call time in @simple_mul below).
  util.global private @_executable_simple_mul_dispatch_0 : !hal.executable
  util.initializer {
    %_device_query_0 = util.global.load @_device_query_0 : i1
    %device = hal.ex.shared_device : !hal.device
    cf.cond_br %_device_query_0, ^bb1, ^bb2
  ^bb1: // pred: ^bb0
    %_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
    %exe = hal.executable.create device(%device : !hal.device) target(@simple_mul_dispatch_0::@embedded_elf_x86_64) layouts([%_executable_layout_0]) : !hal.executable
    cf.br ^bb3(%exe : !hal.executable)
  ^bb2: // pred: ^bb0
    %0 = util.null : !hal.executable
    cf.br ^bb3(%0 : !hal.executable)
  ^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2
    util.global.store %1, @_executable_simple_mul_dispatch_0 : !hal.executable
    util.initializer.return
  }
  hal.executable private @simple_mul_dispatch_0 {
    hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
      // Workgroup-count region: ceildiv(workload, 4) x 1 x 1 (see #map above).
      hal.executable.export public @simple_mul_dispatch_0 ordinal(0) layout(#executable_layout) {translation_info = #translation} {
      ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
        %c1 = arith.constant 1 : index
        %0 = affine.apply #map()[%arg1]
        hal.return %0, %c1, %c1 : index, index, index
      }
      builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
        // Dispatch kernel in the IREE executable ABI:
        //   i32 fn(environment*, dispatch_state*, workgroup_state*); always returns 0.
        // For each of the three bindings it loads the base pointer out of
        // dispatch_state field [10] (the ptr<ptr<i8>> binding table), casts it to
        // f32*, and asserts 64-byte alignment via (ptr & 63) == 0 + llvm.intr.assume.
        llvm.func internal @simple_mul_dispatch_0(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
          %0 = llvm.mlir.constant(0 : i32) : i32
          %1 = llvm.mlir.constant(2 : i64) : i64
          %2 = llvm.mlir.constant(1 : i64) : i64
          %3 = llvm.mlir.constant(63 : index) : i64
          %4 = llvm.mlir.constant(0 : index) : i64
          // Binding 0 -> %8 (f32*), with alignment assumption.
          %5 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
          %6 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
          %7 = llvm.load %6 : !llvm.ptr<ptr<i8>>
          %8 = llvm.bitcast %7 : !llvm.ptr<i8> to !llvm.ptr<f32>
          %9 = llvm.ptrtoint %8 : !llvm.ptr<f32> to i64
          %10 = llvm.and %9, %3 : i64
          %11 = llvm.icmp "eq" %10, %4 : i64
          "llvm.intr.assume"(%11) : (i1) -> ()
          // Binding 1 -> %16 (f32*), with alignment assumption.
          %12 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
          %13 = llvm.extractvalue %12[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
          %14 = llvm.getelementptr %13[%2] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
          %15 = llvm.load %14 : !llvm.ptr<ptr<i8>>
          %16 = llvm.bitcast %15 : !llvm.ptr<i8> to !llvm.ptr<f32>
          %17 = llvm.ptrtoint %16 : !llvm.ptr<f32> to i64
          %18 = llvm.and %17, %3 : i64
          %19 = llvm.icmp "eq" %18, %4 : i64
          "llvm.intr.assume"(%19) : (i1) -> ()
          // Binding 2 -> %24 (f32*), with alignment assumption.
          %20 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
          %21 = llvm.extractvalue %20[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
          %22 = llvm.getelementptr %21[%1] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
          %23 = llvm.load %22 : !llvm.ptr<ptr<i8>>
          %24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<f32>
          %25 = llvm.ptrtoint %24 : !llvm.ptr<f32> to i64
          %26 = llvm.and %25, %3 : i64
          %27 = llvm.icmp "eq" %26, %4 : i64
          "llvm.intr.assume"(%27) : (i1) -> ()
          // Vectorized body: out[0:4] = in0[0:4] * in1[0:4] as one 4xf32 op
          // (memory ops carry alignment 4 despite the 64-byte assumption above).
          %28 = llvm.bitcast %8 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
          %29 = llvm.load %28 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
          %30 = llvm.bitcast %16 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
          %31 = llvm.load %30 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
          %32 = llvm.fmul %29, %31 : vector<4xf32>
          %33 = llvm.bitcast %24 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
          llvm.store %32, %33 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
          llvm.return %0 : i32
        }
      }
    }
  }
  // Host-side ABI entry point: validates the two 4-element input buffer views
  // (element type %c553648160_i32 — presumably f32; TODO confirm HAL encoding),
  // allocates a 16-byte result buffer, records a one-shot command buffer that
  // pushes the three bindings and dispatches a single workgroup, submits and
  // waits, then wraps the result in a new buffer view. Falls into
  // util.unreachable when the cached device query says the compiled executable
  // format is unsupported.
  func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c1_i32 = arith.constant 1 : i32
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1 = arith.constant 1 : index
    %c4 = arith.constant 4 : index
    %c16 = arith.constant 16 : index
    %c0 = arith.constant 0 : index
    %c2 = arith.constant 2 : index
    %_device_query_0 = util.global.load @_device_query_0 : i1
    %_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
    %_executable_simple_mul_dispatch_0 = util.global.load @_executable_simple_mul_dispatch_0 : !hal.executable
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
    %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
    %device = hal.ex.shared_device : !hal.device
    %allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
    hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
    %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
    hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
    %buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c16}
    %cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
    hal.command_buffer.begin<%cmd : !hal.command_buffer>
    cf.cond_br %_device_query_0, ^bb1, ^bb2
  ^bb1: // pred: ^bb0
    hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%_executable_layout_0 : !hal.executable_layout)[%c0] bindings([
      %c0 = (%buffer : !hal.buffer)[%c0, %c16],
      %c1 = (%buffer_0 : !hal.buffer)[%c0, %c16],
      %c2 = (%buffer_1 : !hal.buffer)[%c0, %c16]
    ])
    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%_executable_simple_mul_dispatch_0 : !hal.executable)[0] workgroups([%c1, %c1, %c1])
    hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
    hal.command_buffer.end<%cmd : !hal.command_buffer>
    hal.ex.submit_and_wait %device, %cmd
    %view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) : !hal.buffer_view
    return %view : !hal.buffer_view
  ^bb2: // pred: ^bb0
    util.unreachable "device not supported in the compiled configuration"
  }
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FoldGlobalsPass //----- // | |
#device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]}> | |
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [#hal.descriptor_set.layout<0, bindings = [#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]>]> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<()[s0] -> (s0 ceildiv 4)> | |
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert workload_per_wg = [4]> | |
// NOTE(review): auto-generated compiler IR dump (iree-compile); the comments below
// are review annotations only — the IR itself is unchanged. Per the dump marker
// preceding this module, this is the state after Util::FoldGlobalsPass (the unused
// @_device_query_0_ok global from the previous dump has been folded away).
module attributes {hal.device.targets = [#device_target_cpu]} {
  // Cached device capability: does the shared device accept "embedded-elf-x86_64"
  // executables? Only the query value is kept; the %ok status is discarded.
  util.global private @_device_query_0 : i1
  util.initializer {
    %device = hal.ex.shared_device : !hal.device
    %ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false
    util.global.store %value, @_device_query_0 : i1
    util.initializer.return
  }
  // Descriptor set layout: three storage buffers (bindings 0..2), push-only usage.
  util.global private @_descriptor_set_layout_0 : !hal.descriptor_set_layout
  util.initializer {
    %device = hal.ex.shared_device : !hal.device
    %descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) usage(push_only) bindings([#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]) : !hal.descriptor_set_layout
    util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout
    util.initializer.return
  }
  // Executable layout: the single descriptor set above, zero push constants.
  util.global private @_executable_layout_0 : !hal.executable_layout
  util.initializer {
    %_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout
    %device = hal.ex.shared_device : !hal.device
    %executable_layout = hal.executable_layout.create device(%device : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.executable_layout
    util.global.store %executable_layout, @_executable_layout_0 : !hal.executable_layout
    util.initializer.return
  }
  // Executable global: created only when the format query above returned true;
  // otherwise a util.null executable is stored (the unsupported case is trapped
  // again at call time in @simple_mul below).
  util.global private @_executable_simple_mul_dispatch_0 : !hal.executable
  util.initializer {
    %_device_query_0 = util.global.load @_device_query_0 : i1
    %device = hal.ex.shared_device : !hal.device
    cf.cond_br %_device_query_0, ^bb1, ^bb2
  ^bb1: // pred: ^bb0
    %_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
    %exe = hal.executable.create device(%device : !hal.device) target(@simple_mul_dispatch_0::@embedded_elf_x86_64) layouts([%_executable_layout_0]) : !hal.executable
    cf.br ^bb3(%exe : !hal.executable)
  ^bb2: // pred: ^bb0
    %0 = util.null : !hal.executable
    cf.br ^bb3(%0 : !hal.executable)
  ^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2
    util.global.store %1, @_executable_simple_mul_dispatch_0 : !hal.executable
    util.initializer.return
  }
  hal.executable private @simple_mul_dispatch_0 {
    hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
      // Workgroup-count region: ceildiv(workload, 4) x 1 x 1 (see #map above).
      hal.executable.export public @simple_mul_dispatch_0 ordinal(0) layout(#executable_layout) {translation_info = #translation} {
      ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
        %c1 = arith.constant 1 : index
        %0 = affine.apply #map()[%arg1]
        hal.return %0, %c1, %c1 : index, index, index
      }
      builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
        // Dispatch kernel in the IREE executable ABI:
        //   i32 fn(environment*, dispatch_state*, workgroup_state*); always returns 0.
        // For each of the three bindings it loads the base pointer out of
        // dispatch_state field [10] (the ptr<ptr<i8>> binding table), casts it to
        // f32*, and asserts 64-byte alignment via (ptr & 63) == 0 + llvm.intr.assume.
        llvm.func internal @simple_mul_dispatch_0(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
          %0 = llvm.mlir.constant(0 : i32) : i32
          %1 = llvm.mlir.constant(2 : i64) : i64
          %2 = llvm.mlir.constant(1 : i64) : i64
          %3 = llvm.mlir.constant(63 : index) : i64
          %4 = llvm.mlir.constant(0 : index) : i64
          // Binding 0 -> %8 (f32*), with alignment assumption.
          %5 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
          %6 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
          %7 = llvm.load %6 : !llvm.ptr<ptr<i8>>
          %8 = llvm.bitcast %7 : !llvm.ptr<i8> to !llvm.ptr<f32>
          %9 = llvm.ptrtoint %8 : !llvm.ptr<f32> to i64
          %10 = llvm.and %9, %3 : i64
          %11 = llvm.icmp "eq" %10, %4 : i64
          "llvm.intr.assume"(%11) : (i1) -> ()
          // Binding 1 -> %16 (f32*), with alignment assumption.
          %12 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
          %13 = llvm.extractvalue %12[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
          %14 = llvm.getelementptr %13[%2] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
          %15 = llvm.load %14 : !llvm.ptr<ptr<i8>>
          %16 = llvm.bitcast %15 : !llvm.ptr<i8> to !llvm.ptr<f32>
          %17 = llvm.ptrtoint %16 : !llvm.ptr<f32> to i64
          %18 = llvm.and %17, %3 : i64
          %19 = llvm.icmp "eq" %18, %4 : i64
          "llvm.intr.assume"(%19) : (i1) -> ()
          // Binding 2 -> %24 (f32*), with alignment assumption.
          %20 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
          %21 = llvm.extractvalue %20[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
          %22 = llvm.getelementptr %21[%1] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
          %23 = llvm.load %22 : !llvm.ptr<ptr<i8>>
          %24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<f32>
          %25 = llvm.ptrtoint %24 : !llvm.ptr<f32> to i64
          %26 = llvm.and %25, %3 : i64
          %27 = llvm.icmp "eq" %26, %4 : i64
          "llvm.intr.assume"(%27) : (i1) -> ()
          // Vectorized body: out[0:4] = in0[0:4] * in1[0:4] as one 4xf32 op
          // (memory ops carry alignment 4 despite the 64-byte assumption above).
          %28 = llvm.bitcast %8 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
          %29 = llvm.load %28 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
          %30 = llvm.bitcast %16 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
          %31 = llvm.load %30 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
          %32 = llvm.fmul %29, %31 : vector<4xf32>
          %33 = llvm.bitcast %24 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
          llvm.store %32, %33 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
          llvm.return %0 : i32
        }
      }
    }
  }
  // Host-side ABI entry point: validates the two 4-element input buffer views
  // (element type %c553648160_i32 — presumably f32; TODO confirm HAL encoding),
  // allocates a 16-byte result buffer, records a one-shot command buffer that
  // pushes the three bindings and dispatches a single workgroup, submits and
  // waits, then wraps the result in a new buffer view. Falls into
  // util.unreachable when the cached device query says the compiled executable
  // format is unsupported.
  func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c1_i32 = arith.constant 1 : i32
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1 = arith.constant 1 : index
    %c4 = arith.constant 4 : index
    %c16 = arith.constant 16 : index
    %c0 = arith.constant 0 : index
    %c2 = arith.constant 2 : index
    %_device_query_0 = util.global.load @_device_query_0 : i1
    %_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
    %_executable_simple_mul_dispatch_0 = util.global.load @_executable_simple_mul_dispatch_0 : !hal.executable
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
    %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
    %device = hal.ex.shared_device : !hal.device
    %allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
    hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
    %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
    hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
    %buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c16}
    %cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
    hal.command_buffer.begin<%cmd : !hal.command_buffer>
    cf.cond_br %_device_query_0, ^bb1, ^bb2
  ^bb1: // pred: ^bb0
    hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%_executable_layout_0 : !hal.executable_layout)[%c0] bindings([
      %c0 = (%buffer : !hal.buffer)[%c0, %c16],
      %c1 = (%buffer_0 : !hal.buffer)[%c0, %c16],
      %c2 = (%buffer_1 : !hal.buffer)[%c0, %c16]
    ])
    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%_executable_simple_mul_dispatch_0 : !hal.executable)[0] workgroups([%c1, %c1, %c1])
    hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
    hal.command_buffer.end<%cmd : !hal.command_buffer>
    hal.ex.submit_and_wait %device, %cmd
    %view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) : !hal.buffer_view
    return %view : !hal.buffer_view
  ^bb2: // pred: ^bb0
    util.unreachable "device not supported in the compiled configuration"
  }
}
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FuseGlobalsPass //----- // | |
#device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]}> | |
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [#hal.descriptor_set.layout<0, bindings = [#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]>]> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#map = affine_map<()[s0] -> (s0 ceildiv 4)> | |
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert workload_per_wg = [4]> | |
// NOTE(review): auto-generated compiler IR dump (iree-compile); the comments below
// are review annotations only — the IR itself is unchanged. Per the dump marker
// preceding this module, this is the state after Util::FuseGlobalsPass; it is
// textually identical to the FoldGlobals dump above (the pass made no changes here).
module attributes {hal.device.targets = [#device_target_cpu]} {
  // Cached device capability: does the shared device accept "embedded-elf-x86_64"
  // executables? Only the query value is kept; the %ok status is discarded.
  util.global private @_device_query_0 : i1
  util.initializer {
    %device = hal.ex.shared_device : !hal.device
    %ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false
    util.global.store %value, @_device_query_0 : i1
    util.initializer.return
  }
  // Descriptor set layout: three storage buffers (bindings 0..2), push-only usage.
  util.global private @_descriptor_set_layout_0 : !hal.descriptor_set_layout
  util.initializer {
    %device = hal.ex.shared_device : !hal.device
    %descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) usage(push_only) bindings([#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]) : !hal.descriptor_set_layout
    util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout
    util.initializer.return
  }
  // Executable layout: the single descriptor set above, zero push constants.
  util.global private @_executable_layout_0 : !hal.executable_layout
  util.initializer {
    %_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout
    %device = hal.ex.shared_device : !hal.device
    %executable_layout = hal.executable_layout.create device(%device : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.executable_layout
    util.global.store %executable_layout, @_executable_layout_0 : !hal.executable_layout
    util.initializer.return
  }
  // Executable global: created only when the format query above returned true;
  // otherwise a util.null executable is stored (the unsupported case is trapped
  // again at call time in @simple_mul below).
  util.global private @_executable_simple_mul_dispatch_0 : !hal.executable
  util.initializer {
    %_device_query_0 = util.global.load @_device_query_0 : i1
    %device = hal.ex.shared_device : !hal.device
    cf.cond_br %_device_query_0, ^bb1, ^bb2
  ^bb1: // pred: ^bb0
    %_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
    %exe = hal.executable.create device(%device : !hal.device) target(@simple_mul_dispatch_0::@embedded_elf_x86_64) layouts([%_executable_layout_0]) : !hal.executable
    cf.br ^bb3(%exe : !hal.executable)
  ^bb2: // pred: ^bb0
    %0 = util.null : !hal.executable
    cf.br ^bb3(%0 : !hal.executable)
  ^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2
    util.global.store %1, @_executable_simple_mul_dispatch_0 : !hal.executable
    util.initializer.return
  }
  hal.executable private @simple_mul_dispatch_0 {
    hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
      // Workgroup-count region: ceildiv(workload, 4) x 1 x 1 (see #map above).
      hal.executable.export public @simple_mul_dispatch_0 ordinal(0) layout(#executable_layout) {translation_info = #translation} {
      ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
        %c1 = arith.constant 1 : index
        %0 = affine.apply #map()[%arg1]
        hal.return %0, %c1, %c1 : index, index, index
      }
      builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} {
        // Dispatch kernel in the IREE executable ABI:
        //   i32 fn(environment*, dispatch_state*, workgroup_state*); always returns 0.
        // For each of the three bindings it loads the base pointer out of
        // dispatch_state field [10] (the ptr<ptr<i8>> binding table), casts it to
        // f32*, and asserts 64-byte alignment via (ptr & 63) == 0 + llvm.intr.assume.
        llvm.func internal @simple_mul_dispatch_0(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} {
          %0 = llvm.mlir.constant(0 : i32) : i32
          %1 = llvm.mlir.constant(2 : i64) : i64
          %2 = llvm.mlir.constant(1 : i64) : i64
          %3 = llvm.mlir.constant(63 : index) : i64
          %4 = llvm.mlir.constant(0 : index) : i64
          // Binding 0 -> %8 (f32*), with alignment assumption.
          %5 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
          %6 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
          %7 = llvm.load %6 : !llvm.ptr<ptr<i8>>
          %8 = llvm.bitcast %7 : !llvm.ptr<i8> to !llvm.ptr<f32>
          %9 = llvm.ptrtoint %8 : !llvm.ptr<f32> to i64
          %10 = llvm.and %9, %3 : i64
          %11 = llvm.icmp "eq" %10, %4 : i64
          "llvm.intr.assume"(%11) : (i1) -> ()
          // Binding 1 -> %16 (f32*), with alignment assumption.
          %12 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
          %13 = llvm.extractvalue %12[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
          %14 = llvm.getelementptr %13[%2] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
          %15 = llvm.load %14 : !llvm.ptr<ptr<i8>>
          %16 = llvm.bitcast %15 : !llvm.ptr<i8> to !llvm.ptr<f32>
          %17 = llvm.ptrtoint %16 : !llvm.ptr<f32> to i64
          %18 = llvm.and %17, %3 : i64
          %19 = llvm.icmp "eq" %18, %4 : i64
          "llvm.intr.assume"(%19) : (i1) -> ()
          // Binding 2 -> %24 (f32*), with alignment assumption.
          %20 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>>
          %21 = llvm.extractvalue %20[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>
          %22 = llvm.getelementptr %21[%1] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>>
          %23 = llvm.load %22 : !llvm.ptr<ptr<i8>>
          %24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<f32>
          %25 = llvm.ptrtoint %24 : !llvm.ptr<f32> to i64
          %26 = llvm.and %25, %3 : i64
          %27 = llvm.icmp "eq" %26, %4 : i64
          "llvm.intr.assume"(%27) : (i1) -> ()
          // Vectorized body: out[0:4] = in0[0:4] * in1[0:4] as one 4xf32 op
          // (memory ops carry alignment 4 despite the 64-byte assumption above).
          %28 = llvm.bitcast %8 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
          %29 = llvm.load %28 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
          %30 = llvm.bitcast %16 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
          %31 = llvm.load %30 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
          %32 = llvm.fmul %29, %31 : vector<4xf32>
          %33 = llvm.bitcast %24 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>>
          llvm.store %32, %33 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>>
          llvm.return %0 : i32
        }
      }
    }
  }
  // Host-side ABI entry point: validates the two 4-element input buffer views
  // (element type %c553648160_i32 — presumably f32; TODO confirm HAL encoding),
  // allocates a 16-byte result buffer, records a one-shot command buffer that
  // pushes the three bindings and dispatches a single workgroup, submits and
  // waits, then wraps the result in a new buffer view. Falls into
  // util.unreachable when the cached device query says the compiled executable
  // format is unsupported.
  func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
    %c1_i32 = arith.constant 1 : i32
    %c553648160_i32 = arith.constant 553648160 : i32
    %c1 = arith.constant 1 : index
    %c4 = arith.constant 4 : index
    %c16 = arith.constant 16 : index
    %c0 = arith.constant 0 : index
    %c2 = arith.constant 2 : index
    %_device_query_0 = util.global.load @_device_query_0 : i1
    %_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout
    %_executable_simple_mul_dispatch_0 = util.global.load @_executable_simple_mul_dispatch_0 : !hal.executable
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
    %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
    %device = hal.ex.shared_device : !hal.device
    %allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
    hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
    hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32)
    %buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer
    hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage")
    %buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c16}
    %cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer
    hal.command_buffer.begin<%cmd : !hal.command_buffer>
    cf.cond_br %_device_query_0, ^bb1, ^bb2
  ^bb1: // pred: ^bb0
    hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%_executable_layout_0 : !hal.executable_layout)[%c0] bindings([
      %c0 = (%buffer : !hal.buffer)[%c0, %c16],
      %c1 = (%buffer_0 : !hal.buffer)[%c0, %c16],
      %c2 = (%buffer_1 : !hal.buffer)[%c0, %c16]
    ])
    hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%_executable_simple_mul_dispatch_0 : !hal.executable)[0] workgroups([%c1, %c1, %c1])
    hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
    hal.command_buffer.end<%cmd : !hal.command_buffer>
    hal.ex.submit_and_wait %device, %cmd
    %view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) : !hal.buffer_view
    return %view : !hal.buffer_view
  ^bb2: // pred: ^bb0
    util.unreachable "device not supported in the compiled configuration"
  }
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::ElideRedundantCommandsPass //----- // | |
// Per-initializer dump after ElideRedundantCommands: queries once at module
// load whether the device supports "embedded-elf-x86_64" executables and
// caches the answer in @_device_query_0.
util.initializer {
  %device = hal.ex.shared_device : !hal.device
  %ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false
  util.global.store %value, @_device_query_0 : i1
  util.initializer.return
}
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::ElideRedundantCommandsPass //----- // | |
util.initializer { | |
%device = hal.ex.shared_device : !hal.device | |
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) usage(push_only) bindings([#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]) : !hal.descriptor_set_layout | |
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout | |
util.initializer.return | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::ElideRedundantCommandsPass //----- // | |
util.initializer { | |
%_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout | |
%device = hal.ex.shared_device : !hal.device | |
%executable_layout = hal.executable_layout.create device(%device : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.executable_layout | |
util.global.store %executable_layout, @_executable_layout_0 : !hal.executable_layout | |
util.initializer.return | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::ElideRedundantCommandsPass //----- // | |
util.initializer { | |
%_device_query_0 = util.global.load @_device_query_0 : i1 | |
%device = hal.ex.shared_device : !hal.device | |
cf.cond_br %_device_query_0, ^bb1, ^bb2 | |
^bb1: // pred: ^bb0 | |
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout | |
%exe = hal.executable.create device(%device : !hal.device) target(@simple_mul_dispatch_0::@embedded_elf_x86_64) layouts([%_executable_layout_0]) : !hal.executable | |
cf.br ^bb3(%exe : !hal.executable) | |
^bb2: // pred: ^bb0 | |
%0 = util.null : !hal.executable | |
cf.br ^bb3(%0 : !hal.executable) | |
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2 | |
util.global.store %1, @_executable_simple_mul_dispatch_0 : !hal.executable | |
util.initializer.return | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::ElideRedundantCommandsPass //----- // | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c1_i32 = arith.constant 1 : i32 | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1 = arith.constant 1 : index | |
%c4 = arith.constant 4 : index | |
%c16 = arith.constant 16 : index | |
%c0 = arith.constant 0 : index | |
%c2 = arith.constant 2 : index | |
%_device_query_0 = util.global.load @_device_query_0 : i1 | |
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout | |
%_executable_simple_mul_dispatch_0 = util.global.load @_executable_simple_mul_dispatch_0 : !hal.executable | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer | |
%device = hal.ex.shared_device : !hal.device | |
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator | |
hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer | |
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") | |
%buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c16} | |
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer | |
hal.command_buffer.begin<%cmd : !hal.command_buffer> | |
cf.cond_br %_device_query_0, ^bb1, ^bb2 | |
^bb1: // pred: ^bb0 | |
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%_executable_layout_0 : !hal.executable_layout)[%c0] bindings([ | |
%c0 = (%buffer : !hal.buffer)[%c0, %c16], | |
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c16], | |
%c2 = (%buffer_1 : !hal.buffer)[%c0, %c16] | |
]) | |
hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%_executable_simple_mul_dispatch_0 : !hal.executable)[0] workgroups([%c1, %c1, %c1]) | |
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") | |
hal.command_buffer.end<%cmd : !hal.command_buffer> | |
hal.ex.submit_and_wait %device, %cmd | |
%view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) : !hal.buffer_view | |
return %view : !hal.buffer_view | |
^bb2: // pred: ^bb0 | |
util.unreachable "device not supported in the compiled configuration" | |
} | |
// -----// IR Dump After ConvertAffineToStandard //----- // | |
#device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]}> | |
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [#hal.descriptor_set.layout<0, bindings = [#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]>]> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert workload_per_wg = [4]> | |
module attributes {hal.device.targets = [#device_target_cpu]} { | |
util.global private @_device_query_0 : i1 | |
util.initializer { | |
%device = hal.ex.shared_device : !hal.device | |
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false | |
util.global.store %value, @_device_query_0 : i1 | |
util.initializer.return | |
} | |
util.global private @_descriptor_set_layout_0 : !hal.descriptor_set_layout | |
util.initializer { | |
%device = hal.ex.shared_device : !hal.device | |
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) usage(push_only) bindings([#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]) : !hal.descriptor_set_layout | |
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout | |
util.initializer.return | |
} | |
util.global private @_executable_layout_0 : !hal.executable_layout | |
util.initializer { | |
%_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout | |
%device = hal.ex.shared_device : !hal.device | |
%executable_layout = hal.executable_layout.create device(%device : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.executable_layout | |
util.global.store %executable_layout, @_executable_layout_0 : !hal.executable_layout | |
util.initializer.return | |
} | |
util.global private @_executable_simple_mul_dispatch_0 : !hal.executable | |
util.initializer { | |
%_device_query_0 = util.global.load @_device_query_0 : i1 | |
%device = hal.ex.shared_device : !hal.device | |
cf.cond_br %_device_query_0, ^bb1, ^bb2 | |
^bb1: // pred: ^bb0 | |
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout | |
%exe = hal.executable.create device(%device : !hal.device) target(@simple_mul_dispatch_0::@embedded_elf_x86_64) layouts([%_executable_layout_0]) : !hal.executable | |
cf.br ^bb3(%exe : !hal.executable) | |
^bb2: // pred: ^bb0 | |
%0 = util.null : !hal.executable | |
cf.br ^bb3(%0 : !hal.executable) | |
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2 | |
util.global.store %1, @_executable_simple_mul_dispatch_0 : !hal.executable | |
util.initializer.return | |
} | |
hal.executable private @simple_mul_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
hal.executable.export public @simple_mul_dispatch_0 ordinal(0) layout(#executable_layout) {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%c1_0 = arith.constant 1 : index | |
%0 = arith.cmpi sle, %arg1, %c0 : index | |
%1 = arith.subi %c0, %arg1 : index | |
%2 = arith.subi %arg1, %c1_0 : index | |
%3 = arith.select %0, %1, %2 : index | |
%4 = arith.divsi %3, %c4 : index | |
%5 = arith.subi %c0, %4 : index | |
%6 = arith.addi %4, %c1_0 : index | |
%7 = arith.select %0, %5, %6 : index | |
hal.return %7, %c1, %c1 : index, index, index | |
} | |
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { | |
llvm.func internal @simple_mul_dispatch_0(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} { | |
%0 = llvm.mlir.constant(0 : i32) : i32 | |
%1 = llvm.mlir.constant(2 : i64) : i64 | |
%2 = llvm.mlir.constant(1 : i64) : i64 | |
%3 = llvm.mlir.constant(63 : index) : i64 | |
%4 = llvm.mlir.constant(0 : index) : i64 | |
%5 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%6 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%7 = llvm.load %6 : !llvm.ptr<ptr<i8>> | |
%8 = llvm.bitcast %7 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%9 = llvm.ptrtoint %8 : !llvm.ptr<f32> to i64 | |
%10 = llvm.and %9, %3 : i64 | |
%11 = llvm.icmp "eq" %10, %4 : i64 | |
"llvm.intr.assume"(%11) : (i1) -> () | |
%12 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%13 = llvm.extractvalue %12[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%14 = llvm.getelementptr %13[%2] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
%15 = llvm.load %14 : !llvm.ptr<ptr<i8>> | |
%16 = llvm.bitcast %15 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%17 = llvm.ptrtoint %16 : !llvm.ptr<f32> to i64 | |
%18 = llvm.and %17, %3 : i64 | |
%19 = llvm.icmp "eq" %18, %4 : i64 | |
"llvm.intr.assume"(%19) : (i1) -> () | |
%20 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%21 = llvm.extractvalue %20[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%22 = llvm.getelementptr %21[%1] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>> | |
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%25 = llvm.ptrtoint %24 : !llvm.ptr<f32> to i64 | |
%26 = llvm.and %25, %3 : i64 | |
%27 = llvm.icmp "eq" %26, %4 : i64 | |
"llvm.intr.assume"(%27) : (i1) -> () | |
%28 = llvm.bitcast %8 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
%29 = llvm.load %28 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
%30 = llvm.bitcast %16 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
%31 = llvm.load %30 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
%32 = llvm.fmul %29, %31 : vector<4xf32> | |
%33 = llvm.bitcast %24 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
llvm.store %32, %33 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
llvm.return %0 : i32 | |
} | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c1_i32 = arith.constant 1 : i32 | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1 = arith.constant 1 : index | |
%c4 = arith.constant 4 : index | |
%c16 = arith.constant 16 : index | |
%c0 = arith.constant 0 : index | |
%c2 = arith.constant 2 : index | |
%_device_query_0 = util.global.load @_device_query_0 : i1 | |
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout | |
%_executable_simple_mul_dispatch_0 = util.global.load @_executable_simple_mul_dispatch_0 : !hal.executable | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer | |
%device = hal.ex.shared_device : !hal.device | |
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator | |
hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer | |
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") | |
%buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c16} | |
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer | |
hal.command_buffer.begin<%cmd : !hal.command_buffer> | |
cf.cond_br %_device_query_0, ^bb1, ^bb2 | |
^bb1: // pred: ^bb0 | |
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%_executable_layout_0 : !hal.executable_layout)[%c0] bindings([ | |
%c0 = (%buffer : !hal.buffer)[%c0, %c16], | |
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c16], | |
%c2 = (%buffer_1 : !hal.buffer)[%c0, %c16] | |
]) | |
hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%_executable_simple_mul_dispatch_0 : !hal.executable)[0] workgroups([%c1, %c1, %c1]) | |
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") | |
hal.command_buffer.end<%cmd : !hal.command_buffer> | |
hal.ex.submit_and_wait %device, %cmd | |
%view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) : !hal.buffer_view | |
return %view : !hal.buffer_view | |
^bb2: // pred: ^bb0 | |
util.unreachable "device not supported in the compiled configuration" | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::CombineInitializersPass //----- // | |
#device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]}> | |
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [#hal.descriptor_set.layout<0, bindings = [#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]>]> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert workload_per_wg = [4]> | |
module attributes {hal.device.targets = [#device_target_cpu]} { | |
util.global private @_device_query_0 : i1 | |
util.global private @_descriptor_set_layout_0 : !hal.descriptor_set_layout | |
util.global private @_executable_layout_0 : !hal.executable_layout | |
util.global private @_executable_simple_mul_dispatch_0 : !hal.executable | |
util.initializer { | |
%device = hal.ex.shared_device : !hal.device | |
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false | |
util.global.store %value, @_device_query_0 : i1 | |
%device_0 = hal.ex.shared_device : !hal.device | |
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device_0 : !hal.device) usage(push_only) bindings([#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]) : !hal.descriptor_set_layout | |
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout | |
%_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout | |
%device_1 = hal.ex.shared_device : !hal.device | |
%executable_layout = hal.executable_layout.create device(%device_1 : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.executable_layout | |
util.global.store %executable_layout, @_executable_layout_0 : !hal.executable_layout | |
%_device_query_0 = util.global.load @_device_query_0 : i1 | |
%device_2 = hal.ex.shared_device : !hal.device | |
cf.cond_br %_device_query_0, ^bb1, ^bb2 | |
^bb1: // pred: ^bb0 | |
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout | |
%exe = hal.executable.create device(%device_2 : !hal.device) target(@simple_mul_dispatch_0::@embedded_elf_x86_64) layouts([%_executable_layout_0]) : !hal.executable | |
cf.br ^bb3(%exe : !hal.executable) | |
^bb2: // pred: ^bb0 | |
%0 = util.null : !hal.executable | |
cf.br ^bb3(%0 : !hal.executable) | |
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2 | |
util.global.store %1, @_executable_simple_mul_dispatch_0 : !hal.executable | |
cf.br ^bb4 | |
^bb4: // pred: ^bb3 | |
util.initializer.return | |
} | |
hal.executable private @simple_mul_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
hal.executable.export public @simple_mul_dispatch_0 ordinal(0) layout(#executable_layout) {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%c1_0 = arith.constant 1 : index | |
%0 = arith.cmpi sle, %arg1, %c0 : index | |
%1 = arith.subi %c0, %arg1 : index | |
%2 = arith.subi %arg1, %c1_0 : index | |
%3 = arith.select %0, %1, %2 : index | |
%4 = arith.divsi %3, %c4 : index | |
%5 = arith.subi %c0, %4 : index | |
%6 = arith.addi %4, %c1_0 : index | |
%7 = arith.select %0, %5, %6 : index | |
hal.return %7, %c1, %c1 : index, index, index | |
} | |
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { | |
llvm.func internal @simple_mul_dispatch_0(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} { | |
%0 = llvm.mlir.constant(0 : i32) : i32 | |
%1 = llvm.mlir.constant(2 : i64) : i64 | |
%2 = llvm.mlir.constant(1 : i64) : i64 | |
%3 = llvm.mlir.constant(63 : index) : i64 | |
%4 = llvm.mlir.constant(0 : index) : i64 | |
%5 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%6 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%7 = llvm.load %6 : !llvm.ptr<ptr<i8>> | |
%8 = llvm.bitcast %7 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%9 = llvm.ptrtoint %8 : !llvm.ptr<f32> to i64 | |
%10 = llvm.and %9, %3 : i64 | |
%11 = llvm.icmp "eq" %10, %4 : i64 | |
"llvm.intr.assume"(%11) : (i1) -> () | |
%12 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%13 = llvm.extractvalue %12[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%14 = llvm.getelementptr %13[%2] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
%15 = llvm.load %14 : !llvm.ptr<ptr<i8>> | |
%16 = llvm.bitcast %15 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%17 = llvm.ptrtoint %16 : !llvm.ptr<f32> to i64 | |
%18 = llvm.and %17, %3 : i64 | |
%19 = llvm.icmp "eq" %18, %4 : i64 | |
"llvm.intr.assume"(%19) : (i1) -> () | |
%20 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%21 = llvm.extractvalue %20[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%22 = llvm.getelementptr %21[%1] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>> | |
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%25 = llvm.ptrtoint %24 : !llvm.ptr<f32> to i64 | |
%26 = llvm.and %25, %3 : i64 | |
%27 = llvm.icmp "eq" %26, %4 : i64 | |
"llvm.intr.assume"(%27) : (i1) -> () | |
%28 = llvm.bitcast %8 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
%29 = llvm.load %28 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
%30 = llvm.bitcast %16 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
%31 = llvm.load %30 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
%32 = llvm.fmul %29, %31 : vector<4xf32> | |
%33 = llvm.bitcast %24 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
llvm.store %32, %33 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
llvm.return %0 : i32 | |
} | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c1_i32 = arith.constant 1 : i32 | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1 = arith.constant 1 : index | |
%c4 = arith.constant 4 : index | |
%c16 = arith.constant 16 : index | |
%c0 = arith.constant 0 : index | |
%c2 = arith.constant 2 : index | |
%_device_query_0 = util.global.load @_device_query_0 : i1 | |
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout | |
%_executable_simple_mul_dispatch_0 = util.global.load @_executable_simple_mul_dispatch_0 : !hal.executable | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer | |
%device = hal.ex.shared_device : !hal.device | |
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator | |
hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer | |
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") | |
%buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c16} | |
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer | |
hal.command_buffer.begin<%cmd : !hal.command_buffer> | |
cf.cond_br %_device_query_0, ^bb1, ^bb2 | |
^bb1: // pred: ^bb0 | |
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%_executable_layout_0 : !hal.executable_layout)[%c0] bindings([ | |
%c0 = (%buffer : !hal.buffer)[%c0, %c16], | |
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c16], | |
%c2 = (%buffer_1 : !hal.buffer)[%c0, %c16] | |
]) | |
hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%_executable_simple_mul_dispatch_0 : !hal.executable)[0] workgroups([%c1, %c1, %c1]) | |
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") | |
hal.command_buffer.end<%cmd : !hal.command_buffer> | |
hal.ex.submit_and_wait %device, %cmd | |
%view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) : !hal.buffer_view | |
return %view : !hal.buffer_view | |
^bb2: // pred: ^bb0 | |
util.unreachable "device not supported in the compiled configuration" | |
} | |
} | |
// -----// IR Dump After Canonicalizer //----- // | |
#device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]}> | |
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [#hal.descriptor_set.layout<0, bindings = [#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]>]> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert workload_per_wg = [4]> | |
module attributes {hal.device.targets = [#device_target_cpu]} { | |
util.global private @_device_query_0 : i1 | |
util.global private @_descriptor_set_layout_0 : !hal.descriptor_set_layout | |
util.global private @_executable_layout_0 : !hal.executable_layout | |
util.global private @_executable_simple_mul_dispatch_0 : !hal.executable | |
util.initializer { | |
%device = hal.ex.shared_device : !hal.device | |
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false | |
util.global.store %value, @_device_query_0 : i1 | |
%device_0 = hal.ex.shared_device : !hal.device | |
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device_0 : !hal.device) usage(push_only) bindings([#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]) : !hal.descriptor_set_layout | |
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout | |
%_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout | |
%device_1 = hal.ex.shared_device : !hal.device | |
%executable_layout = hal.executable_layout.create device(%device_1 : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.executable_layout | |
util.global.store %executable_layout, @_executable_layout_0 : !hal.executable_layout | |
%_device_query_0 = util.global.load @_device_query_0 : i1 | |
%device_2 = hal.ex.shared_device : !hal.device | |
cf.cond_br %_device_query_0, ^bb1, ^bb2 | |
^bb1: // pred: ^bb0 | |
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout | |
%exe = hal.executable.create device(%device_2 : !hal.device) target(@simple_mul_dispatch_0::@embedded_elf_x86_64) layouts([%_executable_layout_0]) : !hal.executable | |
cf.br ^bb3(%exe : !hal.executable) | |
^bb2: // pred: ^bb0 | |
%0 = util.null : !hal.executable | |
cf.br ^bb3(%0 : !hal.executable) | |
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2 | |
util.global.store %1, @_executable_simple_mul_dispatch_0 : !hal.executable | |
util.initializer.return | |
} | |
hal.executable private @simple_mul_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
hal.executable.export public @simple_mul_dispatch_0 ordinal(0) layout(#executable_layout) {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%0 = arith.cmpi sle, %arg1, %c0 : index | |
%1 = arith.subi %c0, %arg1 : index | |
%2 = arith.subi %arg1, %c1 : index | |
%3 = arith.select %0, %1, %2 : index | |
%4 = arith.divsi %3, %c4 : index | |
%5 = arith.subi %c0, %4 : index | |
%6 = arith.addi %4, %c1 : index | |
%7 = arith.select %0, %5, %6 : index | |
hal.return %7, %c1, %c1 : index, index, index | |
} | |
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { | |
llvm.func internal @simple_mul_dispatch_0(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} { | |
%0 = llvm.mlir.constant(0 : i32) : i32 | |
%1 = llvm.mlir.constant(2 : i64) : i64 | |
%2 = llvm.mlir.constant(1 : i64) : i64 | |
%3 = llvm.mlir.constant(63 : index) : i64 | |
%4 = llvm.mlir.constant(0 : index) : i64 | |
%5 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%6 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%7 = llvm.load %6 : !llvm.ptr<ptr<i8>> | |
%8 = llvm.bitcast %7 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%9 = llvm.ptrtoint %8 : !llvm.ptr<f32> to i64 | |
%10 = llvm.and %9, %3 : i64 | |
%11 = llvm.icmp "eq" %10, %4 : i64 | |
"llvm.intr.assume"(%11) : (i1) -> () | |
%12 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%13 = llvm.extractvalue %12[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%14 = llvm.getelementptr %13[%2] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
%15 = llvm.load %14 : !llvm.ptr<ptr<i8>> | |
%16 = llvm.bitcast %15 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%17 = llvm.ptrtoint %16 : !llvm.ptr<f32> to i64 | |
%18 = llvm.and %17, %3 : i64 | |
%19 = llvm.icmp "eq" %18, %4 : i64 | |
"llvm.intr.assume"(%19) : (i1) -> () | |
%20 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%21 = llvm.extractvalue %20[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%22 = llvm.getelementptr %21[%1] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>> | |
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%25 = llvm.ptrtoint %24 : !llvm.ptr<f32> to i64 | |
%26 = llvm.and %25, %3 : i64 | |
%27 = llvm.icmp "eq" %26, %4 : i64 | |
"llvm.intr.assume"(%27) : (i1) -> () | |
%28 = llvm.bitcast %8 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
%29 = llvm.load %28 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
%30 = llvm.bitcast %16 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
%31 = llvm.load %30 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
%32 = llvm.fmul %29, %31 : vector<4xf32> | |
%33 = llvm.bitcast %24 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
llvm.store %32, %33 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
llvm.return %0 : i32 | |
} | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c1_i32 = arith.constant 1 : i32 | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1 = arith.constant 1 : index | |
%c4 = arith.constant 4 : index | |
%c16 = arith.constant 16 : index | |
%c0 = arith.constant 0 : index | |
%c2 = arith.constant 2 : index | |
%_device_query_0 = util.global.load @_device_query_0 : i1 | |
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout | |
%_executable_simple_mul_dispatch_0 = util.global.load @_executable_simple_mul_dispatch_0 : !hal.executable | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer | |
%device = hal.ex.shared_device : !hal.device | |
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator | |
hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer | |
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") | |
%buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c16} | |
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer | |
hal.command_buffer.begin<%cmd : !hal.command_buffer> | |
cf.cond_br %_device_query_0, ^bb1, ^bb2 | |
^bb1: // pred: ^bb0 | |
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%_executable_layout_0 : !hal.executable_layout)[%c0] bindings([ | |
%c0 = (%buffer : !hal.buffer)[%c0, %c16], | |
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c16], | |
%c2 = (%buffer_1 : !hal.buffer)[%c0, %c16] | |
]) | |
hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%_executable_simple_mul_dispatch_0 : !hal.executable)[0] workgroups([%c1, %c1, %c1]) | |
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") | |
hal.command_buffer.end<%cmd : !hal.command_buffer> | |
hal.ex.submit_and_wait %device, %cmd | |
%view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) : !hal.buffer_view | |
return %view : !hal.buffer_view | |
^bb2: // pred: ^bb0 | |
util.unreachable "device not supported in the compiled configuration" | |
} | |
} | |
// -----// IR Dump After CSE //----- // | |
#device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]}> | |
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [#hal.descriptor_set.layout<0, bindings = [#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]>]> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert workload_per_wg = [4]> | |
module attributes {hal.device.targets = [#device_target_cpu]} { | |
util.global private @_device_query_0 : i1 | |
util.global private @_descriptor_set_layout_0 : !hal.descriptor_set_layout | |
util.global private @_executable_layout_0 : !hal.executable_layout | |
util.global private @_executable_simple_mul_dispatch_0 : !hal.executable | |
util.initializer { | |
%device = hal.ex.shared_device : !hal.device | |
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false | |
util.global.store %value, @_device_query_0 : i1 | |
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) usage(push_only) bindings([#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]) : !hal.descriptor_set_layout | |
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout | |
%_descriptor_set_layout_0 = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout | |
%executable_layout = hal.executable_layout.create device(%device : !hal.device) push_constants(0) layouts([%_descriptor_set_layout_0]) : !hal.executable_layout | |
util.global.store %executable_layout, @_executable_layout_0 : !hal.executable_layout | |
%_device_query_0 = util.global.load @_device_query_0 : i1 | |
cf.cond_br %_device_query_0, ^bb1, ^bb2 | |
^bb1: // pred: ^bb0 | |
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout | |
%exe = hal.executable.create device(%device : !hal.device) target(@simple_mul_dispatch_0::@embedded_elf_x86_64) layouts([%_executable_layout_0]) : !hal.executable | |
cf.br ^bb3(%exe : !hal.executable) | |
^bb2: // pred: ^bb0 | |
%0 = util.null : !hal.executable | |
cf.br ^bb3(%0 : !hal.executable) | |
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2 | |
util.global.store %1, @_executable_simple_mul_dispatch_0 : !hal.executable | |
util.initializer.return | |
} | |
hal.executable private @simple_mul_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
hal.executable.export public @simple_mul_dispatch_0 ordinal(0) layout(#executable_layout) {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%0 = arith.cmpi sle, %arg1, %c0 : index | |
%1 = arith.subi %c0, %arg1 : index | |
%2 = arith.subi %arg1, %c1 : index | |
%3 = arith.select %0, %1, %2 : index | |
%4 = arith.divsi %3, %c4 : index | |
%5 = arith.subi %c0, %4 : index | |
%6 = arith.addi %4, %c1 : index | |
%7 = arith.select %0, %5, %6 : index | |
hal.return %7, %c1, %c1 : index, index, index | |
} | |
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { | |
llvm.func internal @simple_mul_dispatch_0(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} { | |
%0 = llvm.mlir.constant(0 : i32) : i32 | |
%1 = llvm.mlir.constant(2 : i64) : i64 | |
%2 = llvm.mlir.constant(1 : i64) : i64 | |
%3 = llvm.mlir.constant(63 : index) : i64 | |
%4 = llvm.mlir.constant(0 : index) : i64 | |
%5 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%6 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%7 = llvm.load %6 : !llvm.ptr<ptr<i8>> | |
%8 = llvm.bitcast %7 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%9 = llvm.ptrtoint %8 : !llvm.ptr<f32> to i64 | |
%10 = llvm.and %9, %3 : i64 | |
%11 = llvm.icmp "eq" %10, %4 : i64 | |
"llvm.intr.assume"(%11) : (i1) -> () | |
%12 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%13 = llvm.extractvalue %12[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%14 = llvm.getelementptr %13[%2] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
%15 = llvm.load %14 : !llvm.ptr<ptr<i8>> | |
%16 = llvm.bitcast %15 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%17 = llvm.ptrtoint %16 : !llvm.ptr<f32> to i64 | |
%18 = llvm.and %17, %3 : i64 | |
%19 = llvm.icmp "eq" %18, %4 : i64 | |
"llvm.intr.assume"(%19) : (i1) -> () | |
%20 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%21 = llvm.extractvalue %20[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%22 = llvm.getelementptr %21[%1] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>> | |
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%25 = llvm.ptrtoint %24 : !llvm.ptr<f32> to i64 | |
%26 = llvm.and %25, %3 : i64 | |
%27 = llvm.icmp "eq" %26, %4 : i64 | |
"llvm.intr.assume"(%27) : (i1) -> () | |
%28 = llvm.bitcast %8 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
%29 = llvm.load %28 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
%30 = llvm.bitcast %16 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
%31 = llvm.load %30 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
%32 = llvm.fmul %29, %31 : vector<4xf32> | |
%33 = llvm.bitcast %24 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
llvm.store %32, %33 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
llvm.return %0 : i32 | |
} | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c1_i32 = arith.constant 1 : i32 | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1 = arith.constant 1 : index | |
%c4 = arith.constant 4 : index | |
%c16 = arith.constant 16 : index | |
%c0 = arith.constant 0 : index | |
%c2 = arith.constant 2 : index | |
%_device_query_0 = util.global.load @_device_query_0 : i1 | |
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout | |
%_executable_simple_mul_dispatch_0 = util.global.load @_executable_simple_mul_dispatch_0 : !hal.executable | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer | |
%device = hal.ex.shared_device : !hal.device | |
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator | |
hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer | |
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") | |
%buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c16} | |
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer | |
hal.command_buffer.begin<%cmd : !hal.command_buffer> | |
cf.cond_br %_device_query_0, ^bb1, ^bb2 | |
^bb1: // pred: ^bb0 | |
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%_executable_layout_0 : !hal.executable_layout)[%c0] bindings([ | |
%c0 = (%buffer : !hal.buffer)[%c0, %c16], | |
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c16], | |
%c2 = (%buffer_1 : !hal.buffer)[%c0, %c16] | |
]) | |
hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%_executable_simple_mul_dispatch_0 : !hal.executable)[0] workgroups([%c1, %c1, %c1]) | |
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") | |
hal.command_buffer.end<%cmd : !hal.command_buffer> | |
hal.ex.submit_and_wait %device, %cmd | |
%view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) : !hal.buffer_view | |
return %view : !hal.buffer_view | |
^bb2: // pred: ^bb0 | |
util.unreachable "device not supported in the compiled configuration" | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::SimplifyGlobalAccessesPass //----- // | |
util.initializer { | |
%device = hal.ex.shared_device : !hal.device | |
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false | |
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) usage(push_only) bindings([#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]) : !hal.descriptor_set_layout | |
%executable_layout = hal.executable_layout.create device(%device : !hal.device) push_constants(0) layouts([%descriptor_set_layout]) : !hal.executable_layout | |
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout | |
util.global.store %value, @_device_query_0 : i1 | |
util.global.store %executable_layout, @_executable_layout_0 : !hal.executable_layout | |
cf.cond_br %value, ^bb1, ^bb2 | |
^bb1: // pred: ^bb0 | |
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout | |
%exe = hal.executable.create device(%device : !hal.device) target(@simple_mul_dispatch_0::@embedded_elf_x86_64) layouts([%_executable_layout_0]) : !hal.executable | |
cf.br ^bb3(%exe : !hal.executable) | |
^bb2: // pred: ^bb0 | |
%0 = util.null : !hal.executable | |
cf.br ^bb3(%0 : !hal.executable) | |
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2 | |
util.global.store %1, @_executable_simple_mul_dispatch_0 : !hal.executable | |
util.initializer.return | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::SimplifyGlobalAccessesPass //----- // | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%_device_query_0 = util.global.load @_device_query_0 : i1 | |
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout | |
%_executable_simple_mul_dispatch_0 = util.global.load @_executable_simple_mul_dispatch_0 : !hal.executable | |
%c1_i32 = arith.constant 1 : i32 | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1 = arith.constant 1 : index | |
%c4 = arith.constant 4 : index | |
%c16 = arith.constant 16 : index | |
%c0 = arith.constant 0 : index | |
%c2 = arith.constant 2 : index | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer | |
%device = hal.ex.shared_device : !hal.device | |
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator | |
hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer | |
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") | |
%buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c16} | |
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer | |
hal.command_buffer.begin<%cmd : !hal.command_buffer> | |
cf.cond_br %_device_query_0, ^bb1, ^bb2 | |
^bb1: // pred: ^bb0 | |
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%_executable_layout_0 : !hal.executable_layout)[%c0] bindings([ | |
%c0 = (%buffer : !hal.buffer)[%c0, %c16], | |
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c16], | |
%c2 = (%buffer_1 : !hal.buffer)[%c0, %c16] | |
]) | |
hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%_executable_simple_mul_dispatch_0 : !hal.executable)[0] workgroups([%c1, %c1, %c1]) | |
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") | |
hal.command_buffer.end<%cmd : !hal.command_buffer> | |
hal.ex.submit_and_wait %device, %cmd | |
%view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) : !hal.buffer_view | |
return %view : !hal.buffer_view | |
^bb2: // pred: ^bb0 | |
util.unreachable "device not supported in the compiled configuration" | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::Util::ApplyPatternsPass //----- // | |
#device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]}> | |
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [#hal.descriptor_set.layout<0, bindings = [#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]>]> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert workload_per_wg = [4]> | |
module attributes {hal.device.targets = [#device_target_cpu]} { | |
util.global private @_device_query_0 : i1 | |
util.global private @_descriptor_set_layout_0 : !hal.descriptor_set_layout | |
util.global private @_executable_layout_0 : !hal.executable_layout | |
util.global private @_executable_simple_mul_dispatch_0 : !hal.executable | |
util.initializer { | |
%device = hal.ex.shared_device : !hal.device | |
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false | |
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) usage(push_only) bindings([#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]) : !hal.descriptor_set_layout | |
%executable_layout = hal.executable_layout.create device(%device : !hal.device) push_constants(0) layouts([%descriptor_set_layout]) : !hal.executable_layout | |
util.global.store %descriptor_set_layout, @_descriptor_set_layout_0 : !hal.descriptor_set_layout | |
util.global.store %value, @_device_query_0 : i1 | |
util.global.store %executable_layout, @_executable_layout_0 : !hal.executable_layout | |
cf.cond_br %value, ^bb1, ^bb2 | |
^bb1: // pred: ^bb0 | |
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout | |
%exe = hal.executable.create device(%device : !hal.device) target(@simple_mul_dispatch_0::@embedded_elf_x86_64) layouts([%_executable_layout_0]) : !hal.executable | |
cf.br ^bb3(%exe : !hal.executable) | |
^bb2: // pred: ^bb0 | |
%0 = util.null : !hal.executable | |
cf.br ^bb3(%0 : !hal.executable) | |
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2 | |
util.global.store %1, @_executable_simple_mul_dispatch_0 : !hal.executable | |
util.initializer.return | |
} | |
hal.executable private @simple_mul_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
hal.executable.export public @simple_mul_dispatch_0 ordinal(0) layout(#executable_layout) {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%0 = arith.cmpi sle, %arg1, %c0 : index | |
%1 = arith.subi %c0, %arg1 : index | |
%2 = arith.subi %arg1, %c1 : index | |
%3 = arith.select %0, %1, %2 : index | |
%4 = arith.divsi %3, %c4 : index | |
%5 = arith.subi %c0, %4 : index | |
%6 = arith.addi %4, %c1 : index | |
%7 = arith.select %0, %5, %6 : index | |
hal.return %7, %c1, %c1 : index, index, index | |
} | |
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { | |
llvm.func internal @simple_mul_dispatch_0(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} { | |
%0 = llvm.mlir.constant(0 : i32) : i32 | |
%1 = llvm.mlir.constant(2 : i64) : i64 | |
%2 = llvm.mlir.constant(1 : i64) : i64 | |
%3 = llvm.mlir.constant(63 : index) : i64 | |
%4 = llvm.mlir.constant(0 : index) : i64 | |
%5 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%6 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%7 = llvm.load %6 : !llvm.ptr<ptr<i8>> | |
%8 = llvm.bitcast %7 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%9 = llvm.ptrtoint %8 : !llvm.ptr<f32> to i64 | |
%10 = llvm.and %9, %3 : i64 | |
%11 = llvm.icmp "eq" %10, %4 : i64 | |
"llvm.intr.assume"(%11) : (i1) -> () | |
%12 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%13 = llvm.extractvalue %12[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%14 = llvm.getelementptr %13[%2] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
%15 = llvm.load %14 : !llvm.ptr<ptr<i8>> | |
%16 = llvm.bitcast %15 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%17 = llvm.ptrtoint %16 : !llvm.ptr<f32> to i64 | |
%18 = llvm.and %17, %3 : i64 | |
%19 = llvm.icmp "eq" %18, %4 : i64 | |
"llvm.intr.assume"(%19) : (i1) -> () | |
%20 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%21 = llvm.extractvalue %20[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%22 = llvm.getelementptr %21[%1] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>> | |
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%25 = llvm.ptrtoint %24 : !llvm.ptr<f32> to i64 | |
%26 = llvm.and %25, %3 : i64 | |
%27 = llvm.icmp "eq" %26, %4 : i64 | |
"llvm.intr.assume"(%27) : (i1) -> () | |
%28 = llvm.bitcast %8 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
%29 = llvm.load %28 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
%30 = llvm.bitcast %16 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
%31 = llvm.load %30 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
%32 = llvm.fmul %29, %31 : vector<4xf32> | |
%33 = llvm.bitcast %24 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
llvm.store %32, %33 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
llvm.return %0 : i32 | |
} | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c2 = arith.constant 2 : index | |
%c0 = arith.constant 0 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
%_device_query_0 = util.global.load @_device_query_0 : i1 | |
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout | |
%_executable_simple_mul_dispatch_0 = util.global.load @_executable_simple_mul_dispatch_0 : !hal.executable | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer | |
%device = hal.ex.shared_device : !hal.device | |
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator | |
hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer | |
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") | |
%buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c16} | |
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer | |
hal.command_buffer.begin<%cmd : !hal.command_buffer> | |
cf.cond_br %_device_query_0, ^bb1, ^bb2 | |
^bb1: // pred: ^bb0 | |
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%_executable_layout_0 : !hal.executable_layout)[%c0] bindings([ | |
%c0 = (%buffer : !hal.buffer)[%c0, %c16], | |
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c16], | |
%c2 = (%buffer_1 : !hal.buffer)[%c0, %c16] | |
]) | |
hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%_executable_simple_mul_dispatch_0 : !hal.executable)[0] workgroups([%c1, %c1, %c1]) | |
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") | |
hal.command_buffer.end<%cmd : !hal.command_buffer> | |
hal.ex.submit_and_wait %device, %cmd | |
%view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) : !hal.buffer_view | |
return %view : !hal.buffer_view | |
^bb2: // pred: ^bb0 | |
util.unreachable "device not supported in the compiled configuration" | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FoldGlobalsPass //----- // | |
#device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]}> | |
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [#hal.descriptor_set.layout<0, bindings = [#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]>]> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert workload_per_wg = [4]> | |
module attributes {hal.device.targets = [#device_target_cpu]} { | |
util.global private @_device_query_0 : i1 | |
util.global private @_executable_layout_0 : !hal.executable_layout | |
util.global private @_executable_simple_mul_dispatch_0 : !hal.executable | |
util.initializer { | |
%device = hal.ex.shared_device : !hal.device | |
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false | |
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) usage(push_only) bindings([#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]) : !hal.descriptor_set_layout | |
%executable_layout = hal.executable_layout.create device(%device : !hal.device) push_constants(0) layouts([%descriptor_set_layout]) : !hal.executable_layout | |
util.global.store %value, @_device_query_0 : i1 | |
util.global.store %executable_layout, @_executable_layout_0 : !hal.executable_layout | |
cf.cond_br %value, ^bb1, ^bb2 | |
^bb1: // pred: ^bb0 | |
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout | |
%exe = hal.executable.create device(%device : !hal.device) target(@simple_mul_dispatch_0::@embedded_elf_x86_64) layouts([%_executable_layout_0]) : !hal.executable | |
cf.br ^bb3(%exe : !hal.executable) | |
^bb2: // pred: ^bb0 | |
%0 = util.null : !hal.executable | |
cf.br ^bb3(%0 : !hal.executable) | |
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2 | |
util.global.store %1, @_executable_simple_mul_dispatch_0 : !hal.executable | |
util.initializer.return | |
} | |
hal.executable private @simple_mul_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
hal.executable.export public @simple_mul_dispatch_0 ordinal(0) layout(#executable_layout) {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%0 = arith.cmpi sle, %arg1, %c0 : index | |
%1 = arith.subi %c0, %arg1 : index | |
%2 = arith.subi %arg1, %c1 : index | |
%3 = arith.select %0, %1, %2 : index | |
%4 = arith.divsi %3, %c4 : index | |
%5 = arith.subi %c0, %4 : index | |
%6 = arith.addi %4, %c1 : index | |
%7 = arith.select %0, %5, %6 : index | |
hal.return %7, %c1, %c1 : index, index, index | |
} | |
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { | |
llvm.func internal @simple_mul_dispatch_0(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} { | |
%0 = llvm.mlir.constant(0 : i32) : i32 | |
%1 = llvm.mlir.constant(2 : i64) : i64 | |
%2 = llvm.mlir.constant(1 : i64) : i64 | |
%3 = llvm.mlir.constant(63 : index) : i64 | |
%4 = llvm.mlir.constant(0 : index) : i64 | |
%5 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%6 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%7 = llvm.load %6 : !llvm.ptr<ptr<i8>> | |
%8 = llvm.bitcast %7 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%9 = llvm.ptrtoint %8 : !llvm.ptr<f32> to i64 | |
%10 = llvm.and %9, %3 : i64 | |
%11 = llvm.icmp "eq" %10, %4 : i64 | |
"llvm.intr.assume"(%11) : (i1) -> () | |
%12 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%13 = llvm.extractvalue %12[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%14 = llvm.getelementptr %13[%2] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
%15 = llvm.load %14 : !llvm.ptr<ptr<i8>> | |
%16 = llvm.bitcast %15 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%17 = llvm.ptrtoint %16 : !llvm.ptr<f32> to i64 | |
%18 = llvm.and %17, %3 : i64 | |
%19 = llvm.icmp "eq" %18, %4 : i64 | |
"llvm.intr.assume"(%19) : (i1) -> () | |
%20 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%21 = llvm.extractvalue %20[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%22 = llvm.getelementptr %21[%1] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>> | |
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%25 = llvm.ptrtoint %24 : !llvm.ptr<f32> to i64 | |
%26 = llvm.and %25, %3 : i64 | |
%27 = llvm.icmp "eq" %26, %4 : i64 | |
"llvm.intr.assume"(%27) : (i1) -> () | |
%28 = llvm.bitcast %8 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
%29 = llvm.load %28 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
%30 = llvm.bitcast %16 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
%31 = llvm.load %30 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
%32 = llvm.fmul %29, %31 : vector<4xf32> | |
%33 = llvm.bitcast %24 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
llvm.store %32, %33 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
llvm.return %0 : i32 | |
} | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c2 = arith.constant 2 : index | |
%c0 = arith.constant 0 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
%_device_query_0 = util.global.load @_device_query_0 : i1 | |
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout | |
%_executable_simple_mul_dispatch_0 = util.global.load @_executable_simple_mul_dispatch_0 : !hal.executable | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer | |
%device = hal.ex.shared_device : !hal.device | |
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator | |
hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer | |
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") | |
%buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c16} | |
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer | |
hal.command_buffer.begin<%cmd : !hal.command_buffer> | |
cf.cond_br %_device_query_0, ^bb1, ^bb2 | |
^bb1: // pred: ^bb0 | |
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%_executable_layout_0 : !hal.executable_layout)[%c0] bindings([ | |
%c0 = (%buffer : !hal.buffer)[%c0, %c16], | |
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c16], | |
%c2 = (%buffer_1 : !hal.buffer)[%c0, %c16] | |
]) | |
hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%_executable_simple_mul_dispatch_0 : !hal.executable)[0] workgroups([%c1, %c1, %c1]) | |
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") | |
hal.command_buffer.end<%cmd : !hal.command_buffer> | |
hal.ex.submit_and_wait %device, %cmd | |
%view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) : !hal.buffer_view | |
return %view : !hal.buffer_view | |
^bb2: // pred: ^bb0 | |
util.unreachable "device not supported in the compiled configuration" | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::Util::(anonymous namespace)::FuseGlobalsPass //----- // | |
#device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]}> | |
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [#hal.descriptor_set.layout<0, bindings = [#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]>]> | |
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> | |
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert workload_per_wg = [4]> | |
module attributes {hal.device.targets = [#device_target_cpu]} { | |
util.global private @_device_query_0 : i1 | |
util.global private @_executable_layout_0 : !hal.executable_layout | |
util.global private @_executable_simple_mul_dispatch_0 : !hal.executable | |
util.initializer { | |
%device = hal.ex.shared_device : !hal.device | |
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false | |
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) usage(push_only) bindings([#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]) : !hal.descriptor_set_layout | |
%executable_layout = hal.executable_layout.create device(%device : !hal.device) push_constants(0) layouts([%descriptor_set_layout]) : !hal.executable_layout | |
util.global.store %value, @_device_query_0 : i1 | |
util.global.store %executable_layout, @_executable_layout_0 : !hal.executable_layout | |
cf.cond_br %value, ^bb1, ^bb2 | |
^bb1: // pred: ^bb0 | |
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout | |
%exe = hal.executable.create device(%device : !hal.device) target(@simple_mul_dispatch_0::@embedded_elf_x86_64) layouts([%_executable_layout_0]) : !hal.executable | |
cf.br ^bb3(%exe : !hal.executable) | |
^bb2: // pred: ^bb0 | |
%0 = util.null : !hal.executable | |
cf.br ^bb3(%0 : !hal.executable) | |
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2 | |
util.global.store %1, @_executable_simple_mul_dispatch_0 : !hal.executable | |
util.initializer.return | |
} | |
hal.executable private @simple_mul_dispatch_0 { | |
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ { | |
hal.executable.export public @simple_mul_dispatch_0 ordinal(0) layout(#executable_layout) {translation_info = #translation} { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%c1 = arith.constant 1 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%0 = arith.cmpi sle, %arg1, %c0 : index | |
%1 = arith.subi %c0, %arg1 : index | |
%2 = arith.subi %arg1, %c1 : index | |
%3 = arith.select %0, %1, %2 : index | |
%4 = arith.divsi %3, %c4 : index | |
%5 = arith.subi %c0, %4 : index | |
%6 = arith.addi %4, %c1 : index | |
%7 = arith.select %0, %5, %6 : index | |
hal.return %7, %c1, %c1 : index, index, index | |
} | |
builtin.module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-unknown-eabi-elf"} { | |
llvm.func internal @simple_mul_dispatch_0(%arg0: !llvm.ptr<struct<"iree_hal_executable_environment_v0_t", (ptr<i32>, ptr<func<i32 (ptr<func<i32 (ptr<i8>)>>, ptr<i8>)>>, ptr<ptr<func<i32 (ptr<i8>)>>>, struct<"iree_hal_processor_v0_t", (array<8 x i64>)>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg1: !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> {llvm.align = 16 : i64, llvm.noalias}, %arg2: !llvm.ptr<struct<"iree_hal_executable_workgroup_state_v0_t", (i32, i32, i16, i16, i32, ptr<ptr<i8>>, i32)>> {llvm.align = 16 : i64, llvm.noalias}) -> i32 attributes {sym_visibility = "private"} { | |
%0 = llvm.mlir.constant(0 : i32) : i32 | |
%1 = llvm.mlir.constant(2 : i64) : i64 | |
%2 = llvm.mlir.constant(1 : i64) : i64 | |
%3 = llvm.mlir.constant(63 : index) : i64 | |
%4 = llvm.mlir.constant(0 : index) : i64 | |
%5 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%6 = llvm.extractvalue %5[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%7 = llvm.load %6 : !llvm.ptr<ptr<i8>> | |
%8 = llvm.bitcast %7 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%9 = llvm.ptrtoint %8 : !llvm.ptr<f32> to i64 | |
%10 = llvm.and %9, %3 : i64 | |
%11 = llvm.icmp "eq" %10, %4 : i64 | |
"llvm.intr.assume"(%11) : (i1) -> () | |
%12 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%13 = llvm.extractvalue %12[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%14 = llvm.getelementptr %13[%2] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
%15 = llvm.load %14 : !llvm.ptr<ptr<i8>> | |
%16 = llvm.bitcast %15 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%17 = llvm.ptrtoint %16 : !llvm.ptr<f32> to i64 | |
%18 = llvm.and %17, %3 : i64 | |
%19 = llvm.icmp "eq" %18, %4 : i64 | |
"llvm.intr.assume"(%19) : (i1) -> () | |
%20 = llvm.load %arg1 : !llvm.ptr<struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)>> | |
%21 = llvm.extractvalue %20[10] : !llvm.struct<"iree_hal_executable_dispatch_state_v0_t", (i32, i32, i16, i16, i32, i32, i16, i8, i8, ptr<i32>, ptr<ptr<i8>>, ptr<i64>)> | |
%22 = llvm.getelementptr %21[%1] : (!llvm.ptr<ptr<i8>>, i64) -> !llvm.ptr<ptr<i8>> | |
%23 = llvm.load %22 : !llvm.ptr<ptr<i8>> | |
%24 = llvm.bitcast %23 : !llvm.ptr<i8> to !llvm.ptr<f32> | |
%25 = llvm.ptrtoint %24 : !llvm.ptr<f32> to i64 | |
%26 = llvm.and %25, %3 : i64 | |
%27 = llvm.icmp "eq" %26, %4 : i64 | |
"llvm.intr.assume"(%27) : (i1) -> () | |
%28 = llvm.bitcast %8 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
%29 = llvm.load %28 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
%30 = llvm.bitcast %16 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
%31 = llvm.load %30 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
%32 = llvm.fmul %29, %31 : vector<4xf32> | |
%33 = llvm.bitcast %24 : !llvm.ptr<f32> to !llvm.ptr<vector<4xf32>> | |
llvm.store %32, %33 {alignment = 4 : i64} : !llvm.ptr<vector<4xf32>> | |
llvm.return %0 : i32 | |
} | |
} | |
} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c2 = arith.constant 2 : index | |
%c0 = arith.constant 0 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
%_device_query_0 = util.global.load @_device_query_0 : i1 | |
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout | |
%_executable_simple_mul_dispatch_0 = util.global.load @_executable_simple_mul_dispatch_0 : !hal.executable | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer | |
%device = hal.ex.shared_device : !hal.device | |
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator | |
hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer | |
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") | |
%buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c16} | |
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer | |
hal.command_buffer.begin<%cmd : !hal.command_buffer> | |
cf.cond_br %_device_query_0, ^bb1, ^bb2 | |
^bb1: // pred: ^bb0 | |
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%_executable_layout_0 : !hal.executable_layout)[%c0] bindings([ | |
%c0 = (%buffer : !hal.buffer)[%c0, %c16], | |
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c16], | |
%c2 = (%buffer_1 : !hal.buffer)[%c0, %c16] | |
]) | |
hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%_executable_simple_mul_dispatch_0 : !hal.executable)[0] workgroups([%c1, %c1, %c1]) | |
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") | |
hal.command_buffer.end<%cmd : !hal.command_buffer> | |
hal.ex.submit_and_wait %device, %cmd | |
%view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) : !hal.buffer_view | |
return %view : !hal.buffer_view | |
^bb2: // pred: ^bb0 | |
util.unreachable "device not supported in the compiled configuration" | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::SerializeTargetExecutablesPass //----- // | |
hal.executable private @simple_mul_dispatch_0 { | |
hal.executable.binary public @embedded_elf_x86_64 attributes {data = dense<"0x7F454C4602010100000000000000000003003E000100000030130000000000004000000000000000E00900000000000000000000400038000700400014001200060000000400000040000000000000004000000000000000400000000000000088010000000000008801000000000000080000000000000001000000040000000000000000000000000000000000000000000000000000002D030000000000002D030000000000000010000000000000010000000500000030030000000000003013000000000000301300000000000031000000000000003100000000000000001000000000000001000000060000007003000000000000702300000000000070230000000000003801000000000000380100000000000000100000000000000200000006000000E803000000000000E823000000000000E823000000000000C000000000000000C000000000000000080000000000000052E57464040000007003000000000000702300000000000070230000000000003801000000000000900C000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120006005013000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000782300000000000008000000000000001003000000000000882300000000000008000000000000003013000000000000902300000000000008000000000000001003000000000000982300000000000008000000000000002C03000000000000A02300000000000008000000000000007023000000000000C02300000000000008000000000000008823000000000000C82300000000000008000000000000002803000000000000D02300000000000008000000000000009023000000000000D8230000000000000800000000000000982300000000000073696D706C655F6D756C5F64697370617463685F300000000000000000000000554889E5488B4620488B08488B5008488B40100F28010F59020F290031C05DC331C083FF02488D0D44100000480F44C1C3000000000000000000000000000000020000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000000000000
0000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000D80000000000000009000000000000001800000000000000F9FFFF6F0000000009000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E10171B0EB44219110112060000022E006E0E030E3F19200B0000032E0111011206401831130000041D00311311011206580B590B570B000000590000000400000000000801000000000200070000000000000005000000301300000000000020000000020700000007000000010330130000000000002000000001562A000000042A0000003F130000000000001100000002010100006D6C6972002F0073696D706C655F6D756C5F64697370617463685F3000280000000200000000005D0000002A00000073696D706C655F6D756C5F64697370617463685F3000000000000E0000000200000000005D0000000000000000000000000014000000FFFFFFFF040008000178100C0708900100000000240000000000000030130000000000002000000000000000410E108602430D065B0C070800000000140000000000000050130000000000001100000000000000140000000000000000000000000000006F0000000000000014000000000000000000000000000000D300000000000000140000000000000000000000000000006F0000000000000014000000000000000000000000000000D300000000000000140000000000000000000000000000000900000000000000140000000000000000000000000000002800000000000000140000000000000000000000000000001E00000000000000140000000000000000000000000000002800000000000000140000000000000000000000000000006F00000000000000140000000000000000000000000000000301000000000000140000000000000000000000000000000300000000000000140000000000000000000000000000000300000000000000140000000000000000000000000000000300000000000000140000000000000000000000000000000300000000000000140000000000000000000000000000000300000000000000140000000000000000000000000000000300000000000000140000000000000000000000000000000300000000000000140000000000000000000000000000006F
0000000000000014000000000000000000000000000000B8000000000000001400000000000000000000000000000052010000000000001400000000000000000000000000000042030000000000009E00000004007F000000010101FB0E0D0001010101000000010000012F7573722F6C6F63616C2F676F6F676C652F686F6D652F62656E76616E696B2F7372632F6972656500003C756E6B6E6F776E3E0001000072756E74696D652F7372632F697265652F72756E74696D652F74657374646174612F73696D706C655F6D756C2E6D6C69720001000000000902301300000000000011040205010A4B0508AD080001014952454500000000000000000000000000000000000000000000000000002300000000020800E8230000000000000000000000000000010000001200060050130000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E74657874002E646174612E72656C2E726F002E64796E616D6963002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6672616D65002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D49430000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F0000000300000002000000000000001002000000000000100200000000000023000000000000000000000000000000010000000000000000000000000000001700000004000000020000000000000038020000000000003802000000000000D80000000000000001000000000000000800000000000000180000000000000021000000010000000200000000000000100300000000000010030000000000001D0000000000000000000000000000000800000000000000000000000000000029000000010000000600000000000000301300000000000030030000000000003100000000000000000000000000000010
0000000000000000000000000000002F0000000100000003000000000000007023000000000000700300000000000078000000000000000000000000000000100000000000000000000000000000003C000000060000000300000000000000E823000000000000E803000000000000C000000000000000030000000000000008000000000000001000000000000000450000000100000000000000000000000000000000000000A8040000000000004200000000000000000000000000000001000000000000000000000000000000530000000100000000000000000000000000000000000000EA040000000000005D000000000000000000000000000000010000000000000000000000000000005F000000010000003000000000000000000000000000000047050000000000001D000000000000000000000000000000010000000000000001000000000000006A000000010000000000000000000000000000000000000064050000000000002C000000000000000000000000000000010000000000000000000000000000007A0000000100000000000000000000000000000000000000900500000000000012000000000000000000000000000000010000000000000000000000000000008A0000000100000000000000000000000000000000000000A8050000000000005002000000000000000000000000000008000000000000000000000000000000970000000100000000000000000000000000000000000000F807000000000000A200000000000000000000000000000001000000000000000000000000000000A300000001000000300000000000000000000000000000009A080000000000000500000000000000000000000000000001000000000000000100000000000000AC0000000200000000000000000000000000000000000000A0080000000000004800000000000000130000000200000008000000000000001800000000000000B40000000300000000000000000000000000000000000000E808000000000000C600000000000000000000000000000001000000000000000000000000000000BE0000000300000000000000000000000000000000000000AE090000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3808xi8>, format = "embedded-elf-x86_64", mime_type = "application/x-elf"} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::HAL::SerializeExecutablesPass //----- // | |
hal.executable private @simple_mul_dispatch_0 { | |
hal.executable.binary public @embedded_elf_x86_64 attributes {data = dense<"0x7F454C4602010100000000000000000003003E000100000030130000000000004000000000000000E00900000000000000000000400038000700400014001200060000000400000040000000000000004000000000000000400000000000000088010000000000008801000000000000080000000000000001000000040000000000000000000000000000000000000000000000000000002D030000000000002D030000000000000010000000000000010000000500000030030000000000003013000000000000301300000000000031000000000000003100000000000000001000000000000001000000060000007003000000000000702300000000000070230000000000003801000000000000380100000000000000100000000000000200000006000000E803000000000000E823000000000000E823000000000000C000000000000000C000000000000000080000000000000052E57464040000007003000000000000702300000000000070230000000000003801000000000000900C000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120006005013000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000782300000000000008000000000000001003000000000000882300000000000008000000000000003013000000000000902300000000000008000000000000001003000000000000982300000000000008000000000000002C03000000000000A02300000000000008000000000000007023000000000000C02300000000000008000000000000008823000000000000C82300000000000008000000000000002803000000000000D02300000000000008000000000000009023000000000000D8230000000000000800000000000000982300000000000073696D706C655F6D756C5F64697370617463685F300000000000000000000000554889E5488B4620488B08488B5008488B40100F28010F59020F290031C05DC331C083FF02488D0D44100000480F44C1C3000000000000000000000000000000020000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000000000000
0000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000D80000000000000009000000000000001800000000000000F9FFFF6F0000000009000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E10171B0EB44219110112060000022E006E0E030E3F19200B0000032E0111011206401831130000041D00311311011206580B590B570B000000590000000400000000000801000000000200070000000000000005000000301300000000000020000000020700000007000000010330130000000000002000000001562A000000042A0000003F130000000000001100000002010100006D6C6972002F0073696D706C655F6D756C5F64697370617463685F3000280000000200000000005D0000002A00000073696D706C655F6D756C5F64697370617463685F3000000000000E0000000200000000005D0000000000000000000000000014000000FFFFFFFF040008000178100C0708900100000000240000000000000030130000000000002000000000000000410E108602430D065B0C070800000000140000000000000050130000000000001100000000000000140000000000000000000000000000006F0000000000000014000000000000000000000000000000D300000000000000140000000000000000000000000000006F0000000000000014000000000000000000000000000000D300000000000000140000000000000000000000000000000900000000000000140000000000000000000000000000002800000000000000140000000000000000000000000000001E00000000000000140000000000000000000000000000002800000000000000140000000000000000000000000000006F00000000000000140000000000000000000000000000000301000000000000140000000000000000000000000000000300000000000000140000000000000000000000000000000300000000000000140000000000000000000000000000000300000000000000140000000000000000000000000000000300000000000000140000000000000000000000000000000300000000000000140000000000000000000000000000000300000000000000140000000000000000000000000000000300000000000000140000000000000000000000000000006F
0000000000000014000000000000000000000000000000B8000000000000001400000000000000000000000000000052010000000000001400000000000000000000000000000042030000000000009E00000004007F000000010101FB0E0D0001010101000000010000012F7573722F6C6F63616C2F676F6F676C652F686F6D652F62656E76616E696B2F7372632F6972656500003C756E6B6E6F776E3E0001000072756E74696D652F7372632F697265652F72756E74696D652F74657374646174612F73696D706C655F6D756C2E6D6C69720001000000000902301300000000000011040205010A4B0508AD080001014952454500000000000000000000000000000000000000000000000000002300000000020800E8230000000000000000000000000000010000001200060050130000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E74657874002E646174612E72656C2E726F002E64796E616D6963002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6672616D65002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D49430000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F0000000300000002000000000000001002000000000000100200000000000023000000000000000000000000000000010000000000000000000000000000001700000004000000020000000000000038020000000000003802000000000000D80000000000000001000000000000000800000000000000180000000000000021000000010000000200000000000000100300000000000010030000000000001D0000000000000000000000000000000800000000000000000000000000000029000000010000000600000000000000301300000000000030030000000000003100000000000000000000000000000010
0000000000000000000000000000002F0000000100000003000000000000007023000000000000700300000000000078000000000000000000000000000000100000000000000000000000000000003C000000060000000300000000000000E823000000000000E803000000000000C000000000000000030000000000000008000000000000001000000000000000450000000100000000000000000000000000000000000000A8040000000000004200000000000000000000000000000001000000000000000000000000000000530000000100000000000000000000000000000000000000EA040000000000005D000000000000000000000000000000010000000000000000000000000000005F000000010000003000000000000000000000000000000047050000000000001D000000000000000000000000000000010000000000000001000000000000006A000000010000000000000000000000000000000000000064050000000000002C000000000000000000000000000000010000000000000000000000000000007A0000000100000000000000000000000000000000000000900500000000000012000000000000000000000000000000010000000000000000000000000000008A0000000100000000000000000000000000000000000000A8050000000000005002000000000000000000000000000008000000000000000000000000000000970000000100000000000000000000000000000000000000F807000000000000A200000000000000000000000000000001000000000000000000000000000000A300000001000000300000000000000000000000000000009A080000000000000500000000000000000000000000000001000000000000000100000000000000AC0000000200000000000000000000000000000000000000A0080000000000004800000000000000130000000200000008000000000000001800000000000000B40000000300000000000000000000000000000000000000E808000000000000C600000000000000000000000000000001000000000000000000000000000000BE0000000300000000000000000000000000000000000000AE090000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3808xi8>, format = "embedded-elf-x86_64", mime_type = "application/x-elf"} | |
} | |
// -----// IR Dump After SymbolDCE //----- // | |
#device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]}> | |
module attributes {hal.device.targets = [#device_target_cpu]} { | |
util.global private @_device_query_0 : i1 | |
util.global private @_executable_layout_0 : !hal.executable_layout | |
util.global private @_executable_simple_mul_dispatch_0 : !hal.executable | |
util.initializer { | |
%device = hal.ex.shared_device : !hal.device | |
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false | |
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) usage(push_only) bindings([#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]) : !hal.descriptor_set_layout | |
%executable_layout = hal.executable_layout.create device(%device : !hal.device) push_constants(0) layouts([%descriptor_set_layout]) : !hal.executable_layout | |
util.global.store %value, @_device_query_0 : i1 | |
util.global.store %executable_layout, @_executable_layout_0 : !hal.executable_layout | |
cf.cond_br %value, ^bb1, ^bb2 | |
^bb1: // pred: ^bb0 | |
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout | |
%exe = hal.executable.create device(%device : !hal.device) target(@simple_mul_dispatch_0::@embedded_elf_x86_64) layouts([%_executable_layout_0]) : !hal.executable | |
cf.br ^bb3(%exe : !hal.executable) | |
^bb2: // pred: ^bb0 | |
%0 = util.null : !hal.executable | |
cf.br ^bb3(%0 : !hal.executable) | |
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2 | |
util.global.store %1, @_executable_simple_mul_dispatch_0 : !hal.executable | |
util.initializer.return | |
} | |
hal.executable private @simple_mul_dispatch_0 { | |
hal.executable.binary public @embedded_elf_x86_64 attributes {data = dense<"0x7F454C4602010100000000000000000003003E000100000030130000000000004000000000000000E00900000000000000000000400038000700400014001200060000000400000040000000000000004000000000000000400000000000000088010000000000008801000000000000080000000000000001000000040000000000000000000000000000000000000000000000000000002D030000000000002D030000000000000010000000000000010000000500000030030000000000003013000000000000301300000000000031000000000000003100000000000000001000000000000001000000060000007003000000000000702300000000000070230000000000003801000000000000380100000000000000100000000000000200000006000000E803000000000000E823000000000000E823000000000000C000000000000000C000000000000000080000000000000052E57464040000007003000000000000702300000000000070230000000000003801000000000000900C000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120006005013000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000782300000000000008000000000000001003000000000000882300000000000008000000000000003013000000000000902300000000000008000000000000001003000000000000982300000000000008000000000000002C03000000000000A02300000000000008000000000000007023000000000000C02300000000000008000000000000008823000000000000C82300000000000008000000000000002803000000000000D02300000000000008000000000000009023000000000000D8230000000000000800000000000000982300000000000073696D706C655F6D756C5F64697370617463685F300000000000000000000000554889E5488B4620488B08488B5008488B40100F28010F59020F290031C05DC331C083FF02488D0D44100000480F44C1C3000000000000000000000000000000020000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000000000000
0000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000D80000000000000009000000000000001800000000000000F9FFFF6F0000000009000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E10171B0EB44219110112060000022E006E0E030E3F19200B0000032E0111011206401831130000041D00311311011206580B590B570B000000590000000400000000000801000000000200070000000000000005000000301300000000000020000000020700000007000000010330130000000000002000000001562A000000042A0000003F130000000000001100000002010100006D6C6972002F0073696D706C655F6D756C5F64697370617463685F3000280000000200000000005D0000002A00000073696D706C655F6D756C5F64697370617463685F3000000000000E0000000200000000005D0000000000000000000000000014000000FFFFFFFF040008000178100C0708900100000000240000000000000030130000000000002000000000000000410E108602430D065B0C070800000000140000000000000050130000000000001100000000000000140000000000000000000000000000006F0000000000000014000000000000000000000000000000D300000000000000140000000000000000000000000000006F0000000000000014000000000000000000000000000000D300000000000000140000000000000000000000000000000900000000000000140000000000000000000000000000002800000000000000140000000000000000000000000000001E00000000000000140000000000000000000000000000002800000000000000140000000000000000000000000000006F00000000000000140000000000000000000000000000000301000000000000140000000000000000000000000000000300000000000000140000000000000000000000000000000300000000000000140000000000000000000000000000000300000000000000140000000000000000000000000000000300000000000000140000000000000000000000000000000300000000000000140000000000000000000000000000000300000000000000140000000000000000000000000000000300000000000000140000000000000000000000000000006F
0000000000000014000000000000000000000000000000B8000000000000001400000000000000000000000000000052010000000000001400000000000000000000000000000042030000000000009E00000004007F000000010101FB0E0D0001010101000000010000012F7573722F6C6F63616C2F676F6F676C652F686F6D652F62656E76616E696B2F7372632F6972656500003C756E6B6E6F776E3E0001000072756E74696D652F7372632F697265652F72756E74696D652F74657374646174612F73696D706C655F6D756C2E6D6C69720001000000000902301300000000000011040205010A4B0508AD080001014952454500000000000000000000000000000000000000000000000000002300000000020800E8230000000000000000000000000000010000001200060050130000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E74657874002E646174612E72656C2E726F002E64796E616D6963002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6672616D65002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D49430000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F0000000300000002000000000000001002000000000000100200000000000023000000000000000000000000000000010000000000000000000000000000001700000004000000020000000000000038020000000000003802000000000000D80000000000000001000000000000000800000000000000180000000000000021000000010000000200000000000000100300000000000010030000000000001D0000000000000000000000000000000800000000000000000000000000000029000000010000000600000000000000301300000000000030030000000000003100000000000000000000000000000010
0000000000000000000000000000002F0000000100000003000000000000007023000000000000700300000000000078000000000000000000000000000000100000000000000000000000000000003C000000060000000300000000000000E823000000000000E803000000000000C000000000000000030000000000000008000000000000001000000000000000450000000100000000000000000000000000000000000000A8040000000000004200000000000000000000000000000001000000000000000000000000000000530000000100000000000000000000000000000000000000EA040000000000005D000000000000000000000000000000010000000000000000000000000000005F000000010000003000000000000000000000000000000047050000000000001D000000000000000000000000000000010000000000000001000000000000006A000000010000000000000000000000000000000000000064050000000000002C000000000000000000000000000000010000000000000000000000000000007A0000000100000000000000000000000000000000000000900500000000000012000000000000000000000000000000010000000000000000000000000000008A0000000100000000000000000000000000000000000000A8050000000000005002000000000000000000000000000008000000000000000000000000000000970000000100000000000000000000000000000000000000F807000000000000A200000000000000000000000000000001000000000000000000000000000000A300000001000000300000000000000000000000000000009A080000000000000500000000000000000000000000000001000000000000000100000000000000AC0000000200000000000000000000000000000000000000A0080000000000004800000000000000130000000200000008000000000000001800000000000000B40000000300000000000000000000000000000000000000E808000000000000C600000000000000000000000000000001000000000000000000000000000000BE0000000300000000000000000000000000000000000000AE090000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3808xi8>, format = "embedded-elf-x86_64", mime_type = "application/x-elf"} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c2 = arith.constant 2 : index | |
%c0 = arith.constant 0 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
%_device_query_0 = util.global.load @_device_query_0 : i1 | |
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout | |
%_executable_simple_mul_dispatch_0 = util.global.load @_executable_simple_mul_dispatch_0 : !hal.executable | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer | |
%device = hal.ex.shared_device : !hal.device | |
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator | |
hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer | |
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") | |
%buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c16} | |
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer | |
hal.command_buffer.begin<%cmd : !hal.command_buffer> | |
cf.cond_br %_device_query_0, ^bb1, ^bb2 | |
^bb1: // pred: ^bb0 | |
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%_executable_layout_0 : !hal.executable_layout)[%c0] bindings([ | |
%c0 = (%buffer : !hal.buffer)[%c0, %c16], | |
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c16], | |
%c2 = (%buffer_1 : !hal.buffer)[%c0, %c16] | |
]) | |
hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%_executable_simple_mul_dispatch_0 : !hal.executable)[0] workgroups([%c1, %c1, %c1]) | |
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") | |
hal.command_buffer.end<%cmd : !hal.command_buffer> | |
hal.ex.submit_and_wait %device, %cmd | |
%view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) : !hal.buffer_view | |
return %view : !hal.buffer_view | |
^bb2: // pred: ^bb0 | |
util.unreachable "device not supported in the compiled configuration" | |
} | |
} | |
// -----// IR Dump After LoopInvariantCodeMotion //----- // | |
util.initializer { | |
%device = hal.ex.shared_device : !hal.device | |
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false | |
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) usage(push_only) bindings([#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]) : !hal.descriptor_set_layout | |
%executable_layout = hal.executable_layout.create device(%device : !hal.device) push_constants(0) layouts([%descriptor_set_layout]) : !hal.executable_layout | |
util.global.store %value, @_device_query_0 : i1 | |
util.global.store %executable_layout, @_executable_layout_0 : !hal.executable_layout | |
cf.cond_br %value, ^bb1, ^bb2 | |
^bb1: // pred: ^bb0 | |
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout | |
%exe = hal.executable.create device(%device : !hal.device) target(@simple_mul_dispatch_0::@embedded_elf_x86_64) layouts([%_executable_layout_0]) : !hal.executable | |
cf.br ^bb3(%exe : !hal.executable) | |
^bb2: // pred: ^bb0 | |
%0 = util.null : !hal.executable | |
cf.br ^bb3(%0 : !hal.executable) | |
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2 | |
util.global.store %1, @_executable_simple_mul_dispatch_0 : !hal.executable | |
util.initializer.return | |
} | |
// -----// IR Dump After SCFToControlFlow //----- // | |
util.initializer { | |
%device = hal.ex.shared_device : !hal.device | |
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false | |
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) usage(push_only) bindings([#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]) : !hal.descriptor_set_layout | |
%executable_layout = hal.executable_layout.create device(%device : !hal.device) push_constants(0) layouts([%descriptor_set_layout]) : !hal.executable_layout | |
util.global.store %value, @_device_query_0 : i1 | |
util.global.store %executable_layout, @_executable_layout_0 : !hal.executable_layout | |
cf.cond_br %value, ^bb1, ^bb2 | |
^bb1: // pred: ^bb0 | |
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout | |
%exe = hal.executable.create device(%device : !hal.device) target(@simple_mul_dispatch_0::@embedded_elf_x86_64) layouts([%_executable_layout_0]) : !hal.executable | |
cf.br ^bb3(%exe : !hal.executable) | |
^bb2: // pred: ^bb0 | |
%0 = util.null : !hal.executable | |
cf.br ^bb3(%0 : !hal.executable) | |
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2 | |
util.global.store %1, @_executable_simple_mul_dispatch_0 : !hal.executable | |
util.initializer.return | |
} | |
// -----// IR Dump After LoopCoalescing //----- // | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c2 = arith.constant 2 : index | |
%c0 = arith.constant 0 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
%_device_query_0 = util.global.load @_device_query_0 : i1 | |
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout | |
%_executable_simple_mul_dispatch_0 = util.global.load @_executable_simple_mul_dispatch_0 : !hal.executable | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer | |
%device = hal.ex.shared_device : !hal.device | |
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator | |
hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer | |
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") | |
%buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c16} | |
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer | |
hal.command_buffer.begin<%cmd : !hal.command_buffer> | |
cf.cond_br %_device_query_0, ^bb1, ^bb2 | |
^bb1: // pred: ^bb0 | |
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%_executable_layout_0 : !hal.executable_layout)[%c0] bindings([ | |
%c0 = (%buffer : !hal.buffer)[%c0, %c16], | |
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c16], | |
%c2 = (%buffer_1 : !hal.buffer)[%c0, %c16] | |
]) | |
hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%_executable_simple_mul_dispatch_0 : !hal.executable)[0] workgroups([%c1, %c1, %c1]) | |
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") | |
hal.command_buffer.end<%cmd : !hal.command_buffer> | |
hal.ex.submit_and_wait %device, %cmd | |
%view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) : !hal.buffer_view | |
return %view : !hal.buffer_view | |
^bb2: // pred: ^bb0 | |
util.unreachable "device not supported in the compiled configuration" | |
} | |
// -----// IR Dump After LoopInvariantCodeMotion //----- // | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c2 = arith.constant 2 : index | |
%c0 = arith.constant 0 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
%_device_query_0 = util.global.load @_device_query_0 : i1 | |
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout | |
%_executable_simple_mul_dispatch_0 = util.global.load @_executable_simple_mul_dispatch_0 : !hal.executable | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer | |
%device = hal.ex.shared_device : !hal.device | |
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator | |
hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer | |
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") | |
%buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c16} | |
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer | |
hal.command_buffer.begin<%cmd : !hal.command_buffer> | |
cf.cond_br %_device_query_0, ^bb1, ^bb2 | |
^bb1: // pred: ^bb0 | |
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%_executable_layout_0 : !hal.executable_layout)[%c0] bindings([ | |
%c0 = (%buffer : !hal.buffer)[%c0, %c16], | |
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c16], | |
%c2 = (%buffer_1 : !hal.buffer)[%c0, %c16] | |
]) | |
hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%_executable_simple_mul_dispatch_0 : !hal.executable)[0] workgroups([%c1, %c1, %c1]) | |
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") | |
hal.command_buffer.end<%cmd : !hal.command_buffer> | |
hal.ex.submit_and_wait %device, %cmd | |
%view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) : !hal.buffer_view | |
return %view : !hal.buffer_view | |
^bb2: // pred: ^bb0 | |
util.unreachable "device not supported in the compiled configuration" | |
} | |
// -----// IR Dump After SCFToControlFlow //----- // | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c2 = arith.constant 2 : index | |
%c0 = arith.constant 0 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
%_device_query_0 = util.global.load @_device_query_0 : i1 | |
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout | |
%_executable_simple_mul_dispatch_0 = util.global.load @_executable_simple_mul_dispatch_0 : !hal.executable | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer | |
%device = hal.ex.shared_device : !hal.device | |
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator | |
hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer | |
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") | |
%buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c16} | |
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer | |
hal.command_buffer.begin<%cmd : !hal.command_buffer> | |
cf.cond_br %_device_query_0, ^bb1, ^bb2 | |
^bb1: // pred: ^bb0 | |
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%_executable_layout_0 : !hal.executable_layout)[%c0] bindings([ | |
%c0 = (%buffer : !hal.buffer)[%c0, %c16], | |
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c16], | |
%c2 = (%buffer_1 : !hal.buffer)[%c0, %c16] | |
]) | |
hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%_executable_simple_mul_dispatch_0 : !hal.executable)[0] workgroups([%c1, %c1, %c1]) | |
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") | |
hal.command_buffer.end<%cmd : !hal.command_buffer> | |
hal.ex.submit_and_wait %device, %cmd | |
%view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) : !hal.buffer_view | |
return %view : !hal.buffer_view | |
^bb2: // pred: ^bb0 | |
util.unreachable "device not supported in the compiled configuration" | |
} | |
// -----// IR Dump After Canonicalizer //----- // | |
#device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]}> | |
module attributes {hal.device.targets = [#device_target_cpu]} { | |
util.global private @_device_query_0 : i1 | |
util.global private @_executable_layout_0 : !hal.executable_layout | |
util.global private @_executable_simple_mul_dispatch_0 : !hal.executable | |
util.initializer { | |
%device = hal.ex.shared_device : !hal.device | |
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false | |
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) usage(push_only) bindings([#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]) : !hal.descriptor_set_layout | |
%executable_layout = hal.executable_layout.create device(%device : !hal.device) push_constants(0) layouts([%descriptor_set_layout]) : !hal.executable_layout | |
util.global.store %value, @_device_query_0 : i1 | |
util.global.store %executable_layout, @_executable_layout_0 : !hal.executable_layout | |
cf.cond_br %value, ^bb1, ^bb2 | |
^bb1: // pred: ^bb0 | |
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout | |
%exe = hal.executable.create device(%device : !hal.device) target(@simple_mul_dispatch_0::@embedded_elf_x86_64) layouts([%_executable_layout_0]) : !hal.executable | |
cf.br ^bb3(%exe : !hal.executable) | |
^bb2: // pred: ^bb0 | |
%0 = util.null : !hal.executable | |
cf.br ^bb3(%0 : !hal.executable) | |
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2 | |
util.global.store %1, @_executable_simple_mul_dispatch_0 : !hal.executable | |
util.initializer.return | |
} | |
hal.executable private @simple_mul_dispatch_0 { | |
hal.executable.binary public @embedded_elf_x86_64 attributes {data = dense<"0x7F454C4602010100000000000000000003003E000100000030130000000000004000000000000000E00900000000000000000000400038000700400014001200060000000400000040000000000000004000000000000000400000000000000088010000000000008801000000000000080000000000000001000000040000000000000000000000000000000000000000000000000000002D030000000000002D030000000000000010000000000000010000000500000030030000000000003013000000000000301300000000000031000000000000003100000000000000001000000000000001000000060000007003000000000000702300000000000070230000000000003801000000000000380100000000000000100000000000000200000006000000E803000000000000E823000000000000E823000000000000C000000000000000C000000000000000080000000000000052E57464040000007003000000000000702300000000000070230000000000003801000000000000900C000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120006005013000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000782300000000000008000000000000001003000000000000882300000000000008000000000000003013000000000000902300000000000008000000000000001003000000000000982300000000000008000000000000002C03000000000000A02300000000000008000000000000007023000000000000C02300000000000008000000000000008823000000000000C82300000000000008000000000000002803000000000000D02300000000000008000000000000009023000000000000D8230000000000000800000000000000982300000000000073696D706C655F6D756C5F64697370617463685F300000000000000000000000554889E5488B4620488B08488B5008488B40100F28010F59020F290031C05DC331C083FF02488D0D44100000480F44C1C3000000000000000000000000000000020000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000000000000
0000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000D80000000000000009000000000000001800000000000000F9FFFF6F0000000009000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E10171B0EB44219110112060000022E006E0E030E3F19200B0000032E0111011206401831130000041D00311311011206580B590B570B000000590000000400000000000801000000000200070000000000000005000000301300000000000020000000020700000007000000010330130000000000002000000001562A000000042A0000003F130000000000001100000002010100006D6C6972002F0073696D706C655F6D756C5F64697370617463685F3000280000000200000000005D0000002A00000073696D706C655F6D756C5F64697370617463685F3000000000000E0000000200000000005D0000000000000000000000000014000000FFFFFFFF040008000178100C0708900100000000240000000000000030130000000000002000000000000000410E108602430D065B0C070800000000140000000000000050130000000000001100000000000000140000000000000000000000000000006F0000000000000014000000000000000000000000000000D300000000000000140000000000000000000000000000006F0000000000000014000000000000000000000000000000D300000000000000140000000000000000000000000000000900000000000000140000000000000000000000000000002800000000000000140000000000000000000000000000001E00000000000000140000000000000000000000000000002800000000000000140000000000000000000000000000006F00000000000000140000000000000000000000000000000301000000000000140000000000000000000000000000000300000000000000140000000000000000000000000000000300000000000000140000000000000000000000000000000300000000000000140000000000000000000000000000000300000000000000140000000000000000000000000000000300000000000000140000000000000000000000000000000300000000000000140000000000000000000000000000000300000000000000140000000000000000000000000000006F
0000000000000014000000000000000000000000000000B8000000000000001400000000000000000000000000000052010000000000001400000000000000000000000000000042030000000000009E00000004007F000000010101FB0E0D0001010101000000010000012F7573722F6C6F63616C2F676F6F676C652F686F6D652F62656E76616E696B2F7372632F6972656500003C756E6B6E6F776E3E0001000072756E74696D652F7372632F697265652F72756E74696D652F74657374646174612F73696D706C655F6D756C2E6D6C69720001000000000902301300000000000011040205010A4B0508AD080001014952454500000000000000000000000000000000000000000000000000002300000000020800E8230000000000000000000000000000010000001200060050130000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E74657874002E646174612E72656C2E726F002E64796E616D6963002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6672616D65002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D49430000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F0000000300000002000000000000001002000000000000100200000000000023000000000000000000000000000000010000000000000000000000000000001700000004000000020000000000000038020000000000003802000000000000D80000000000000001000000000000000800000000000000180000000000000021000000010000000200000000000000100300000000000010030000000000001D0000000000000000000000000000000800000000000000000000000000000029000000010000000600000000000000301300000000000030030000000000003100000000000000000000000000000010
0000000000000000000000000000002F0000000100000003000000000000007023000000000000700300000000000078000000000000000000000000000000100000000000000000000000000000003C000000060000000300000000000000E823000000000000E803000000000000C000000000000000030000000000000008000000000000001000000000000000450000000100000000000000000000000000000000000000A8040000000000004200000000000000000000000000000001000000000000000000000000000000530000000100000000000000000000000000000000000000EA040000000000005D000000000000000000000000000000010000000000000000000000000000005F000000010000003000000000000000000000000000000047050000000000001D000000000000000000000000000000010000000000000001000000000000006A000000010000000000000000000000000000000000000064050000000000002C000000000000000000000000000000010000000000000000000000000000007A0000000100000000000000000000000000000000000000900500000000000012000000000000000000000000000000010000000000000000000000000000008A0000000100000000000000000000000000000000000000A8050000000000005002000000000000000000000000000008000000000000000000000000000000970000000100000000000000000000000000000000000000F807000000000000A200000000000000000000000000000001000000000000000000000000000000A300000001000000300000000000000000000000000000009A080000000000000500000000000000000000000000000001000000000000000100000000000000AC0000000200000000000000000000000000000000000000A0080000000000004800000000000000130000000200000008000000000000001800000000000000B40000000300000000000000000000000000000000000000E808000000000000C600000000000000000000000000000001000000000000000000000000000000BE0000000300000000000000000000000000000000000000AE090000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3808xi8>, format = "embedded-elf-x86_64", mime_type = "application/x-elf"} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c2 = arith.constant 2 : index | |
%c0 = arith.constant 0 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
%_device_query_0 = util.global.load @_device_query_0 : i1 | |
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout | |
%_executable_simple_mul_dispatch_0 = util.global.load @_executable_simple_mul_dispatch_0 : !hal.executable | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer | |
%device = hal.ex.shared_device : !hal.device | |
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator | |
hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer | |
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") | |
%buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c16} | |
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer | |
hal.command_buffer.begin<%cmd : !hal.command_buffer> | |
cf.cond_br %_device_query_0, ^bb1, ^bb2 | |
^bb1: // pred: ^bb0 | |
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%_executable_layout_0 : !hal.executable_layout)[%c0] bindings([ | |
%c0 = (%buffer : !hal.buffer)[%c0, %c16], | |
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c16], | |
%c2 = (%buffer_1 : !hal.buffer)[%c0, %c16] | |
]) | |
hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%_executable_simple_mul_dispatch_0 : !hal.executable)[0] workgroups([%c1, %c1, %c1]) | |
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") | |
hal.command_buffer.end<%cmd : !hal.command_buffer> | |
hal.ex.submit_and_wait %device, %cmd | |
%view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) : !hal.buffer_view | |
return %view : !hal.buffer_view | |
^bb2: // pred: ^bb0 | |
util.unreachable "device not supported in the compiled configuration" | |
} | |
} | |
// -----// IR Dump After CSE //----- // | |
#device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]}> | |
module attributes {hal.device.targets = [#device_target_cpu]} { | |
util.global private @_device_query_0 : i1 | |
util.global private @_executable_layout_0 : !hal.executable_layout | |
util.global private @_executable_simple_mul_dispatch_0 : !hal.executable | |
util.initializer { | |
%device = hal.ex.shared_device : !hal.device | |
%ok, %value = hal.device.query<%device : !hal.device> key("hal.executable.format" :: "embedded-elf-x86_64") : i1, i1 = false | |
%descriptor_set_layout = hal.descriptor_set_layout.create device(%device : !hal.device) usage(push_only) bindings([#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]) : !hal.descriptor_set_layout | |
%executable_layout = hal.executable_layout.create device(%device : !hal.device) push_constants(0) layouts([%descriptor_set_layout]) : !hal.executable_layout | |
util.global.store %value, @_device_query_0 : i1 | |
util.global.store %executable_layout, @_executable_layout_0 : !hal.executable_layout | |
cf.cond_br %value, ^bb1, ^bb2 | |
^bb1: // pred: ^bb0 | |
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout | |
%exe = hal.executable.create device(%device : !hal.device) target(@simple_mul_dispatch_0::@embedded_elf_x86_64) layouts([%_executable_layout_0]) : !hal.executable | |
cf.br ^bb3(%exe : !hal.executable) | |
^bb2: // pred: ^bb0 | |
%0 = util.null : !hal.executable | |
cf.br ^bb3(%0 : !hal.executable) | |
^bb3(%1: !hal.executable): // 2 preds: ^bb1, ^bb2 | |
util.global.store %1, @_executable_simple_mul_dispatch_0 : !hal.executable | |
util.initializer.return | |
} | |
hal.executable private @simple_mul_dispatch_0 { | |
hal.executable.binary public @embedded_elf_x86_64 attributes {data = dense<"0x7F454C4602010100000000000000000003003E000100000030130000000000004000000000000000E00900000000000000000000400038000700400014001200060000000400000040000000000000004000000000000000400000000000000088010000000000008801000000000000080000000000000001000000040000000000000000000000000000000000000000000000000000002D030000000000002D030000000000000010000000000000010000000500000030030000000000003013000000000000301300000000000031000000000000003100000000000000001000000000000001000000060000007003000000000000702300000000000070230000000000003801000000000000380100000000000000100000000000000200000006000000E803000000000000E823000000000000E823000000000000C000000000000000C000000000000000080000000000000052E57464040000007003000000000000702300000000000070230000000000003801000000000000900C000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120006005013000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000782300000000000008000000000000001003000000000000882300000000000008000000000000003013000000000000902300000000000008000000000000001003000000000000982300000000000008000000000000002C03000000000000A02300000000000008000000000000007023000000000000C02300000000000008000000000000008823000000000000C82300000000000008000000000000002803000000000000D02300000000000008000000000000009023000000000000D8230000000000000800000000000000982300000000000073696D706C655F6D756C5F64697370617463685F300000000000000000000000554889E5488B4620488B08488B5008488B40100F28010F59020F290031C05DC331C083FF02488D0D44100000480F44C1C3000000000000000000000000000000020000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000000000000
0000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000D80000000000000009000000000000001800000000000000F9FFFF6F0000000009000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E10171B0EB44219110112060000022E006E0E030E3F19200B0000032E0111011206401831130000041D00311311011206580B590B570B000000590000000400000000000801000000000200070000000000000005000000301300000000000020000000020700000007000000010330130000000000002000000001562A000000042A0000003F130000000000001100000002010100006D6C6972002F0073696D706C655F6D756C5F64697370617463685F3000280000000200000000005D0000002A00000073696D706C655F6D756C5F64697370617463685F3000000000000E0000000200000000005D0000000000000000000000000014000000FFFFFFFF040008000178100C0708900100000000240000000000000030130000000000002000000000000000410E108602430D065B0C070800000000140000000000000050130000000000001100000000000000140000000000000000000000000000006F0000000000000014000000000000000000000000000000D300000000000000140000000000000000000000000000006F0000000000000014000000000000000000000000000000D300000000000000140000000000000000000000000000000900000000000000140000000000000000000000000000002800000000000000140000000000000000000000000000001E00000000000000140000000000000000000000000000002800000000000000140000000000000000000000000000006F00000000000000140000000000000000000000000000000301000000000000140000000000000000000000000000000300000000000000140000000000000000000000000000000300000000000000140000000000000000000000000000000300000000000000140000000000000000000000000000000300000000000000140000000000000000000000000000000300000000000000140000000000000000000000000000000300000000000000140000000000000000000000000000000300000000000000140000000000000000000000000000006F
0000000000000014000000000000000000000000000000B8000000000000001400000000000000000000000000000052010000000000001400000000000000000000000000000042030000000000009E00000004007F000000010101FB0E0D0001010101000000010000012F7573722F6C6F63616C2F676F6F676C652F686F6D652F62656E76616E696B2F7372632F6972656500003C756E6B6E6F776E3E0001000072756E74696D652F7372632F697265652F72756E74696D652F74657374646174612F73696D706C655F6D756C2E6D6C69720001000000000902301300000000000011040205010A4B0508AD080001014952454500000000000000000000000000000000000000000000000000002300000000020800E8230000000000000000000000000000010000001200060050130000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E74657874002E646174612E72656C2E726F002E64796E616D6963002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6672616D65002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D49430000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F0000000300000002000000000000001002000000000000100200000000000023000000000000000000000000000000010000000000000000000000000000001700000004000000020000000000000038020000000000003802000000000000D80000000000000001000000000000000800000000000000180000000000000021000000010000000200000000000000100300000000000010030000000000001D0000000000000000000000000000000800000000000000000000000000000029000000010000000600000000000000301300000000000030030000000000003100000000000000000000000000000010
0000000000000000000000000000002F0000000100000003000000000000007023000000000000700300000000000078000000000000000000000000000000100000000000000000000000000000003C000000060000000300000000000000E823000000000000E803000000000000C000000000000000030000000000000008000000000000001000000000000000450000000100000000000000000000000000000000000000A8040000000000004200000000000000000000000000000001000000000000000000000000000000530000000100000000000000000000000000000000000000EA040000000000005D000000000000000000000000000000010000000000000000000000000000005F000000010000003000000000000000000000000000000047050000000000001D000000000000000000000000000000010000000000000001000000000000006A000000010000000000000000000000000000000000000064050000000000002C000000000000000000000000000000010000000000000000000000000000007A0000000100000000000000000000000000000000000000900500000000000012000000000000000000000000000000010000000000000000000000000000008A0000000100000000000000000000000000000000000000A8050000000000005002000000000000000000000000000008000000000000000000000000000000970000000100000000000000000000000000000000000000F807000000000000A200000000000000000000000000000001000000000000000000000000000000A300000001000000300000000000000000000000000000009A080000000000000500000000000000000000000000000001000000000000000100000000000000AC0000000200000000000000000000000000000000000000A0080000000000004800000000000000130000000200000008000000000000001800000000000000B40000000300000000000000000000000000000000000000E808000000000000C600000000000000000000000000000001000000000000000000000000000000BE0000000300000000000000000000000000000000000000AE090000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3808xi8>, format = "embedded-elf-x86_64", mime_type = "application/x-elf"} | |
} | |
func.func @simple_mul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} { | |
%c2 = arith.constant 2 : index | |
%c0 = arith.constant 0 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c1 = arith.constant 1 : index | |
%c553648160_i32 = arith.constant 553648160 : i32 | |
%c1_i32 = arith.constant 1 : i32 | |
%_device_query_0 = util.global.load @_device_query_0 : i1 | |
%_executable_layout_0 = util.global.load @_executable_layout_0 : !hal.executable_layout | |
%_executable_simple_mul_dispatch_0 = util.global.load @_executable_simple_mul_dispatch_0 : !hal.executable | |
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer | |
%device = hal.ex.shared_device : !hal.device | |
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator | |
hal.buffer.assert<%buffer : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") | |
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("tensor") shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) | |
%buffer_0 = hal.buffer_view.buffer<%arg1 : !hal.buffer_view> : !hal.buffer | |
hal.buffer.assert<%buffer_0 : !hal.buffer> message("tensor") allocator(%allocator : !hal.allocator) minimum_length(%c16) type(DeviceVisible) usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") | |
%buffer_1 = hal.allocator.allocate<%allocator : !hal.allocator> type("HostVisible|DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|MappingScoped|MappingAccessRandom|Mapping") : !hal.buffer{%c16} | |
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories("Transfer|Dispatch") : !hal.command_buffer | |
hal.command_buffer.begin<%cmd : !hal.command_buffer> | |
cf.cond_br %_device_query_0, ^bb1, ^bb2 | |
^bb1: // pred: ^bb0 | |
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%_executable_layout_0 : !hal.executable_layout)[%c0] bindings([ | |
%c0 = (%buffer : !hal.buffer)[%c0, %c16], | |
%c1 = (%buffer_0 : !hal.buffer)[%c0, %c16], | |
%c2 = (%buffer_1 : !hal.buffer)[%c0, %c16] | |
]) | |
hal.command_buffer.dispatch<%cmd : !hal.command_buffer> target(%_executable_simple_mul_dispatch_0 : !hal.executable)[0] workgroups([%c1, %c1, %c1]) | |
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None") | |
hal.command_buffer.end<%cmd : !hal.command_buffer> | |
hal.ex.submit_and_wait %device, %cmd | |
%view = hal.buffer_view.create buffer(%buffer_1 : !hal.buffer) shape([%c4]) type(%c553648160_i32) encoding(%c1_i32) : !hal.buffer_view | |
return %view : !hal.buffer_view | |
^bb2: // pred: ^bb0 | |
util.unreachable "device not supported in the compiled configuration" | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::VM::ConversionPass //----- // | |
#device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]}> | |
module attributes {hal.device.targets = [#device_target_cpu], vm.toplevel} { | |
vm.module public @module { | |
vm.global.i32 private @_device_query_0 : i32 | |
vm.global.ref private @_executable_layout_0 : !vm.ref<!hal.executable_layout> | |
vm.global.ref private @_executable_simple_mul_dispatch_0 : !vm.ref<!hal.executable> | |
vm.rodata private @simple_mul_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C4602010100000000000000000003003E000100000030130000000000004000000000000000E00900000000000000000000400038000700400014001200060000000400000040000000000000004000000000000000400000000000000088010000000000008801000000000000080000000000000001000000040000000000000000000000000000000000000000000000000000002D030000000000002D030000000000000010000000000000010000000500000030030000000000003013000000000000301300000000000031000000000000003100000000000000001000000000000001000000060000007003000000000000702300000000000070230000000000003801000000000000380100000000000000100000000000000200000006000000E803000000000000E823000000000000E823000000000000C000000000000000C000000000000000080000000000000052E57464040000007003000000000000702300000000000070230000000000003801000000000000900C000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120006005013000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000782300000000000008000000000000001003000000000000882300000000000008000000000000003013000000000000902300000000000008000000000000001003000000000000982300000000000008000000000000002C03000000000000A02300000000000008000000000000007023000000000000C02300000000000008000000000000008823000000000000C82300000000000008000000000000002803000000000000D02300000000000008000000000000009023000000000000D8230000000000000800000000000000982300000000000073696D706C655F6D756C5F64697370617463685F300000000000000000000000554889E5488B4620488B08488B5008488B40100F28010F59020F290031C05DC331C083FF02488D0D44100000480F44C1C3000000000000000000000000000000020000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
0000000000000000000000000000000100000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000D80000000000000009000000000000001800000000000000F9FFFF6F0000000009000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E10171B0EB44219110112060000022E006E0E030E3F19200B0000032E0111011206401831130000041D00311311011206580B590B570B000000590000000400000000000801000000000200070000000000000005000000301300000000000020000000020700000007000000010330130000000000002000000001562A000000042A0000003F130000000000001100000002010100006D6C6972002F0073696D706C655F6D756C5F64697370617463685F3000280000000200000000005D0000002A00000073696D706C655F6D756C5F64697370617463685F3000000000000E0000000200000000005D0000000000000000000000000014000000FFFFFFFF040008000178100C0708900100000000240000000000000030130000000000002000000000000000410E108602430D065B0C070800000000140000000000000050130000000000001100000000000000140000000000000000000000000000006F0000000000000014000000000000000000000000000000D300000000000000140000000000000000000000000000006F0000000000000014000000000000000000000000000000D300000000000000140000000000000000000000000000000900000000000000140000000000000000000000000000002800000000000000140000000000000000000000000000001E00000000000000140000000000000000000000000000002800000000000000140000000000000000000000000000006F000000000000001400000000000000000000000000000003010000000000001400000000000000000000000000000003000000000000001400000000000000000000000000000003000000000000001400000000000000000000000000000003000000000000001400000000000000000000000000000003000000000000001400000000000000000000000000000003000000000000001400000000000000000000000000000003000000000000001400000000000000000000000000000003
00000000000000140000000000000000000000000000006F0000000000000014000000000000000000000000000000B8000000000000001400000000000000000000000000000052010000000000001400000000000000000000000000000042030000000000009E00000004007F000000010101FB0E0D0001010101000000010000012F7573722F6C6F63616C2F676F6F676C652F686F6D652F62656E76616E696B2F7372632F6972656500003C756E6B6E6F776E3E0001000072756E74696D652F7372632F697265652F72756E74696D652F74657374646174612F73696D706C655F6D756C2E6D6C69720001000000000902301300000000000011040205010A4B0508AD080001014952454500000000000000000000000000000000000000000000000000002300000000020800E8230000000000000000000000000000010000001200060050130000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E74657874002E646174612E72656C2E726F002E64796E616D6963002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6672616D65002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D49430000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F0000000300000002000000000000001002000000000000100200000000000023000000000000000000000000000000010000000000000000000000000000001700000004000000020000000000000038020000000000003802000000000000D80000000000000001000000000000000800000000000000180000000000000021000000010000000200000000000000100300000000000010030000000000001D0000000000000000000000000000000800000000000000000000000000000029000000010000000600000000000000301300000000000030
0300000000000031000000000000000000000000000000100000000000000000000000000000002F0000000100000003000000000000007023000000000000700300000000000078000000000000000000000000000000100000000000000000000000000000003C000000060000000300000000000000E823000000000000E803000000000000C000000000000000030000000000000008000000000000001000000000000000450000000100000000000000000000000000000000000000A8040000000000004200000000000000000000000000000001000000000000000000000000000000530000000100000000000000000000000000000000000000EA040000000000005D000000000000000000000000000000010000000000000000000000000000005F000000010000003000000000000000000000000000000047050000000000001D000000000000000000000000000000010000000000000001000000000000006A000000010000000000000000000000000000000000000064050000000000002C000000000000000000000000000000010000000000000000000000000000007A0000000100000000000000000000000000000000000000900500000000000012000000000000000000000000000000010000000000000000000000000000008A0000000100000000000000000000000000000000000000A8050000000000005002000000000000000000000000000008000000000000000000000000000000970000000100000000000000000000000000000000000000F807000000000000A200000000000000000000000000000001000000000000000000000000000000A300000001000000300000000000000000000000000000009A080000000000000500000000000000000000000000000001000000000000000100000000000000AC0000000200000000000000000000000000000000000000A0080000000000004800000000000000130000000200000008000000000000001800000000000000B40000000300000000000000000000000000000000000000E808000000000000C600000000000000000000000000000001000000000000000000000000000000BE0000000300000000000000000000000000000000000000AE090000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3808xi8> | |
vm.initializer { | |
%ref = vm.call @hal.ex.shared_device() {nosideeffects} : () -> !vm.ref<!hal.device> | |
%buffer = vm.rodata.inline "_utf8_hal_executable_format_EAB228F999C2D3A1" {alignment = 1 : i64} : !vm.buffer = dense<[104, 97, 108, 46, 101, 120, 101, 99, 117, 116, 97, 98, 108, 101, 46, 102, 111, 114, 109, 97, 116]> : vector<21xi8> | |
%buffer_0 = vm.rodata.inline "_utf8_embedded_elf_x86_64_9FD8733DA4A6F228" {alignment = 1 : i64} : !vm.buffer = dense<[101, 109, 98, 101, 100, 100, 101, 100, 45, 101, 108, 102, 45, 120, 56, 54, 95, 54, 52]> : vector<19xi8> | |
%0:2 = vm.call @hal.device.query.i32(%ref, %buffer, %buffer_0) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i32) | |
%c1 = vm.const.i32 1 | |
%1 = vm.and.i32 %0#1, %c1 : i32 | |
%zero = vm.const.i32.zero | |
%2 = vm.select.i32 %0#0, %1, %zero : i32 | |
%c1_1 = vm.const.i32 1 | |
%c1_2 = vm.const.i32 1 | |
%zero_3 = vm.const.i32.zero | |
%c7 = vm.const.i32 7 | |
%c1_4 = vm.const.i32 1 | |
%c7_5 = vm.const.i32 7 | |
%c2 = vm.const.i32 2 | |
%c7_6 = vm.const.i32 7 | |
%ref_7 = vm.call.variadic @hal.descriptor_set_layout.create(%ref, %c1_2, [(%zero_3, %c7), (%c1_4, %c7_5), (%c2, %c7_6)]) {nosideeffects} : (!vm.ref<!hal.device>, i32, tuple<i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout> | |
%zero_8 = vm.const.i32.zero | |
%ref_9 = vm.call.variadic @hal.executable_layout.create(%ref, %zero_8, [%ref_7]) {nosideeffects} : (!vm.ref<!hal.device>, i32, !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.executable_layout> | |
vm.global.store.i32 %2, @_device_query_0 : i32 | |
vm.global.store.ref %ref_9, @_executable_layout_0 : !vm.ref<!hal.executable_layout> | |
vm.cond_br %2, ^bb1, ^bb2 | |
^bb1: // pred: ^bb0 | |
%_executable_layout_0 = vm.global.load.ref @_executable_layout_0 : !vm.ref<!hal.executable_layout> | |
%buffer_10 = vm.rodata.inline "_utf8_embedded_elf_x86_64_9FD8733DA4A6F228" {alignment = 1 : i64} : !vm.buffer = dense<[101, 109, 98, 101, 100, 100, 101, 100, 45, 101, 108, 102, 45, 120, 56, 54, 95, 54, 52]> : vector<19xi8> | |
%simple_mul_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @simple_mul_dispatch_0_embedded_elf_x86_64 : !vm.buffer | |
%null = vm.const.ref.zero : !vm.buffer | |
%ref_11 = vm.call.variadic @hal.executable.create(%ref, %buffer_10, %simple_mul_dispatch_0_embedded_elf_x86_64, %null, [%_executable_layout_0]) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref<!hal.executable_layout> ...) -> !vm.ref<!hal.executable> | |
vm.br ^bb3(%ref_11 : !vm.ref<!hal.executable>) | |
^bb2: // pred: ^bb0 | |
%null_12 = vm.const.ref.zero : !vm.ref<!hal.executable> | |
vm.br ^bb3(%null_12 : !vm.ref<!hal.executable>) | |
^bb3(%3: !vm.ref<!hal.executable>): // 2 preds: ^bb1, ^bb2 | |
vm.global.store.ref %3, @_executable_simple_mul_dispatch_0 : !vm.ref<!hal.executable> | |
vm.return | |
} | |
vm.import @hal.ex.shared_device() -> !vm.ref<!hal.device> attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.ex.submit_and_wait(%device : !vm.ref<!hal.device>, %command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"} | |
vm.import @hal.allocator.allocate(%allocator : !vm.ref<!hal.allocator>, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer> attributes {sym_visibility = "private"} | |
vm.import @hal.allocator.map.byte_buffer(%allocator : !vm.ref<!hal.allocator>, %try : i32, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref<!hal.buffer> attributes {sym_visibility = "private"} | |
vm.import @hal.allocator.wrap.byte_buffer(%allocator : !vm.ref<!hal.allocator>, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref<!hal.buffer> attributes {sym_visibility = "private"} | |
vm.import @hal.buffer.assert(%buffer : !vm.ref<!hal.buffer>, %message : !vm.buffer, %allocator : !vm.ref<!hal.allocator>, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) attributes {sym_visibility = "private"} | |
vm.import @hal.buffer.subspan(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i64) -> !vm.ref<!hal.buffer> attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.buffer.length(%buffer : !vm.ref<!hal.buffer>) -> i64 attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.buffer.load(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i32) -> i32 attributes {sym_visibility = "private"} | |
vm.import @hal.buffer.store(%value : i32, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i32) attributes {sym_visibility = "private"} | |
vm.import @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.buffer_view.assert(%buffer_view : !vm.ref<!hal.buffer_view>, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) attributes {sym_visibility = "private"} | |
vm.import @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.buffer_view.byte_length(%buffer_view : !vm.ref<!hal.buffer_view>) -> i64 attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.buffer_view.element_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.buffer_view.encoding_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.buffer_view.rank(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.buffer_view.dim(%buffer_view : !vm.ref<!hal.buffer_view>, %index : i32) -> i64 attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.buffer_view.trace(%key : !vm.buffer, %operands : !vm.ref<!hal.buffer_view> ...) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32) -> !vm.ref<!hal.command_buffer> attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.begin(%command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.end(%command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.begin_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>, %label : !vm.buffer) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.end_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.fill_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %pattern : i32, %pattern_length : i32) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.copy_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.push_constants(%command_buffer : !vm.ref<!hal.command_buffer>, %executable_layout : !vm.ref<!hal.executable_layout>, %offset : i32, %values : i32 ...) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %executable_layout : !vm.ref<!hal.executable_layout>, %set : i32, %bindings : tuple<i32, !vm.ref<!hal.buffer>, i64, i64> ...) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.bind_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %executable_layout : !vm.ref<!hal.executable_layout>, %set : i32, %descriptor_set : !vm.ref<!hal.descriptor_set>, %dynamic_offsets : i64 ...) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.dispatch.indirect(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroups_buffer : !vm.ref<!hal.buffer>, %workgroups_offset : i64) attributes {sym_visibility = "private"} | |
vm.import @hal.descriptor_set.create(%device : !vm.ref<!hal.device>, %set_layout : !vm.ref<!hal.descriptor_set_layout>, %bindings : tuple<i32, !vm.ref<!hal.buffer>, i64, i64> ...) -> !vm.ref<!hal.descriptor_set> attributes {sym_visibility = "private"} | |
vm.import @hal.descriptor_set_layout.create(%device : !vm.ref<!hal.device>, %usage_type : i32, %bindings : tuple<i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout> attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.device.query.i32(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i32) attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.executable.create(%device : !vm.ref<!hal.device>, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %executable_layouts : !vm.ref<!hal.executable_layout> ...) -> !vm.ref<!hal.executable> attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.executable_layout.create(%device : !vm.ref<!hal.device>, %push_constants : i32, %set_layouts : !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.executable_layout> attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.semaphore.create(%device : !vm.ref<!hal.device>, %initial_value : i64) -> !vm.ref<!hal.semaphore> attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.semaphore.query(%semaphore : !vm.ref<!hal.semaphore>) -> (i32, i64) attributes {sym_visibility = "private"} | |
vm.import @hal.semaphore.signal(%semaphore : !vm.ref<!hal.semaphore>, %new_value : i64) attributes {sym_visibility = "private"} | |
vm.import @hal.semaphore.fail(%semaphore : !vm.ref<!hal.semaphore>, %status : i32) attributes {sym_visibility = "private"} | |
vm.import @hal.semaphore.await(%semaphore : !vm.ref<!hal.semaphore>, %min_value : i64) -> i32 attributes {sym_visibility = "private"} | |
vm.func private @simple_mul(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer_view> { | |
%c2 = vm.const.i32 2 | |
%zero = vm.const.i32.zero | |
%c16 = vm.const.i32 16 | |
%c4 = vm.const.i32 4 | |
%c1 = vm.const.i32 1 | |
%c553648160 = vm.const.i32 553648160 | |
%c1_0 = vm.const.i32 1 | |
%_device_query_0 = vm.global.load.i32 @_device_query_0 : i32 | |
%_executable_layout_0 = vm.global.load.ref @_executable_layout_0 : !vm.ref<!hal.executable_layout> | |
%_executable_simple_mul_dispatch_0 = vm.global.load.ref @_executable_simple_mul_dispatch_0 : !vm.ref<!hal.executable> | |
%buffer = vm.rodata.inline "_utf8_tensor_3C6209B4FD120BDC" {alignment = 1 : i64} : !vm.buffer = dense<[116, 101, 110, 115, 111, 114]> : vector<6xi8> | |
%c4_1 = vm.const.i64 4 | |
vm.call.variadic @hal.buffer_view.assert(%arg0, %buffer, %c553648160, %c1_0, [%c4_1]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...) | |
%ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> | |
%ref_2 = vm.call @hal.ex.shared_device() {nosideeffects} : () -> !vm.ref<!hal.device> | |
%ref_3 = vm.call @hal.device.allocator(%ref_2) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> | |
%buffer_4 = vm.rodata.inline "_utf8_tensor_3C6209B4FD120BDC" {alignment = 1 : i64} : !vm.buffer = dense<[116, 101, 110, 115, 111, 114]> : vector<6xi8> | |
%c16_5 = vm.const.i64 16 | |
%c16_6 = vm.const.i32 16 | |
%c3075 = vm.const.i32 3075 | |
vm.call @hal.buffer.assert(%ref, %buffer_4, %ref_3, %c16_5, %c16_6, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> () | |
%buffer_7 = vm.rodata.inline "_utf8_tensor_3C6209B4FD120BDC" {alignment = 1 : i64} : !vm.buffer = dense<[116, 101, 110, 115, 111, 114]> : vector<6xi8> | |
%c4_8 = vm.const.i64 4 | |
vm.call.variadic @hal.buffer_view.assert(%arg1, %buffer_7, %c553648160, %c1_0, [%c4_8]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...) | |
%ref_9 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> | |
%buffer_10 = vm.rodata.inline "_utf8_tensor_3C6209B4FD120BDC" {alignment = 1 : i64} : !vm.buffer = dense<[116, 101, 110, 115, 111, 114]> : vector<6xi8> | |
%c16_11 = vm.const.i64 16 | |
%c16_12 = vm.const.i32 16 | |
%c3075_13 = vm.const.i32 3075 | |
vm.call @hal.buffer.assert(%ref_9, %buffer_10, %ref_3, %c16_11, %c16_12, %c3075_13) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> () | |
%c50 = vm.const.i32 50 | |
%c150998019 = vm.const.i32 150998019 | |
%c16_14 = vm.const.i64 16 | |
%ref_15 = vm.call @hal.allocator.allocate(%ref_3, %c50, %c150998019, %c16_14) : (!vm.ref<!hal.allocator>, i32, i32, i64) -> !vm.ref<!hal.buffer> | |
%c17 = vm.const.i32 17 | |
%c3 = vm.const.i32 3 | |
%ref_16 = vm.call @hal.command_buffer.create(%ref_2, %c17, %c3) : (!vm.ref<!hal.device>, i32, i32) -> !vm.ref<!hal.command_buffer> | |
vm.call @hal.command_buffer.begin(%ref_16) : (!vm.ref<!hal.command_buffer>) -> () | |
vm.cond_br %_device_query_0, ^bb1, ^bb2 | |
^bb1: // pred: ^bb0 | |
%zero_17 = vm.const.i64.zero | |
%c16_18 = vm.const.i64 16 | |
%zero_19 = vm.const.i64.zero | |
%c16_20 = vm.const.i64 16 | |
%zero_21 = vm.const.i64.zero | |
%c16_22 = vm.const.i64 16 | |
vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_16, %_executable_layout_0, %zero, [(%zero, %ref, %zero_17, %c16_18), (%c1, %ref_9, %zero_19, %c16_20), (%c2, %ref_15, %zero_21, %c16_22)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable_layout>, i32, tuple<i32, !vm.ref<!hal.buffer>, i64, i64> ...) | |
%zero_23 = vm.const.i32.zero | |
vm.call @hal.command_buffer.dispatch(%ref_16, %_executable_simple_mul_dispatch_0, %zero_23, %c1, %c1, %c1) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32) -> () | |
%c28 = vm.const.i32 28 | |
%c13 = vm.const.i32 13 | |
%zero_24 = vm.const.i32.zero | |
vm.call @hal.command_buffer.execution_barrier(%ref_16, %c28, %c13, %zero_24) : (!vm.ref<!hal.command_buffer>, i32, i32, i32) -> () | |
vm.call @hal.command_buffer.end(%ref_16) : (!vm.ref<!hal.command_buffer>) -> () | |
vm.call @hal.ex.submit_and_wait(%ref_2, %ref_16) : (!vm.ref<!hal.device>, !vm.ref<!hal.command_buffer>) -> () | |
%c4_25 = vm.const.i64 4 | |
%ref_26 = vm.call.variadic @hal.buffer_view.create(%ref_15, %c553648160, %c1_0, [%c4_25]) {nosideeffects} : (!vm.ref<!hal.buffer>, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view> | |
vm.return %ref_26 : !vm.ref<!hal.buffer_view> | |
^bb2: // pred: ^bb0 | |
%c2_27 = vm.const.i32 2 | |
vm.fail %c2_27, "device not supported in the compiled configuration" | |
} | |
vm.export @simple_mul | |
} | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::VM::HoistInlinedRodataPass //----- // | |
vm.module public @module { | |
vm.global.i32 private @_device_query_0 : i32 | |
vm.global.ref private @_executable_layout_0 : !vm.ref<!hal.executable_layout> | |
vm.global.ref private @_executable_simple_mul_dispatch_0 : !vm.ref<!hal.executable> | |
vm.rodata private @simple_mul_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C4602010100000000000000000003003E000100000030130000000000004000000000000000E00900000000000000000000400038000700400014001200060000000400000040000000000000004000000000000000400000000000000088010000000000008801000000000000080000000000000001000000040000000000000000000000000000000000000000000000000000002D030000000000002D030000000000000010000000000000010000000500000030030000000000003013000000000000301300000000000031000000000000003100000000000000001000000000000001000000060000007003000000000000702300000000000070230000000000003801000000000000380100000000000000100000000000000200000006000000E803000000000000E823000000000000E823000000000000C000000000000000C000000000000000080000000000000052E57464040000007003000000000000702300000000000070230000000000003801000000000000900C000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120006005013000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000782300000000000008000000000000001003000000000000882300000000000008000000000000003013000000000000902300000000000008000000000000001003000000000000982300000000000008000000000000002C03000000000000A02300000000000008000000000000007023000000000000C02300000000000008000000000000008823000000000000C82300000000000008000000000000002803000000000000D02300000000000008000000000000009023000000000000D8230000000000000800000000000000982300000000000073696D706C655F6D756C5F64697370617463685F300000000000000000000000554889E5488B4620488B08488B5008488B40100F28010F59020F290031C05DC331C083FF02488D0D44100000480F44C1C3000000000000000000000000000000020000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
0000000000000000000000000000000100000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000D80000000000000009000000000000001800000000000000F9FFFF6F0000000009000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E10171B0EB44219110112060000022E006E0E030E3F19200B0000032E0111011206401831130000041D00311311011206580B590B570B000000590000000400000000000801000000000200070000000000000005000000301300000000000020000000020700000007000000010330130000000000002000000001562A000000042A0000003F130000000000001100000002010100006D6C6972002F0073696D706C655F6D756C5F64697370617463685F3000280000000200000000005D0000002A00000073696D706C655F6D756C5F64697370617463685F3000000000000E0000000200000000005D0000000000000000000000000014000000FFFFFFFF040008000178100C0708900100000000240000000000000030130000000000002000000000000000410E108602430D065B0C070800000000140000000000000050130000000000001100000000000000140000000000000000000000000000006F0000000000000014000000000000000000000000000000D300000000000000140000000000000000000000000000006F0000000000000014000000000000000000000000000000D300000000000000140000000000000000000000000000000900000000000000140000000000000000000000000000002800000000000000140000000000000000000000000000001E00000000000000140000000000000000000000000000002800000000000000140000000000000000000000000000006F000000000000001400000000000000000000000000000003010000000000001400000000000000000000000000000003000000000000001400000000000000000000000000000003000000000000001400000000000000000000000000000003000000000000001400000000000000000000000000000003000000000000001400000000000000000000000000000003000000000000001400000000000000000000000000000003000000000000001400000000000000000000000000000003
00000000000000140000000000000000000000000000006F0000000000000014000000000000000000000000000000B8000000000000001400000000000000000000000000000052010000000000001400000000000000000000000000000042030000000000009E00000004007F000000010101FB0E0D0001010101000000010000012F7573722F6C6F63616C2F676F6F676C652F686F6D652F62656E76616E696B2F7372632F6972656500003C756E6B6E6F776E3E0001000072756E74696D652F7372632F697265652F72756E74696D652F74657374646174612F73696D706C655F6D756C2E6D6C69720001000000000902301300000000000011040205010A4B0508AD080001014952454500000000000000000000000000000000000000000000000000002300000000020800E8230000000000000000000000000000010000001200060050130000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E74657874002E646174612E72656C2E726F002E64796E616D6963002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6672616D65002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D49430000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F0000000300000002000000000000001002000000000000100200000000000023000000000000000000000000000000010000000000000000000000000000001700000004000000020000000000000038020000000000003802000000000000D80000000000000001000000000000000800000000000000180000000000000021000000010000000200000000000000100300000000000010030000000000001D0000000000000000000000000000000800000000000000000000000000000029000000010000000600000000000000301300000000000030
0300000000000031000000000000000000000000000000100000000000000000000000000000002F0000000100000003000000000000007023000000000000700300000000000078000000000000000000000000000000100000000000000000000000000000003C000000060000000300000000000000E823000000000000E803000000000000C000000000000000030000000000000008000000000000001000000000000000450000000100000000000000000000000000000000000000A8040000000000004200000000000000000000000000000001000000000000000000000000000000530000000100000000000000000000000000000000000000EA040000000000005D000000000000000000000000000000010000000000000000000000000000005F000000010000003000000000000000000000000000000047050000000000001D000000000000000000000000000000010000000000000001000000000000006A000000010000000000000000000000000000000000000064050000000000002C000000000000000000000000000000010000000000000000000000000000007A0000000100000000000000000000000000000000000000900500000000000012000000000000000000000000000000010000000000000000000000000000008A0000000100000000000000000000000000000000000000A8050000000000005002000000000000000000000000000008000000000000000000000000000000970000000100000000000000000000000000000000000000F807000000000000A200000000000000000000000000000001000000000000000000000000000000A300000001000000300000000000000000000000000000009A080000000000000500000000000000000000000000000001000000000000000100000000000000AC0000000200000000000000000000000000000000000000A0080000000000004800000000000000130000000200000008000000000000001800000000000000B40000000300000000000000000000000000000000000000E808000000000000C600000000000000000000000000000001000000000000000000000000000000BE0000000300000000000000000000000000000000000000AE090000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3808xi8> | |
vm.rodata private @_utf8_hal_executable_format_EAB228F999C2D3A1 {alignment = 1 : i64} dense<[104, 97, 108, 46, 101, 120, 101, 99, 117, 116, 97, 98, 108, 101, 46, 102, 111, 114, 109, 97, 116]> : vector<21xi8> | |
vm.rodata private @_utf8_embedded_elf_x86_64_9FD8733DA4A6F228 {alignment = 1 : i64} dense<[101, 109, 98, 101, 100, 100, 101, 100, 45, 101, 108, 102, 45, 120, 56, 54, 95, 54, 52]> : vector<19xi8> | |
vm.rodata private @_utf8_embedded_elf_x86_64_9FD8733DA4A6F228_0 {alignment = 1 : i64} dense<[101, 109, 98, 101, 100, 100, 101, 100, 45, 101, 108, 102, 45, 120, 56, 54, 95, 54, 52]> : vector<19xi8> | |
vm.initializer { | |
%ref = vm.call @hal.ex.shared_device() {nosideeffects} : () -> !vm.ref<!hal.device> | |
%_utf8_hal_executable_format_EAB228F999C2D3A1 = vm.const.ref.rodata @_utf8_hal_executable_format_EAB228F999C2D3A1 : !vm.buffer | |
%_utf8_embedded_elf_x86_64_9FD8733DA4A6F228 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_9FD8733DA4A6F228 : !vm.buffer | |
%0:2 = vm.call @hal.device.query.i32(%ref, %_utf8_hal_executable_format_EAB228F999C2D3A1, %_utf8_embedded_elf_x86_64_9FD8733DA4A6F228) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i32) | |
%c1 = vm.const.i32 1 | |
%1 = vm.and.i32 %0#1, %c1 : i32 | |
%zero = vm.const.i32.zero | |
%2 = vm.select.i32 %0#0, %1, %zero : i32 | |
%c1_0 = vm.const.i32 1 | |
%c1_1 = vm.const.i32 1 | |
%zero_2 = vm.const.i32.zero | |
%c7 = vm.const.i32 7 | |
%c1_3 = vm.const.i32 1 | |
%c7_4 = vm.const.i32 7 | |
%c2 = vm.const.i32 2 | |
%c7_5 = vm.const.i32 7 | |
%ref_6 = vm.call.variadic @hal.descriptor_set_layout.create(%ref, %c1_1, [(%zero_2, %c7), (%c1_3, %c7_4), (%c2, %c7_5)]) {nosideeffects} : (!vm.ref<!hal.device>, i32, tuple<i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout> | |
%zero_7 = vm.const.i32.zero | |
%ref_8 = vm.call.variadic @hal.executable_layout.create(%ref, %zero_7, [%ref_6]) {nosideeffects} : (!vm.ref<!hal.device>, i32, !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.executable_layout> | |
vm.global.store.i32 %2, @_device_query_0 : i32 | |
vm.global.store.ref %ref_8, @_executable_layout_0 : !vm.ref<!hal.executable_layout> | |
vm.cond_br %2, ^bb1, ^bb2 | |
^bb1: // pred: ^bb0 | |
%_executable_layout_0 = vm.global.load.ref @_executable_layout_0 : !vm.ref<!hal.executable_layout> | |
%_utf8_embedded_elf_x86_64_9FD8733DA4A6F228_0 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_9FD8733DA4A6F228_0 : !vm.buffer | |
%simple_mul_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @simple_mul_dispatch_0_embedded_elf_x86_64 : !vm.buffer | |
%null = vm.const.ref.zero : !vm.buffer | |
%ref_9 = vm.call.variadic @hal.executable.create(%ref, %_utf8_embedded_elf_x86_64_9FD8733DA4A6F228_0, %simple_mul_dispatch_0_embedded_elf_x86_64, %null, [%_executable_layout_0]) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref<!hal.executable_layout> ...) -> !vm.ref<!hal.executable> | |
vm.br ^bb3(%ref_9 : !vm.ref<!hal.executable>) | |
^bb2: // pred: ^bb0 | |
%null_10 = vm.const.ref.zero : !vm.ref<!hal.executable> | |
vm.br ^bb3(%null_10 : !vm.ref<!hal.executable>) | |
^bb3(%3: !vm.ref<!hal.executable>): // 2 preds: ^bb1, ^bb2 | |
vm.global.store.ref %3, @_executable_simple_mul_dispatch_0 : !vm.ref<!hal.executable> | |
vm.return | |
} | |
vm.import @hal.ex.shared_device() -> !vm.ref<!hal.device> attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.ex.submit_and_wait(%device : !vm.ref<!hal.device>, %command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"} | |
vm.import @hal.allocator.allocate(%allocator : !vm.ref<!hal.allocator>, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer> attributes {sym_visibility = "private"} | |
vm.import @hal.allocator.map.byte_buffer(%allocator : !vm.ref<!hal.allocator>, %try : i32, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref<!hal.buffer> attributes {sym_visibility = "private"} | |
vm.import @hal.allocator.wrap.byte_buffer(%allocator : !vm.ref<!hal.allocator>, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref<!hal.buffer> attributes {sym_visibility = "private"} | |
vm.import @hal.buffer.assert(%buffer : !vm.ref<!hal.buffer>, %message : !vm.buffer, %allocator : !vm.ref<!hal.allocator>, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) attributes {sym_visibility = "private"} | |
vm.import @hal.buffer.subspan(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i64) -> !vm.ref<!hal.buffer> attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.buffer.length(%buffer : !vm.ref<!hal.buffer>) -> i64 attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.buffer.load(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i32) -> i32 attributes {sym_visibility = "private"} | |
vm.import @hal.buffer.store(%value : i32, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i32) attributes {sym_visibility = "private"} | |
vm.import @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.buffer_view.assert(%buffer_view : !vm.ref<!hal.buffer_view>, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) attributes {sym_visibility = "private"} | |
vm.import @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.buffer_view.byte_length(%buffer_view : !vm.ref<!hal.buffer_view>) -> i64 attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.buffer_view.element_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.buffer_view.encoding_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.buffer_view.rank(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.buffer_view.dim(%buffer_view : !vm.ref<!hal.buffer_view>, %index : i32) -> i64 attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.buffer_view.trace(%key : !vm.buffer, %operands : !vm.ref<!hal.buffer_view> ...) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32) -> !vm.ref<!hal.command_buffer> attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.begin(%command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.end(%command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.begin_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>, %label : !vm.buffer) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.end_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.fill_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %pattern : i32, %pattern_length : i32) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.copy_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.push_constants(%command_buffer : !vm.ref<!hal.command_buffer>, %executable_layout : !vm.ref<!hal.executable_layout>, %offset : i32, %values : i32 ...) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %executable_layout : !vm.ref<!hal.executable_layout>, %set : i32, %bindings : tuple<i32, !vm.ref<!hal.buffer>, i64, i64> ...) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.bind_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %executable_layout : !vm.ref<!hal.executable_layout>, %set : i32, %descriptor_set : !vm.ref<!hal.descriptor_set>, %dynamic_offsets : i64 ...) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.dispatch.indirect(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroups_buffer : !vm.ref<!hal.buffer>, %workgroups_offset : i64) attributes {sym_visibility = "private"} | |
vm.import @hal.descriptor_set.create(%device : !vm.ref<!hal.device>, %set_layout : !vm.ref<!hal.descriptor_set_layout>, %bindings : tuple<i32, !vm.ref<!hal.buffer>, i64, i64> ...) -> !vm.ref<!hal.descriptor_set> attributes {sym_visibility = "private"} | |
vm.import @hal.descriptor_set_layout.create(%device : !vm.ref<!hal.device>, %usage_type : i32, %bindings : tuple<i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout> attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.device.query.i32(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i32) attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.executable.create(%device : !vm.ref<!hal.device>, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %executable_layouts : !vm.ref<!hal.executable_layout> ...) -> !vm.ref<!hal.executable> attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.executable_layout.create(%device : !vm.ref<!hal.device>, %push_constants : i32, %set_layouts : !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.executable_layout> attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.semaphore.create(%device : !vm.ref<!hal.device>, %initial_value : i64) -> !vm.ref<!hal.semaphore> attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.semaphore.query(%semaphore : !vm.ref<!hal.semaphore>) -> (i32, i64) attributes {sym_visibility = "private"} | |
vm.import @hal.semaphore.signal(%semaphore : !vm.ref<!hal.semaphore>, %new_value : i64) attributes {sym_visibility = "private"} | |
vm.import @hal.semaphore.fail(%semaphore : !vm.ref<!hal.semaphore>, %status : i32) attributes {sym_visibility = "private"} | |
vm.import @hal.semaphore.await(%semaphore : !vm.ref<!hal.semaphore>, %min_value : i64) -> i32 attributes {sym_visibility = "private"} | |
vm.rodata private @_utf8_tensor_3C6209B4FD120BDC {alignment = 1 : i64} dense<[116, 101, 110, 115, 111, 114]> : vector<6xi8> | |
vm.rodata private @_utf8_tensor_3C6209B4FD120BDC_1 {alignment = 1 : i64} dense<[116, 101, 110, 115, 111, 114]> : vector<6xi8> | |
vm.rodata private @_utf8_tensor_3C6209B4FD120BDC_2 {alignment = 1 : i64} dense<[116, 101, 110, 115, 111, 114]> : vector<6xi8> | |
vm.rodata private @_utf8_tensor_3C6209B4FD120BDC_3 {alignment = 1 : i64} dense<[116, 101, 110, 115, 111, 114]> : vector<6xi8> | |
vm.func private @simple_mul(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer_view> { | |
%c2 = vm.const.i32 2 | |
%zero = vm.const.i32.zero | |
%c16 = vm.const.i32 16 | |
%c4 = vm.const.i32 4 | |
%c1 = vm.const.i32 1 | |
%c553648160 = vm.const.i32 553648160 | |
%c1_0 = vm.const.i32 1 | |
%_device_query_0 = vm.global.load.i32 @_device_query_0 : i32 | |
%_executable_layout_0 = vm.global.load.ref @_executable_layout_0 : !vm.ref<!hal.executable_layout> | |
%_executable_simple_mul_dispatch_0 = vm.global.load.ref @_executable_simple_mul_dispatch_0 : !vm.ref<!hal.executable> | |
%_utf8_tensor_3C6209B4FD120BDC = vm.const.ref.rodata @_utf8_tensor_3C6209B4FD120BDC : !vm.buffer | |
%c4_1 = vm.const.i64 4 | |
vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_tensor_3C6209B4FD120BDC, %c553648160, %c1_0, [%c4_1]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...) | |
%ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> | |
%ref_2 = vm.call @hal.ex.shared_device() {nosideeffects} : () -> !vm.ref<!hal.device> | |
%ref_3 = vm.call @hal.device.allocator(%ref_2) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> | |
%_utf8_tensor_3C6209B4FD120BDC_1 = vm.const.ref.rodata @_utf8_tensor_3C6209B4FD120BDC_1 : !vm.buffer | |
%c16_4 = vm.const.i64 16 | |
%c16_5 = vm.const.i32 16 | |
%c3075 = vm.const.i32 3075 | |
vm.call @hal.buffer.assert(%ref, %_utf8_tensor_3C6209B4FD120BDC_1, %ref_3, %c16_4, %c16_5, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> () | |
%_utf8_tensor_3C6209B4FD120BDC_2 = vm.const.ref.rodata @_utf8_tensor_3C6209B4FD120BDC_2 : !vm.buffer | |
%c4_6 = vm.const.i64 4 | |
vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_tensor_3C6209B4FD120BDC_2, %c553648160, %c1_0, [%c4_6]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...) | |
%ref_7 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> | |
%_utf8_tensor_3C6209B4FD120BDC_3 = vm.const.ref.rodata @_utf8_tensor_3C6209B4FD120BDC_3 : !vm.buffer | |
%c16_8 = vm.const.i64 16 | |
%c16_9 = vm.const.i32 16 | |
%c3075_10 = vm.const.i32 3075 | |
vm.call @hal.buffer.assert(%ref_7, %_utf8_tensor_3C6209B4FD120BDC_3, %ref_3, %c16_8, %c16_9, %c3075_10) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> () | |
%c50 = vm.const.i32 50 | |
%c150998019 = vm.const.i32 150998019 | |
%c16_11 = vm.const.i64 16 | |
%ref_12 = vm.call @hal.allocator.allocate(%ref_3, %c50, %c150998019, %c16_11) : (!vm.ref<!hal.allocator>, i32, i32, i64) -> !vm.ref<!hal.buffer> | |
%c17 = vm.const.i32 17 | |
%c3 = vm.const.i32 3 | |
%ref_13 = vm.call @hal.command_buffer.create(%ref_2, %c17, %c3) : (!vm.ref<!hal.device>, i32, i32) -> !vm.ref<!hal.command_buffer> | |
vm.call @hal.command_buffer.begin(%ref_13) : (!vm.ref<!hal.command_buffer>) -> () | |
vm.cond_br %_device_query_0, ^bb1, ^bb2 | |
^bb1: // pred: ^bb0 | |
%zero_14 = vm.const.i64.zero | |
%c16_15 = vm.const.i64 16 | |
%zero_16 = vm.const.i64.zero | |
%c16_17 = vm.const.i64 16 | |
%zero_18 = vm.const.i64.zero | |
%c16_19 = vm.const.i64 16 | |
vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_13, %_executable_layout_0, %zero, [(%zero, %ref, %zero_14, %c16_15), (%c1, %ref_7, %zero_16, %c16_17), (%c2, %ref_12, %zero_18, %c16_19)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable_layout>, i32, tuple<i32, !vm.ref<!hal.buffer>, i64, i64> ...) | |
%zero_20 = vm.const.i32.zero | |
vm.call @hal.command_buffer.dispatch(%ref_13, %_executable_simple_mul_dispatch_0, %zero_20, %c1, %c1, %c1) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32) -> () | |
%c28 = vm.const.i32 28 | |
%c13 = vm.const.i32 13 | |
%zero_21 = vm.const.i32.zero | |
vm.call @hal.command_buffer.execution_barrier(%ref_13, %c28, %c13, %zero_21) : (!vm.ref<!hal.command_buffer>, i32, i32, i32) -> () | |
vm.call @hal.command_buffer.end(%ref_13) : (!vm.ref<!hal.command_buffer>) -> () | |
vm.call @hal.ex.submit_and_wait(%ref_2, %ref_13) : (!vm.ref<!hal.device>, !vm.ref<!hal.command_buffer>) -> () | |
%c4_22 = vm.const.i64 4 | |
%ref_23 = vm.call.variadic @hal.buffer_view.create(%ref_12, %c553648160, %c1_0, [%c4_22]) {nosideeffects} : (!vm.ref<!hal.buffer>, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view> | |
vm.return %ref_23 : !vm.ref<!hal.buffer_view> | |
^bb2: // pred: ^bb0 | |
%c2_24 = vm.const.i32 2 | |
vm.fail %c2_24, "device not supported in the compiled configuration" | |
} | |
vm.export @simple_mul | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::VM::DeduplicateRodataPass //----- // | |
vm.module public @module { | |
vm.global.i32 private @_device_query_0 : i32 | |
vm.global.ref private @_executable_layout_0 : !vm.ref<!hal.executable_layout> | |
vm.global.ref private @_executable_simple_mul_dispatch_0 : !vm.ref<!hal.executable> | |
vm.rodata private @simple_mul_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C4602010100000000000000000003003E000100000030130000000000004000000000000000E00900000000000000000000400038000700400014001200060000000400000040000000000000004000000000000000400000000000000088010000000000008801000000000000080000000000000001000000040000000000000000000000000000000000000000000000000000002D030000000000002D030000000000000010000000000000010000000500000030030000000000003013000000000000301300000000000031000000000000003100000000000000001000000000000001000000060000007003000000000000702300000000000070230000000000003801000000000000380100000000000000100000000000000200000006000000E803000000000000E823000000000000E823000000000000C000000000000000C000000000000000080000000000000052E57464040000007003000000000000702300000000000070230000000000003801000000000000900C000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120006005013000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000782300000000000008000000000000001003000000000000882300000000000008000000000000003013000000000000902300000000000008000000000000001003000000000000982300000000000008000000000000002C03000000000000A02300000000000008000000000000007023000000000000C02300000000000008000000000000008823000000000000C82300000000000008000000000000002803000000000000D02300000000000008000000000000009023000000000000D8230000000000000800000000000000982300000000000073696D706C655F6D756C5F64697370617463685F300000000000000000000000554889E5488B4620488B08488B5008488B40100F28010F59020F290031C05DC331C083FF02488D0D44100000480F44C1C3000000000000000000000000000000020000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
0000000000000000000000000000000100000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000D80000000000000009000000000000001800000000000000F9FFFF6F0000000009000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E10171B0EB44219110112060000022E006E0E030E3F19200B0000032E0111011206401831130000041D00311311011206580B590B570B000000590000000400000000000801000000000200070000000000000005000000301300000000000020000000020700000007000000010330130000000000002000000001562A000000042A0000003F130000000000001100000002010100006D6C6972002F0073696D706C655F6D756C5F64697370617463685F3000280000000200000000005D0000002A00000073696D706C655F6D756C5F64697370617463685F3000000000000E0000000200000000005D0000000000000000000000000014000000FFFFFFFF040008000178100C0708900100000000240000000000000030130000000000002000000000000000410E108602430D065B0C070800000000140000000000000050130000000000001100000000000000140000000000000000000000000000006F0000000000000014000000000000000000000000000000D300000000000000140000000000000000000000000000006F0000000000000014000000000000000000000000000000D300000000000000140000000000000000000000000000000900000000000000140000000000000000000000000000002800000000000000140000000000000000000000000000001E00000000000000140000000000000000000000000000002800000000000000140000000000000000000000000000006F000000000000001400000000000000000000000000000003010000000000001400000000000000000000000000000003000000000000001400000000000000000000000000000003000000000000001400000000000000000000000000000003000000000000001400000000000000000000000000000003000000000000001400000000000000000000000000000003000000000000001400000000000000000000000000000003000000000000001400000000000000000000000000000003
00000000000000140000000000000000000000000000006F0000000000000014000000000000000000000000000000B8000000000000001400000000000000000000000000000052010000000000001400000000000000000000000000000042030000000000009E00000004007F000000010101FB0E0D0001010101000000010000012F7573722F6C6F63616C2F676F6F676C652F686F6D652F62656E76616E696B2F7372632F6972656500003C756E6B6E6F776E3E0001000072756E74696D652F7372632F697265652F72756E74696D652F74657374646174612F73696D706C655F6D756C2E6D6C69720001000000000902301300000000000011040205010A4B0508AD080001014952454500000000000000000000000000000000000000000000000000002300000000020800E8230000000000000000000000000000010000001200060050130000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E74657874002E646174612E72656C2E726F002E64796E616D6963002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6672616D65002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D49430000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F0000000300000002000000000000001002000000000000100200000000000023000000000000000000000000000000010000000000000000000000000000001700000004000000020000000000000038020000000000003802000000000000D80000000000000001000000000000000800000000000000180000000000000021000000010000000200000000000000100300000000000010030000000000001D0000000000000000000000000000000800000000000000000000000000000029000000010000000600000000000000301300000000000030
0300000000000031000000000000000000000000000000100000000000000000000000000000002F0000000100000003000000000000007023000000000000700300000000000078000000000000000000000000000000100000000000000000000000000000003C000000060000000300000000000000E823000000000000E803000000000000C000000000000000030000000000000008000000000000001000000000000000450000000100000000000000000000000000000000000000A8040000000000004200000000000000000000000000000001000000000000000000000000000000530000000100000000000000000000000000000000000000EA040000000000005D000000000000000000000000000000010000000000000000000000000000005F000000010000003000000000000000000000000000000047050000000000001D000000000000000000000000000000010000000000000001000000000000006A000000010000000000000000000000000000000000000064050000000000002C000000000000000000000000000000010000000000000000000000000000007A0000000100000000000000000000000000000000000000900500000000000012000000000000000000000000000000010000000000000000000000000000008A0000000100000000000000000000000000000000000000A8050000000000005002000000000000000000000000000008000000000000000000000000000000970000000100000000000000000000000000000000000000F807000000000000A200000000000000000000000000000001000000000000000000000000000000A300000001000000300000000000000000000000000000009A080000000000000500000000000000000000000000000001000000000000000100000000000000AC0000000200000000000000000000000000000000000000A0080000000000004800000000000000130000000200000008000000000000001800000000000000B40000000300000000000000000000000000000000000000E808000000000000C600000000000000000000000000000001000000000000000000000000000000BE0000000300000000000000000000000000000000000000AE090000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3808xi8> | |
vm.rodata private @_utf8_hal_executable_format_EAB228F999C2D3A1 {alignment = 1 : i64} dense<[104, 97, 108, 46, 101, 120, 101, 99, 117, 116, 97, 98, 108, 101, 46, 102, 111, 114, 109, 97, 116]> : vector<21xi8> | |
vm.rodata private @_utf8_embedded_elf_x86_64_9FD8733DA4A6F228 {alignment = 1 : i64} dense<[101, 109, 98, 101, 100, 100, 101, 100, 45, 101, 108, 102, 45, 120, 56, 54, 95, 54, 52]> : vector<19xi8> | |
vm.initializer { | |
%ref = vm.call @hal.ex.shared_device() {nosideeffects} : () -> !vm.ref<!hal.device> | |
%_utf8_hal_executable_format_EAB228F999C2D3A1 = vm.const.ref.rodata @_utf8_hal_executable_format_EAB228F999C2D3A1 : !vm.buffer | |
%_utf8_embedded_elf_x86_64_9FD8733DA4A6F228 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_9FD8733DA4A6F228 : !vm.buffer | |
%0:2 = vm.call @hal.device.query.i32(%ref, %_utf8_hal_executable_format_EAB228F999C2D3A1, %_utf8_embedded_elf_x86_64_9FD8733DA4A6F228) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i32) | |
%c1 = vm.const.i32 1 | |
%1 = vm.and.i32 %0#1, %c1 : i32 | |
%zero = vm.const.i32.zero | |
%2 = vm.select.i32 %0#0, %1, %zero : i32 | |
%c1_0 = vm.const.i32 1 | |
%c1_1 = vm.const.i32 1 | |
%zero_2 = vm.const.i32.zero | |
%c7 = vm.const.i32 7 | |
%c1_3 = vm.const.i32 1 | |
%c7_4 = vm.const.i32 7 | |
%c2 = vm.const.i32 2 | |
%c7_5 = vm.const.i32 7 | |
%ref_6 = vm.call.variadic @hal.descriptor_set_layout.create(%ref, %c1_1, [(%zero_2, %c7), (%c1_3, %c7_4), (%c2, %c7_5)]) {nosideeffects} : (!vm.ref<!hal.device>, i32, tuple<i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout> | |
%zero_7 = vm.const.i32.zero | |
%ref_8 = vm.call.variadic @hal.executable_layout.create(%ref, %zero_7, [%ref_6]) {nosideeffects} : (!vm.ref<!hal.device>, i32, !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.executable_layout> | |
vm.global.store.i32 %2, @_device_query_0 : i32 | |
vm.global.store.ref %ref_8, @_executable_layout_0 : !vm.ref<!hal.executable_layout> | |
vm.cond_br %2, ^bb1, ^bb2 | |
^bb1: // pred: ^bb0 | |
%_executable_layout_0 = vm.global.load.ref @_executable_layout_0 : !vm.ref<!hal.executable_layout> | |
%_utf8_embedded_elf_x86_64_9FD8733DA4A6F228_9 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_9FD8733DA4A6F228 : !vm.buffer | |
%simple_mul_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @simple_mul_dispatch_0_embedded_elf_x86_64 : !vm.buffer | |
%null = vm.const.ref.zero : !vm.buffer | |
%ref_10 = vm.call.variadic @hal.executable.create(%ref, %_utf8_embedded_elf_x86_64_9FD8733DA4A6F228_9, %simple_mul_dispatch_0_embedded_elf_x86_64, %null, [%_executable_layout_0]) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref<!hal.executable_layout> ...) -> !vm.ref<!hal.executable> | |
vm.br ^bb3(%ref_10 : !vm.ref<!hal.executable>) | |
^bb2: // pred: ^bb0 | |
%null_11 = vm.const.ref.zero : !vm.ref<!hal.executable> | |
vm.br ^bb3(%null_11 : !vm.ref<!hal.executable>) | |
^bb3(%3: !vm.ref<!hal.executable>): // 2 preds: ^bb1, ^bb2 | |
vm.global.store.ref %3, @_executable_simple_mul_dispatch_0 : !vm.ref<!hal.executable> | |
vm.return | |
} | |
vm.import @hal.ex.shared_device() -> !vm.ref<!hal.device> attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.ex.submit_and_wait(%device : !vm.ref<!hal.device>, %command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"} | |
vm.import @hal.allocator.allocate(%allocator : !vm.ref<!hal.allocator>, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer> attributes {sym_visibility = "private"} | |
vm.import @hal.allocator.map.byte_buffer(%allocator : !vm.ref<!hal.allocator>, %try : i32, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref<!hal.buffer> attributes {sym_visibility = "private"} | |
vm.import @hal.allocator.wrap.byte_buffer(%allocator : !vm.ref<!hal.allocator>, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref<!hal.buffer> attributes {sym_visibility = "private"} | |
vm.import @hal.buffer.assert(%buffer : !vm.ref<!hal.buffer>, %message : !vm.buffer, %allocator : !vm.ref<!hal.allocator>, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) attributes {sym_visibility = "private"} | |
vm.import @hal.buffer.subspan(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i64) -> !vm.ref<!hal.buffer> attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.buffer.length(%buffer : !vm.ref<!hal.buffer>) -> i64 attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.buffer.load(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i32) -> i32 attributes {sym_visibility = "private"} | |
vm.import @hal.buffer.store(%value : i32, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i32) attributes {sym_visibility = "private"} | |
vm.import @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.buffer_view.assert(%buffer_view : !vm.ref<!hal.buffer_view>, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) attributes {sym_visibility = "private"} | |
vm.import @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.buffer_view.byte_length(%buffer_view : !vm.ref<!hal.buffer_view>) -> i64 attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.buffer_view.element_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.buffer_view.encoding_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.buffer_view.rank(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.buffer_view.dim(%buffer_view : !vm.ref<!hal.buffer_view>, %index : i32) -> i64 attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.buffer_view.trace(%key : !vm.buffer, %operands : !vm.ref<!hal.buffer_view> ...) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32) -> !vm.ref<!hal.command_buffer> attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.begin(%command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.end(%command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.begin_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>, %label : !vm.buffer) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.end_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.fill_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %pattern : i32, %pattern_length : i32) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.copy_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.push_constants(%command_buffer : !vm.ref<!hal.command_buffer>, %executable_layout : !vm.ref<!hal.executable_layout>, %offset : i32, %values : i32 ...) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %executable_layout : !vm.ref<!hal.executable_layout>, %set : i32, %bindings : tuple<i32, !vm.ref<!hal.buffer>, i64, i64> ...) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.bind_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %executable_layout : !vm.ref<!hal.executable_layout>, %set : i32, %descriptor_set : !vm.ref<!hal.descriptor_set>, %dynamic_offsets : i64 ...) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.dispatch.indirect(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroups_buffer : !vm.ref<!hal.buffer>, %workgroups_offset : i64) attributes {sym_visibility = "private"} | |
vm.import @hal.descriptor_set.create(%device : !vm.ref<!hal.device>, %set_layout : !vm.ref<!hal.descriptor_set_layout>, %bindings : tuple<i32, !vm.ref<!hal.buffer>, i64, i64> ...) -> !vm.ref<!hal.descriptor_set> attributes {sym_visibility = "private"} | |
vm.import @hal.descriptor_set_layout.create(%device : !vm.ref<!hal.device>, %usage_type : i32, %bindings : tuple<i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout> attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.device.query.i32(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i32) attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.executable.create(%device : !vm.ref<!hal.device>, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %executable_layouts : !vm.ref<!hal.executable_layout> ...) -> !vm.ref<!hal.executable> attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.executable_layout.create(%device : !vm.ref<!hal.device>, %push_constants : i32, %set_layouts : !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.executable_layout> attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.semaphore.create(%device : !vm.ref<!hal.device>, %initial_value : i64) -> !vm.ref<!hal.semaphore> attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.semaphore.query(%semaphore : !vm.ref<!hal.semaphore>) -> (i32, i64) attributes {sym_visibility = "private"} | |
vm.import @hal.semaphore.signal(%semaphore : !vm.ref<!hal.semaphore>, %new_value : i64) attributes {sym_visibility = "private"} | |
vm.import @hal.semaphore.fail(%semaphore : !vm.ref<!hal.semaphore>, %status : i32) attributes {sym_visibility = "private"} | |
vm.import @hal.semaphore.await(%semaphore : !vm.ref<!hal.semaphore>, %min_value : i64) -> i32 attributes {sym_visibility = "private"} | |
vm.rodata private @_utf8_tensor_3C6209B4FD120BDC {alignment = 1 : i64} dense<[116, 101, 110, 115, 111, 114]> : vector<6xi8> | |
vm.func private @simple_mul(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer_view> { | |
%c2 = vm.const.i32 2 | |
%zero = vm.const.i32.zero | |
%c16 = vm.const.i32 16 | |
%c4 = vm.const.i32 4 | |
%c1 = vm.const.i32 1 | |
%c553648160 = vm.const.i32 553648160 | |
%c1_0 = vm.const.i32 1 | |
%_device_query_0 = vm.global.load.i32 @_device_query_0 : i32 | |
%_executable_layout_0 = vm.global.load.ref @_executable_layout_0 : !vm.ref<!hal.executable_layout> | |
%_executable_simple_mul_dispatch_0 = vm.global.load.ref @_executable_simple_mul_dispatch_0 : !vm.ref<!hal.executable> | |
%_utf8_tensor_3C6209B4FD120BDC = vm.const.ref.rodata @_utf8_tensor_3C6209B4FD120BDC : !vm.buffer | |
%c4_1 = vm.const.i64 4 | |
vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_tensor_3C6209B4FD120BDC, %c553648160, %c1_0, [%c4_1]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...) | |
%ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> | |
%ref_2 = vm.call @hal.ex.shared_device() {nosideeffects} : () -> !vm.ref<!hal.device> | |
%ref_3 = vm.call @hal.device.allocator(%ref_2) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> | |
%_utf8_tensor_3C6209B4FD120BDC_4 = vm.const.ref.rodata @_utf8_tensor_3C6209B4FD120BDC : !vm.buffer | |
%c16_5 = vm.const.i64 16 | |
%c16_6 = vm.const.i32 16 | |
%c3075 = vm.const.i32 3075 | |
vm.call @hal.buffer.assert(%ref, %_utf8_tensor_3C6209B4FD120BDC_4, %ref_3, %c16_5, %c16_6, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> () | |
%_utf8_tensor_3C6209B4FD120BDC_7 = vm.const.ref.rodata @_utf8_tensor_3C6209B4FD120BDC : !vm.buffer | |
%c4_8 = vm.const.i64 4 | |
vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_tensor_3C6209B4FD120BDC_7, %c553648160, %c1_0, [%c4_8]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...) | |
%ref_9 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> | |
%_utf8_tensor_3C6209B4FD120BDC_10 = vm.const.ref.rodata @_utf8_tensor_3C6209B4FD120BDC : !vm.buffer | |
%c16_11 = vm.const.i64 16 | |
%c16_12 = vm.const.i32 16 | |
%c3075_13 = vm.const.i32 3075 | |
vm.call @hal.buffer.assert(%ref_9, %_utf8_tensor_3C6209B4FD120BDC_10, %ref_3, %c16_11, %c16_12, %c3075_13) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> () | |
%c50 = vm.const.i32 50 | |
%c150998019 = vm.const.i32 150998019 | |
%c16_14 = vm.const.i64 16 | |
%ref_15 = vm.call @hal.allocator.allocate(%ref_3, %c50, %c150998019, %c16_14) : (!vm.ref<!hal.allocator>, i32, i32, i64) -> !vm.ref<!hal.buffer> | |
%c17 = vm.const.i32 17 | |
%c3 = vm.const.i32 3 | |
%ref_16 = vm.call @hal.command_buffer.create(%ref_2, %c17, %c3) : (!vm.ref<!hal.device>, i32, i32) -> !vm.ref<!hal.command_buffer> | |
vm.call @hal.command_buffer.begin(%ref_16) : (!vm.ref<!hal.command_buffer>) -> () | |
vm.cond_br %_device_query_0, ^bb1, ^bb2 | |
^bb1: // pred: ^bb0 | |
%zero_17 = vm.const.i64.zero | |
%c16_18 = vm.const.i64 16 | |
%zero_19 = vm.const.i64.zero | |
%c16_20 = vm.const.i64 16 | |
%zero_21 = vm.const.i64.zero | |
%c16_22 = vm.const.i64 16 | |
vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_16, %_executable_layout_0, %zero, [(%zero, %ref, %zero_17, %c16_18), (%c1, %ref_9, %zero_19, %c16_20), (%c2, %ref_15, %zero_21, %c16_22)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable_layout>, i32, tuple<i32, !vm.ref<!hal.buffer>, i64, i64> ...) | |
%zero_23 = vm.const.i32.zero | |
vm.call @hal.command_buffer.dispatch(%ref_16, %_executable_simple_mul_dispatch_0, %zero_23, %c1, %c1, %c1) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32) -> () | |
%c28 = vm.const.i32 28 | |
%c13 = vm.const.i32 13 | |
%zero_24 = vm.const.i32.zero | |
vm.call @hal.command_buffer.execution_barrier(%ref_16, %c28, %c13, %zero_24) : (!vm.ref<!hal.command_buffer>, i32, i32, i32) -> () | |
vm.call @hal.command_buffer.end(%ref_16) : (!vm.ref<!hal.command_buffer>) -> () | |
vm.call @hal.ex.submit_and_wait(%ref_2, %ref_16) : (!vm.ref<!hal.device>, !vm.ref<!hal.command_buffer>) -> () | |
%c4_25 = vm.const.i64 4 | |
%ref_26 = vm.call.variadic @hal.buffer_view.create(%ref_15, %c553648160, %c1_0, [%c4_25]) {nosideeffects} : (!vm.ref<!hal.buffer>, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view> | |
vm.return %ref_26 : !vm.ref<!hal.buffer_view> | |
^bb2: // pred: ^bb0 | |
%c2_27 = vm.const.i32 2 | |
vm.fail %c2_27, "device not supported in the compiled configuration" | |
} | |
vm.export @simple_mul | |
} | |
// -----// IR Dump After mlir::iree_compiler::IREE::VM::GlobalInitializationPass //----- // | |
vm.module public @module { | |
vm.global.i32 private mutable @_device_query_0 : i32 | |
vm.global.ref private mutable @_executable_layout_0 : !vm.ref<!hal.executable_layout> | |
vm.global.ref private mutable @_executable_simple_mul_dispatch_0 : !vm.ref<!hal.executable> | |
vm.rodata private @simple_mul_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C4602010100000000000000000003003E000100000030130000000000004000000000000000E00900000000000000000000400038000700400014001200060000000400000040000000000000004000000000000000400000000000000088010000000000008801000000000000080000000000000001000000040000000000000000000000000000000000000000000000000000002D030000000000002D030000000000000010000000000000010000000500000030030000000000003013000000000000301300000000000031000000000000003100000000000000001000000000000001000000060000007003000000000000702300000000000070230000000000003801000000000000380100000000000000100000000000000200000006000000E803000000000000E823000000000000E823000000000000C000000000000000C000000000000000080000000000000052E57464040000007003000000000000702300000000000070230000000000003801000000000000900C000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120006005013000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000782300000000000008000000000000001003000000000000882300000000000008000000000000003013000000000000902300000000000008000000000000001003000000000000982300000000000008000000000000002C03000000000000A02300000000000008000000000000007023000000000000C02300000000000008000000000000008823000000000000C82300000000000008000000000000002803000000000000D02300000000000008000000000000009023000000000000D8230000000000000800000000000000982300000000000073696D706C655F6D756C5F64697370617463685F300000000000000000000000554889E5488B4620488B08488B5008488B40100F28010F59020F290031C05DC331C083FF02488D0D44100000480F44C1C3000000000000000000000000000000020000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
0000000000000000000000000000000100000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000D80000000000000009000000000000001800000000000000F9FFFF6F0000000009000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E10171B0EB44219110112060000022E006E0E030E3F19200B0000032E0111011206401831130000041D00311311011206580B590B570B000000590000000400000000000801000000000200070000000000000005000000301300000000000020000000020700000007000000010330130000000000002000000001562A000000042A0000003F130000000000001100000002010100006D6C6972002F0073696D706C655F6D756C5F64697370617463685F3000280000000200000000005D0000002A00000073696D706C655F6D756C5F64697370617463685F3000000000000E0000000200000000005D0000000000000000000000000014000000FFFFFFFF040008000178100C0708900100000000240000000000000030130000000000002000000000000000410E108602430D065B0C070800000000140000000000000050130000000000001100000000000000140000000000000000000000000000006F0000000000000014000000000000000000000000000000D300000000000000140000000000000000000000000000006F0000000000000014000000000000000000000000000000D300000000000000140000000000000000000000000000000900000000000000140000000000000000000000000000002800000000000000140000000000000000000000000000001E00000000000000140000000000000000000000000000002800000000000000140000000000000000000000000000006F000000000000001400000000000000000000000000000003010000000000001400000000000000000000000000000003000000000000001400000000000000000000000000000003000000000000001400000000000000000000000000000003000000000000001400000000000000000000000000000003000000000000001400000000000000000000000000000003000000000000001400000000000000000000000000000003000000000000001400000000000000000000000000000003
00000000000000140000000000000000000000000000006F0000000000000014000000000000000000000000000000B8000000000000001400000000000000000000000000000052010000000000001400000000000000000000000000000042030000000000009E00000004007F000000010101FB0E0D0001010101000000010000012F7573722F6C6F63616C2F676F6F676C652F686F6D652F62656E76616E696B2F7372632F6972656500003C756E6B6E6F776E3E0001000072756E74696D652F7372632F697265652F72756E74696D652F74657374646174612F73696D706C655F6D756C2E6D6C69720001000000000902301300000000000011040205010A4B0508AD080001014952454500000000000000000000000000000000000000000000000000002300000000020800E8230000000000000000000000000000010000001200060050130000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E74657874002E646174612E72656C2E726F002E64796E616D6963002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6672616D65002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D49430000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F0000000300000002000000000000001002000000000000100200000000000023000000000000000000000000000000010000000000000000000000000000001700000004000000020000000000000038020000000000003802000000000000D80000000000000001000000000000000800000000000000180000000000000021000000010000000200000000000000100300000000000010030000000000001D0000000000000000000000000000000800000000000000000000000000000029000000010000000600000000000000301300000000000030
0300000000000031000000000000000000000000000000100000000000000000000000000000002F0000000100000003000000000000007023000000000000700300000000000078000000000000000000000000000000100000000000000000000000000000003C000000060000000300000000000000E823000000000000E803000000000000C000000000000000030000000000000008000000000000001000000000000000450000000100000000000000000000000000000000000000A8040000000000004200000000000000000000000000000001000000000000000000000000000000530000000100000000000000000000000000000000000000EA040000000000005D000000000000000000000000000000010000000000000000000000000000005F000000010000003000000000000000000000000000000047050000000000001D000000000000000000000000000000010000000000000001000000000000006A000000010000000000000000000000000000000000000064050000000000002C000000000000000000000000000000010000000000000000000000000000007A0000000100000000000000000000000000000000000000900500000000000012000000000000000000000000000000010000000000000000000000000000008A0000000100000000000000000000000000000000000000A8050000000000005002000000000000000000000000000008000000000000000000000000000000970000000100000000000000000000000000000000000000F807000000000000A200000000000000000000000000000001000000000000000000000000000000A300000001000000300000000000000000000000000000009A080000000000000500000000000000000000000000000001000000000000000100000000000000AC0000000200000000000000000000000000000000000000A0080000000000004800000000000000130000000200000008000000000000001800000000000000B40000000300000000000000000000000000000000000000E808000000000000C600000000000000000000000000000001000000000000000000000000000000BE0000000300000000000000000000000000000000000000AE090000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3808xi8> | |
vm.rodata private @_utf8_hal_executable_format_EAB228F999C2D3A1 {alignment = 1 : i64} dense<[104, 97, 108, 46, 101, 120, 101, 99, 117, 116, 97, 98, 108, 101, 46, 102, 111, 114, 109, 97, 116]> : vector<21xi8> | |
vm.rodata private @_utf8_embedded_elf_x86_64_9FD8733DA4A6F228 {alignment = 1 : i64} dense<[101, 109, 98, 101, 100, 100, 101, 100, 45, 101, 108, 102, 45, 120, 56, 54, 95, 54, 52]> : vector<19xi8> | |
vm.import @hal.ex.shared_device() -> !vm.ref<!hal.device> attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.ex.submit_and_wait(%device : !vm.ref<!hal.device>, %command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"} | |
vm.import @hal.allocator.allocate(%allocator : !vm.ref<!hal.allocator>, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer> attributes {sym_visibility = "private"} | |
vm.import @hal.allocator.map.byte_buffer(%allocator : !vm.ref<!hal.allocator>, %try : i32, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref<!hal.buffer> attributes {sym_visibility = "private"} | |
vm.import @hal.allocator.wrap.byte_buffer(%allocator : !vm.ref<!hal.allocator>, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref<!hal.buffer> attributes {sym_visibility = "private"} | |
vm.import @hal.buffer.assert(%buffer : !vm.ref<!hal.buffer>, %message : !vm.buffer, %allocator : !vm.ref<!hal.allocator>, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) attributes {sym_visibility = "private"} | |
vm.import @hal.buffer.subspan(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i64) -> !vm.ref<!hal.buffer> attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.buffer.length(%buffer : !vm.ref<!hal.buffer>) -> i64 attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.buffer.load(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i32) -> i32 attributes {sym_visibility = "private"} | |
vm.import @hal.buffer.store(%value : i32, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i32) attributes {sym_visibility = "private"} | |
vm.import @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.buffer_view.assert(%buffer_view : !vm.ref<!hal.buffer_view>, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) attributes {sym_visibility = "private"} | |
vm.import @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.buffer_view.byte_length(%buffer_view : !vm.ref<!hal.buffer_view>) -> i64 attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.buffer_view.element_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.buffer_view.encoding_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.buffer_view.rank(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.buffer_view.dim(%buffer_view : !vm.ref<!hal.buffer_view>, %index : i32) -> i64 attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.buffer_view.trace(%key : !vm.buffer, %operands : !vm.ref<!hal.buffer_view> ...) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32) -> !vm.ref<!hal.command_buffer> attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.begin(%command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.end(%command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.begin_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>, %label : !vm.buffer) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.end_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.fill_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %pattern : i32, %pattern_length : i32) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.copy_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.push_constants(%command_buffer : !vm.ref<!hal.command_buffer>, %executable_layout : !vm.ref<!hal.executable_layout>, %offset : i32, %values : i32 ...) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %executable_layout : !vm.ref<!hal.executable_layout>, %set : i32, %bindings : tuple<i32, !vm.ref<!hal.buffer>, i64, i64> ...) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.bind_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %executable_layout : !vm.ref<!hal.executable_layout>, %set : i32, %descriptor_set : !vm.ref<!hal.descriptor_set>, %dynamic_offsets : i64 ...) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.dispatch.indirect(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroups_buffer : !vm.ref<!hal.buffer>, %workgroups_offset : i64) attributes {sym_visibility = "private"} | |
vm.import @hal.descriptor_set.create(%device : !vm.ref<!hal.device>, %set_layout : !vm.ref<!hal.descriptor_set_layout>, %bindings : tuple<i32, !vm.ref<!hal.buffer>, i64, i64> ...) -> !vm.ref<!hal.descriptor_set> attributes {sym_visibility = "private"} | |
vm.import @hal.descriptor_set_layout.create(%device : !vm.ref<!hal.device>, %usage_type : i32, %bindings : tuple<i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout> attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.device.query.i32(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i32) attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.executable.create(%device : !vm.ref<!hal.device>, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %executable_layouts : !vm.ref<!hal.executable_layout> ...) -> !vm.ref<!hal.executable> attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.executable_layout.create(%device : !vm.ref<!hal.device>, %push_constants : i32, %set_layouts : !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.executable_layout> attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.semaphore.create(%device : !vm.ref<!hal.device>, %initial_value : i64) -> !vm.ref<!hal.semaphore> attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.semaphore.query(%semaphore : !vm.ref<!hal.semaphore>) -> (i32, i64) attributes {sym_visibility = "private"} | |
vm.import @hal.semaphore.signal(%semaphore : !vm.ref<!hal.semaphore>, %new_value : i64) attributes {sym_visibility = "private"} | |
vm.import @hal.semaphore.fail(%semaphore : !vm.ref<!hal.semaphore>, %status : i32) attributes {sym_visibility = "private"} | |
vm.import @hal.semaphore.await(%semaphore : !vm.ref<!hal.semaphore>, %min_value : i64) -> i32 attributes {sym_visibility = "private"} | |
vm.rodata private @_utf8_tensor_3C6209B4FD120BDC {alignment = 1 : i64} dense<[116, 101, 110, 115, 111, 114]> : vector<6xi8> | |
vm.func private @simple_mul(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer_view> { | |
%c2 = vm.const.i32 2 | |
%zero = vm.const.i32.zero | |
%c16 = vm.const.i32 16 | |
%c4 = vm.const.i32 4 | |
%c1 = vm.const.i32 1 | |
%c553648160 = vm.const.i32 553648160 | |
%c1_0 = vm.const.i32 1 | |
%_device_query_0 = vm.global.load.i32 @_device_query_0 : i32 | |
%_executable_layout_0 = vm.global.load.ref @_executable_layout_0 : !vm.ref<!hal.executable_layout> | |
%_executable_simple_mul_dispatch_0 = vm.global.load.ref @_executable_simple_mul_dispatch_0 : !vm.ref<!hal.executable> | |
%_utf8_tensor_3C6209B4FD120BDC = vm.const.ref.rodata @_utf8_tensor_3C6209B4FD120BDC : !vm.buffer | |
%c4_1 = vm.const.i64 4 | |
vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_tensor_3C6209B4FD120BDC, %c553648160, %c1_0, [%c4_1]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...) | |
%ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> | |
%ref_2 = vm.call @hal.ex.shared_device() {nosideeffects} : () -> !vm.ref<!hal.device> | |
%ref_3 = vm.call @hal.device.allocator(%ref_2) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> | |
%_utf8_tensor_3C6209B4FD120BDC_4 = vm.const.ref.rodata @_utf8_tensor_3C6209B4FD120BDC : !vm.buffer | |
%c16_5 = vm.const.i64 16 | |
%c16_6 = vm.const.i32 16 | |
%c3075 = vm.const.i32 3075 | |
vm.call @hal.buffer.assert(%ref, %_utf8_tensor_3C6209B4FD120BDC_4, %ref_3, %c16_5, %c16_6, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> () | |
%_utf8_tensor_3C6209B4FD120BDC_7 = vm.const.ref.rodata @_utf8_tensor_3C6209B4FD120BDC : !vm.buffer | |
%c4_8 = vm.const.i64 4 | |
vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_tensor_3C6209B4FD120BDC_7, %c553648160, %c1_0, [%c4_8]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...) | |
%ref_9 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> | |
%_utf8_tensor_3C6209B4FD120BDC_10 = vm.const.ref.rodata @_utf8_tensor_3C6209B4FD120BDC : !vm.buffer | |
%c16_11 = vm.const.i64 16 | |
%c16_12 = vm.const.i32 16 | |
%c3075_13 = vm.const.i32 3075 | |
vm.call @hal.buffer.assert(%ref_9, %_utf8_tensor_3C6209B4FD120BDC_10, %ref_3, %c16_11, %c16_12, %c3075_13) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> () | |
%c50 = vm.const.i32 50 | |
%c150998019 = vm.const.i32 150998019 | |
%c16_14 = vm.const.i64 16 | |
%ref_15 = vm.call @hal.allocator.allocate(%ref_3, %c50, %c150998019, %c16_14) : (!vm.ref<!hal.allocator>, i32, i32, i64) -> !vm.ref<!hal.buffer> | |
%c17 = vm.const.i32 17 | |
%c3 = vm.const.i32 3 | |
%ref_16 = vm.call @hal.command_buffer.create(%ref_2, %c17, %c3) : (!vm.ref<!hal.device>, i32, i32) -> !vm.ref<!hal.command_buffer> | |
vm.call @hal.command_buffer.begin(%ref_16) : (!vm.ref<!hal.command_buffer>) -> () | |
vm.cond_br %_device_query_0, ^bb1, ^bb2 | |
^bb1: // pred: ^bb0 | |
%zero_17 = vm.const.i64.zero | |
%c16_18 = vm.const.i64 16 | |
%zero_19 = vm.const.i64.zero | |
%c16_20 = vm.const.i64 16 | |
%zero_21 = vm.const.i64.zero | |
%c16_22 = vm.const.i64 16 | |
vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_16, %_executable_layout_0, %zero, [(%zero, %ref, %zero_17, %c16_18), (%c1, %ref_9, %zero_19, %c16_20), (%c2, %ref_15, %zero_21, %c16_22)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable_layout>, i32, tuple<i32, !vm.ref<!hal.buffer>, i64, i64> ...) | |
%zero_23 = vm.const.i32.zero | |
vm.call @hal.command_buffer.dispatch(%ref_16, %_executable_simple_mul_dispatch_0, %zero_23, %c1, %c1, %c1) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32) -> () | |
%c28 = vm.const.i32 28 | |
%c13 = vm.const.i32 13 | |
%zero_24 = vm.const.i32.zero | |
vm.call @hal.command_buffer.execution_barrier(%ref_16, %c28, %c13, %zero_24) : (!vm.ref<!hal.command_buffer>, i32, i32, i32) -> () | |
vm.call @hal.command_buffer.end(%ref_16) : (!vm.ref<!hal.command_buffer>) -> () | |
vm.call @hal.ex.submit_and_wait(%ref_2, %ref_16) : (!vm.ref<!hal.device>, !vm.ref<!hal.command_buffer>) -> () | |
%c4_25 = vm.const.i64 4 | |
%ref_26 = vm.call.variadic @hal.buffer_view.create(%ref_15, %c553648160, %c1_0, [%c4_25]) {nosideeffects} : (!vm.ref<!hal.buffer>, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view> | |
vm.return %ref_26 : !vm.ref<!hal.buffer_view> | |
^bb2: // pred: ^bb0 | |
%c2_27 = vm.const.i32 2 | |
vm.fail %c2_27, "device not supported in the compiled configuration" | |
} | |
vm.export @simple_mul | |
vm.export @__init | |
vm.func private @__init() { | |
%ref = vm.call @hal.ex.shared_device() {nosideeffects} : () -> !vm.ref<!hal.device> | |
%_utf8_hal_executable_format_EAB228F999C2D3A1 = vm.const.ref.rodata @_utf8_hal_executable_format_EAB228F999C2D3A1 : !vm.buffer | |
%_utf8_embedded_elf_x86_64_9FD8733DA4A6F228 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_9FD8733DA4A6F228 : !vm.buffer | |
%0:2 = vm.call @hal.device.query.i32(%ref, %_utf8_hal_executable_format_EAB228F999C2D3A1, %_utf8_embedded_elf_x86_64_9FD8733DA4A6F228) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i32) | |
%c1 = vm.const.i32 1 | |
%1 = vm.and.i32 %0#1, %c1 : i32 | |
%zero = vm.const.i32.zero | |
%2 = vm.select.i32 %0#0, %1, %zero : i32 | |
%c1_0 = vm.const.i32 1 | |
%c1_1 = vm.const.i32 1 | |
%zero_2 = vm.const.i32.zero | |
%c7 = vm.const.i32 7 | |
%c1_3 = vm.const.i32 1 | |
%c7_4 = vm.const.i32 7 | |
%c2 = vm.const.i32 2 | |
%c7_5 = vm.const.i32 7 | |
%ref_6 = vm.call.variadic @hal.descriptor_set_layout.create(%ref, %c1_1, [(%zero_2, %c7), (%c1_3, %c7_4), (%c2, %c7_5)]) {nosideeffects} : (!vm.ref<!hal.device>, i32, tuple<i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout> | |
%zero_7 = vm.const.i32.zero | |
%ref_8 = vm.call.variadic @hal.executable_layout.create(%ref, %zero_7, [%ref_6]) {nosideeffects} : (!vm.ref<!hal.device>, i32, !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.executable_layout> | |
vm.global.store.i32 %2, @_device_query_0 : i32 | |
vm.global.store.ref %ref_8, @_executable_layout_0 : !vm.ref<!hal.executable_layout> | |
vm.cond_br %2, ^bb1, ^bb2 | |
^bb1: // pred: ^bb0 | |
%_executable_layout_0 = vm.global.load.ref @_executable_layout_0 : !vm.ref<!hal.executable_layout> | |
%_utf8_embedded_elf_x86_64_9FD8733DA4A6F228_9 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_9FD8733DA4A6F228 : !vm.buffer | |
%simple_mul_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @simple_mul_dispatch_0_embedded_elf_x86_64 : !vm.buffer | |
%null = vm.const.ref.zero : !vm.buffer | |
%ref_10 = vm.call.variadic @hal.executable.create(%ref, %_utf8_embedded_elf_x86_64_9FD8733DA4A6F228_9, %simple_mul_dispatch_0_embedded_elf_x86_64, %null, [%_executable_layout_0]) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref<!hal.executable_layout> ...) -> !vm.ref<!hal.executable> | |
vm.br ^bb3(%ref_10 : !vm.ref<!hal.executable>) | |
^bb2: // pred: ^bb0 | |
%null_11 = vm.const.ref.zero : !vm.ref<!hal.executable> | |
vm.br ^bb3(%null_11 : !vm.ref<!hal.executable>) | |
^bb3(%3: !vm.ref<!hal.executable>): // 2 preds: ^bb1, ^bb2 | |
vm.global.store.ref %3, @_executable_simple_mul_dispatch_0 : !vm.ref<!hal.executable> | |
vm.br ^bb4 | |
^bb4: // pred: ^bb3 | |
vm.return | |
} | |
vm.export @__deinit | |
vm.func private @__deinit() { | |
vm.return | |
} | |
} | |
// -----// IR Dump After Canonicalizer //----- // | |
vm.func private @__deinit() { | |
vm.return | |
} | |
// -----// IR Dump After Canonicalizer //----- // | |
vm.func private @__init() { | |
%null = vm.const.ref.zero : !vm.ref<!hal.executable> | |
%null_0 = vm.const.ref.zero : !vm.buffer | |
%c2 = vm.const.i32 2 | |
%c7 = vm.const.i32 7 | |
%zero = vm.const.i32.zero | |
%c1 = vm.const.i32 1 | |
%ref = vm.call @hal.ex.shared_device() {nosideeffects} : () -> !vm.ref<!hal.device> | |
%_utf8_hal_executable_format_EAB228F999C2D3A1 = vm.const.ref.rodata @_utf8_hal_executable_format_EAB228F999C2D3A1 : !vm.buffer | |
%_utf8_embedded_elf_x86_64_9FD8733DA4A6F228 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_9FD8733DA4A6F228 : !vm.buffer | |
%0:2 = vm.call @hal.device.query.i32(%ref, %_utf8_hal_executable_format_EAB228F999C2D3A1, %_utf8_embedded_elf_x86_64_9FD8733DA4A6F228) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i32) | |
%1 = vm.and.i32 %0#1, %c1 : i32 | |
%2 = vm.select.i32 %0#0, %1, %zero : i32 | |
%ref_1 = vm.call.variadic @hal.descriptor_set_layout.create(%ref, %c1, [(%zero, %c7), (%c1, %c7), (%c2, %c7)]) {nosideeffects} : (!vm.ref<!hal.device>, i32, tuple<i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout> | |
%ref_2 = vm.call.variadic @hal.executable_layout.create(%ref, %zero, [%ref_1]) {nosideeffects} : (!vm.ref<!hal.device>, i32, !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.executable_layout> | |
vm.global.store.i32 %2, @_device_query_0 : i32 | |
vm.global.store.ref %ref_2, @_executable_layout_0 : !vm.ref<!hal.executable_layout> | |
vm.cond_br %2, ^bb1, ^bb2 | |
^bb1: // pred: ^bb0 | |
%_executable_layout_0 = vm.global.load.ref @_executable_layout_0 : !vm.ref<!hal.executable_layout> | |
%_utf8_embedded_elf_x86_64_9FD8733DA4A6F228_3 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_9FD8733DA4A6F228 : !vm.buffer | |
%simple_mul_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @simple_mul_dispatch_0_embedded_elf_x86_64 : !vm.buffer | |
%ref_4 = vm.call.variadic @hal.executable.create(%ref, %_utf8_embedded_elf_x86_64_9FD8733DA4A6F228_3, %simple_mul_dispatch_0_embedded_elf_x86_64, %null_0, [%_executable_layout_0]) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref<!hal.executable_layout> ...) -> !vm.ref<!hal.executable> | |
vm.br ^bb3(%ref_4 : !vm.ref<!hal.executable>) | |
^bb2: // pred: ^bb0 | |
vm.br ^bb3(%null : !vm.ref<!hal.executable>) | |
^bb3(%3: !vm.ref<!hal.executable>): // 2 preds: ^bb1, ^bb2 | |
vm.global.store.ref %3, @_executable_simple_mul_dispatch_0 : !vm.ref<!hal.executable> | |
vm.return | |
} | |
// -----// IR Dump After Canonicalizer //----- // | |
vm.func private @simple_mul(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer_view> { | |
%c13 = vm.const.i32 13 | |
%c28 = vm.const.i32 28 | |
%zero = vm.const.i64.zero | |
%c3 = vm.const.i32 3 | |
%c17 = vm.const.i32 17 | |
%c150998019 = vm.const.i32 150998019 | |
%c50 = vm.const.i32 50 | |
%c3075 = vm.const.i32 3075 | |
%c16 = vm.const.i64 16 | |
%c4 = vm.const.i64 4 | |
%c2 = vm.const.i32 2 | |
%zero_0 = vm.const.i32.zero | |
%c16_1 = vm.const.i32 16 | |
%c1 = vm.const.i32 1 | |
%c553648160 = vm.const.i32 553648160 | |
%_device_query_0 = vm.global.load.i32 @_device_query_0 : i32 | |
%_executable_layout_0 = vm.global.load.ref @_executable_layout_0 : !vm.ref<!hal.executable_layout> | |
%_executable_simple_mul_dispatch_0 = vm.global.load.ref @_executable_simple_mul_dispatch_0 : !vm.ref<!hal.executable> | |
%_utf8_tensor_3C6209B4FD120BDC = vm.const.ref.rodata @_utf8_tensor_3C6209B4FD120BDC : !vm.buffer | |
vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_tensor_3C6209B4FD120BDC, %c553648160, %c1, [%c4]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...) | |
%ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> | |
%ref_2 = vm.call @hal.ex.shared_device() {nosideeffects} : () -> !vm.ref<!hal.device> | |
%ref_3 = vm.call @hal.device.allocator(%ref_2) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> | |
%_utf8_tensor_3C6209B4FD120BDC_4 = vm.const.ref.rodata @_utf8_tensor_3C6209B4FD120BDC : !vm.buffer | |
vm.call @hal.buffer.assert(%ref, %_utf8_tensor_3C6209B4FD120BDC_4, %ref_3, %c16, %c16_1, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> () | |
%_utf8_tensor_3C6209B4FD120BDC_5 = vm.const.ref.rodata @_utf8_tensor_3C6209B4FD120BDC : !vm.buffer | |
vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_tensor_3C6209B4FD120BDC_5, %c553648160, %c1, [%c4]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...) | |
%ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> | |
%_utf8_tensor_3C6209B4FD120BDC_7 = vm.const.ref.rodata @_utf8_tensor_3C6209B4FD120BDC : !vm.buffer | |
vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_3C6209B4FD120BDC_7, %ref_3, %c16, %c16_1, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> () | |
%ref_8 = vm.call @hal.allocator.allocate(%ref_3, %c50, %c150998019, %c16) : (!vm.ref<!hal.allocator>, i32, i32, i64) -> !vm.ref<!hal.buffer> | |
%ref_9 = vm.call @hal.command_buffer.create(%ref_2, %c17, %c3) : (!vm.ref<!hal.device>, i32, i32) -> !vm.ref<!hal.command_buffer> | |
vm.call @hal.command_buffer.begin(%ref_9) : (!vm.ref<!hal.command_buffer>) -> () | |
vm.cond_br %_device_query_0, ^bb1, ^bb2 | |
^bb1: // pred: ^bb0 | |
vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %_executable_layout_0, %zero_0, [(%zero_0, %ref, %zero, %c16), (%c1, %ref_6, %zero, %c16), (%c2, %ref_8, %zero, %c16)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable_layout>, i32, tuple<i32, !vm.ref<!hal.buffer>, i64, i64> ...) | |
vm.call @hal.command_buffer.dispatch(%ref_9, %_executable_simple_mul_dispatch_0, %zero_0, %c1, %c1, %c1) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32) -> () | |
vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero_0) : (!vm.ref<!hal.command_buffer>, i32, i32, i32) -> () | |
vm.call @hal.command_buffer.end(%ref_9) : (!vm.ref<!hal.command_buffer>) -> () | |
vm.call @hal.ex.submit_and_wait(%ref_2, %ref_9) : (!vm.ref<!hal.device>, !vm.ref<!hal.command_buffer>) -> () | |
%ref_10 = vm.call.variadic @hal.buffer_view.create(%ref_8, %c553648160, %c1, [%c4]) {nosideeffects} : (!vm.ref<!hal.buffer>, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view> | |
vm.return %ref_10 : !vm.ref<!hal.buffer_view> | |
^bb2: // pred: ^bb0 | |
vm.fail %c2, "device not supported in the compiled configuration" | |
} | |
// -----// IR Dump After Inliner //----- // | |
#device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]}> | |
module attributes {hal.device.targets = [#device_target_cpu], vm.toplevel} { | |
vm.module public @module { | |
vm.global.i32 private mutable @_device_query_0 : i32 | |
vm.global.ref private mutable @_executable_layout_0 : !vm.ref<!hal.executable_layout> | |
vm.global.ref private mutable @_executable_simple_mul_dispatch_0 : !vm.ref<!hal.executable> | |
vm.rodata private @simple_mul_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C4602010100000000000000000003003E000100000030130000000000004000000000000000E00900000000000000000000400038000700400014001200060000000400000040000000000000004000000000000000400000000000000088010000000000008801000000000000080000000000000001000000040000000000000000000000000000000000000000000000000000002D030000000000002D030000000000000010000000000000010000000500000030030000000000003013000000000000301300000000000031000000000000003100000000000000001000000000000001000000060000007003000000000000702300000000000070230000000000003801000000000000380100000000000000100000000000000200000006000000E803000000000000E823000000000000E823000000000000C000000000000000C000000000000000080000000000000052E57464040000007003000000000000702300000000000070230000000000003801000000000000900C000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120006005013000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000782300000000000008000000000000001003000000000000882300000000000008000000000000003013000000000000902300000000000008000000000000001003000000000000982300000000000008000000000000002C03000000000000A02300000000000008000000000000007023000000000000C02300000000000008000000000000008823000000000000C82300000000000008000000000000002803000000000000D02300000000000008000000000000009023000000000000D8230000000000000800000000000000982300000000000073696D706C655F6D756C5F64697370617463685F300000000000000000000000554889E5488B4620488B08488B5008488B40100F28010F59020F290031C05DC331C083FF02488D0D44100000480F44C1C3000000000000000000000000000000020000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
0000000000000000000000000000000100000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000D80000000000000009000000000000001800000000000000F9FFFF6F0000000009000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E10171B0EB44219110112060000022E006E0E030E3F19200B0000032E0111011206401831130000041D00311311011206580B590B570B000000590000000400000000000801000000000200070000000000000005000000301300000000000020000000020700000007000000010330130000000000002000000001562A000000042A0000003F130000000000001100000002010100006D6C6972002F0073696D706C655F6D756C5F64697370617463685F3000280000000200000000005D0000002A00000073696D706C655F6D756C5F64697370617463685F3000000000000E0000000200000000005D0000000000000000000000000014000000FFFFFFFF040008000178100C0708900100000000240000000000000030130000000000002000000000000000410E108602430D065B0C070800000000140000000000000050130000000000001100000000000000140000000000000000000000000000006F0000000000000014000000000000000000000000000000D300000000000000140000000000000000000000000000006F0000000000000014000000000000000000000000000000D300000000000000140000000000000000000000000000000900000000000000140000000000000000000000000000002800000000000000140000000000000000000000000000001E00000000000000140000000000000000000000000000002800000000000000140000000000000000000000000000006F000000000000001400000000000000000000000000000003010000000000001400000000000000000000000000000003000000000000001400000000000000000000000000000003000000000000001400000000000000000000000000000003000000000000001400000000000000000000000000000003000000000000001400000000000000000000000000000003000000000000001400000000000000000000000000000003000000000000001400000000000000000000000000000003
00000000000000140000000000000000000000000000006F0000000000000014000000000000000000000000000000B8000000000000001400000000000000000000000000000052010000000000001400000000000000000000000000000042030000000000009E00000004007F000000010101FB0E0D0001010101000000010000012F7573722F6C6F63616C2F676F6F676C652F686F6D652F62656E76616E696B2F7372632F6972656500003C756E6B6E6F776E3E0001000072756E74696D652F7372632F697265652F72756E74696D652F74657374646174612F73696D706C655F6D756C2E6D6C69720001000000000902301300000000000011040205010A4B0508AD080001014952454500000000000000000000000000000000000000000000000000002300000000020800E8230000000000000000000000000000010000001200060050130000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E74657874002E646174612E72656C2E726F002E64796E616D6963002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6672616D65002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D49430000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F0000000300000002000000000000001002000000000000100200000000000023000000000000000000000000000000010000000000000000000000000000001700000004000000020000000000000038020000000000003802000000000000D80000000000000001000000000000000800000000000000180000000000000021000000010000000200000000000000100300000000000010030000000000001D0000000000000000000000000000000800000000000000000000000000000029000000010000000600000000000000301300000000000030
0300000000000031000000000000000000000000000000100000000000000000000000000000002F0000000100000003000000000000007023000000000000700300000000000078000000000000000000000000000000100000000000000000000000000000003C000000060000000300000000000000E823000000000000E803000000000000C000000000000000030000000000000008000000000000001000000000000000450000000100000000000000000000000000000000000000A8040000000000004200000000000000000000000000000001000000000000000000000000000000530000000100000000000000000000000000000000000000EA040000000000005D000000000000000000000000000000010000000000000000000000000000005F000000010000003000000000000000000000000000000047050000000000001D000000000000000000000000000000010000000000000001000000000000006A000000010000000000000000000000000000000000000064050000000000002C000000000000000000000000000000010000000000000000000000000000007A0000000100000000000000000000000000000000000000900500000000000012000000000000000000000000000000010000000000000000000000000000008A0000000100000000000000000000000000000000000000A8050000000000005002000000000000000000000000000008000000000000000000000000000000970000000100000000000000000000000000000000000000F807000000000000A200000000000000000000000000000001000000000000000000000000000000A300000001000000300000000000000000000000000000009A080000000000000500000000000000000000000000000001000000000000000100000000000000AC0000000200000000000000000000000000000000000000A0080000000000004800000000000000130000000200000008000000000000001800000000000000B40000000300000000000000000000000000000000000000E808000000000000C600000000000000000000000000000001000000000000000000000000000000BE0000000300000000000000000000000000000000000000AE090000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3808xi8> | |
vm.rodata private @_utf8_hal_executable_format_EAB228F999C2D3A1 {alignment = 1 : i64} dense<[104, 97, 108, 46, 101, 120, 101, 99, 117, 116, 97, 98, 108, 101, 46, 102, 111, 114, 109, 97, 116]> : vector<21xi8> | |
vm.rodata private @_utf8_embedded_elf_x86_64_9FD8733DA4A6F228 {alignment = 1 : i64} dense<[101, 109, 98, 101, 100, 100, 101, 100, 45, 101, 108, 102, 45, 120, 56, 54, 95, 54, 52]> : vector<19xi8> | |
vm.import @hal.ex.shared_device() -> !vm.ref<!hal.device> attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.ex.submit_and_wait(%device : !vm.ref<!hal.device>, %command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"} | |
vm.import @hal.allocator.allocate(%allocator : !vm.ref<!hal.allocator>, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer> attributes {sym_visibility = "private"} | |
vm.import @hal.allocator.map.byte_buffer(%allocator : !vm.ref<!hal.allocator>, %try : i32, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref<!hal.buffer> attributes {sym_visibility = "private"} | |
vm.import @hal.allocator.wrap.byte_buffer(%allocator : !vm.ref<!hal.allocator>, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref<!hal.buffer> attributes {sym_visibility = "private"} | |
vm.import @hal.buffer.assert(%buffer : !vm.ref<!hal.buffer>, %message : !vm.buffer, %allocator : !vm.ref<!hal.allocator>, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) attributes {sym_visibility = "private"} | |
vm.import @hal.buffer.subspan(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i64) -> !vm.ref<!hal.buffer> attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.buffer.length(%buffer : !vm.ref<!hal.buffer>) -> i64 attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.buffer.load(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i32) -> i32 attributes {sym_visibility = "private"} | |
vm.import @hal.buffer.store(%value : i32, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i32) attributes {sym_visibility = "private"} | |
vm.import @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.buffer_view.assert(%buffer_view : !vm.ref<!hal.buffer_view>, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) attributes {sym_visibility = "private"} | |
vm.import @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.buffer_view.byte_length(%buffer_view : !vm.ref<!hal.buffer_view>) -> i64 attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.buffer_view.element_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.buffer_view.encoding_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.buffer_view.rank(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.buffer_view.dim(%buffer_view : !vm.ref<!hal.buffer_view>, %index : i32) -> i64 attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.buffer_view.trace(%key : !vm.buffer, %operands : !vm.ref<!hal.buffer_view> ...) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32) -> !vm.ref<!hal.command_buffer> attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.begin(%command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.end(%command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.begin_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>, %label : !vm.buffer) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.end_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.fill_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %pattern : i32, %pattern_length : i32) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.copy_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.push_constants(%command_buffer : !vm.ref<!hal.command_buffer>, %executable_layout : !vm.ref<!hal.executable_layout>, %offset : i32, %values : i32 ...) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %executable_layout : !vm.ref<!hal.executable_layout>, %set : i32, %bindings : tuple<i32, !vm.ref<!hal.buffer>, i64, i64> ...) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.bind_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %executable_layout : !vm.ref<!hal.executable_layout>, %set : i32, %descriptor_set : !vm.ref<!hal.descriptor_set>, %dynamic_offsets : i64 ...) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.dispatch.indirect(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroups_buffer : !vm.ref<!hal.buffer>, %workgroups_offset : i64) attributes {sym_visibility = "private"} | |
vm.import @hal.descriptor_set.create(%device : !vm.ref<!hal.device>, %set_layout : !vm.ref<!hal.descriptor_set_layout>, %bindings : tuple<i32, !vm.ref<!hal.buffer>, i64, i64> ...) -> !vm.ref<!hal.descriptor_set> attributes {sym_visibility = "private"} | |
vm.import @hal.descriptor_set_layout.create(%device : !vm.ref<!hal.device>, %usage_type : i32, %bindings : tuple<i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout> attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.device.query.i32(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i32) attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.executable.create(%device : !vm.ref<!hal.device>, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %executable_layouts : !vm.ref<!hal.executable_layout> ...) -> !vm.ref<!hal.executable> attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.executable_layout.create(%device : !vm.ref<!hal.device>, %push_constants : i32, %set_layouts : !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.executable_layout> attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.semaphore.create(%device : !vm.ref<!hal.device>, %initial_value : i64) -> !vm.ref<!hal.semaphore> attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.semaphore.query(%semaphore : !vm.ref<!hal.semaphore>) -> (i32, i64) attributes {sym_visibility = "private"} | |
vm.import @hal.semaphore.signal(%semaphore : !vm.ref<!hal.semaphore>, %new_value : i64) attributes {sym_visibility = "private"} | |
vm.import @hal.semaphore.fail(%semaphore : !vm.ref<!hal.semaphore>, %status : i32) attributes {sym_visibility = "private"} | |
vm.import @hal.semaphore.await(%semaphore : !vm.ref<!hal.semaphore>, %min_value : i64) -> i32 attributes {sym_visibility = "private"} | |
vm.rodata private @_utf8_tensor_3C6209B4FD120BDC {alignment = 1 : i64} dense<[116, 101, 110, 115, 111, 114]> : vector<6xi8> | |
vm.func private @simple_mul(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer_view> { | |
%c13 = vm.const.i32 13 | |
%c28 = vm.const.i32 28 | |
%zero = vm.const.i64.zero | |
%c3 = vm.const.i32 3 | |
%c17 = vm.const.i32 17 | |
%c150998019 = vm.const.i32 150998019 | |
%c50 = vm.const.i32 50 | |
%c3075 = vm.const.i32 3075 | |
%c16 = vm.const.i64 16 | |
%c4 = vm.const.i64 4 | |
%c2 = vm.const.i32 2 | |
%zero_0 = vm.const.i32.zero | |
%c16_1 = vm.const.i32 16 | |
%c1 = vm.const.i32 1 | |
%c553648160 = vm.const.i32 553648160 | |
%_device_query_0 = vm.global.load.i32 @_device_query_0 : i32 | |
%_executable_layout_0 = vm.global.load.ref @_executable_layout_0 : !vm.ref<!hal.executable_layout> | |
%_executable_simple_mul_dispatch_0 = vm.global.load.ref @_executable_simple_mul_dispatch_0 : !vm.ref<!hal.executable> | |
%_utf8_tensor_3C6209B4FD120BDC = vm.const.ref.rodata @_utf8_tensor_3C6209B4FD120BDC : !vm.buffer | |
vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_tensor_3C6209B4FD120BDC, %c553648160, %c1, [%c4]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...) | |
%ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> | |
%ref_2 = vm.call @hal.ex.shared_device() {nosideeffects} : () -> !vm.ref<!hal.device> | |
%ref_3 = vm.call @hal.device.allocator(%ref_2) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> | |
%_utf8_tensor_3C6209B4FD120BDC_4 = vm.const.ref.rodata @_utf8_tensor_3C6209B4FD120BDC : !vm.buffer | |
vm.call @hal.buffer.assert(%ref, %_utf8_tensor_3C6209B4FD120BDC_4, %ref_3, %c16, %c16_1, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> () | |
%_utf8_tensor_3C6209B4FD120BDC_5 = vm.const.ref.rodata @_utf8_tensor_3C6209B4FD120BDC : !vm.buffer | |
vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_tensor_3C6209B4FD120BDC_5, %c553648160, %c1, [%c4]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...) | |
%ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> | |
%_utf8_tensor_3C6209B4FD120BDC_7 = vm.const.ref.rodata @_utf8_tensor_3C6209B4FD120BDC : !vm.buffer | |
vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_3C6209B4FD120BDC_7, %ref_3, %c16, %c16_1, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> () | |
%ref_8 = vm.call @hal.allocator.allocate(%ref_3, %c50, %c150998019, %c16) : (!vm.ref<!hal.allocator>, i32, i32, i64) -> !vm.ref<!hal.buffer> | |
%ref_9 = vm.call @hal.command_buffer.create(%ref_2, %c17, %c3) : (!vm.ref<!hal.device>, i32, i32) -> !vm.ref<!hal.command_buffer> | |
vm.call @hal.command_buffer.begin(%ref_9) : (!vm.ref<!hal.command_buffer>) -> () | |
vm.cond_br %_device_query_0, ^bb1, ^bb2 | |
^bb1: // pred: ^bb0 | |
vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %_executable_layout_0, %zero_0, [(%zero_0, %ref, %zero, %c16), (%c1, %ref_6, %zero, %c16), (%c2, %ref_8, %zero, %c16)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable_layout>, i32, tuple<i32, !vm.ref<!hal.buffer>, i64, i64> ...) | |
vm.call @hal.command_buffer.dispatch(%ref_9, %_executable_simple_mul_dispatch_0, %zero_0, %c1, %c1, %c1) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32) -> () | |
vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero_0) : (!vm.ref<!hal.command_buffer>, i32, i32, i32) -> () | |
vm.call @hal.command_buffer.end(%ref_9) : (!vm.ref<!hal.command_buffer>) -> () | |
vm.call @hal.ex.submit_and_wait(%ref_2, %ref_9) : (!vm.ref<!hal.device>, !vm.ref<!hal.command_buffer>) -> () | |
%ref_10 = vm.call.variadic @hal.buffer_view.create(%ref_8, %c553648160, %c1, [%c4]) {nosideeffects} : (!vm.ref<!hal.buffer>, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view> | |
vm.return %ref_10 : !vm.ref<!hal.buffer_view> | |
^bb2: // pred: ^bb0 | |
vm.fail %c2, "device not supported in the compiled configuration" | |
} | |
vm.export @simple_mul | |
vm.export @__init | |
vm.func private @__init() { | |
%null = vm.const.ref.zero : !vm.ref<!hal.executable> | |
%null_0 = vm.const.ref.zero : !vm.buffer | |
%c2 = vm.const.i32 2 | |
%c7 = vm.const.i32 7 | |
%zero = vm.const.i32.zero | |
%c1 = vm.const.i32 1 | |
%ref = vm.call @hal.ex.shared_device() {nosideeffects} : () -> !vm.ref<!hal.device> | |
%_utf8_hal_executable_format_EAB228F999C2D3A1 = vm.const.ref.rodata @_utf8_hal_executable_format_EAB228F999C2D3A1 : !vm.buffer | |
%_utf8_embedded_elf_x86_64_9FD8733DA4A6F228 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_9FD8733DA4A6F228 : !vm.buffer | |
%0:2 = vm.call @hal.device.query.i32(%ref, %_utf8_hal_executable_format_EAB228F999C2D3A1, %_utf8_embedded_elf_x86_64_9FD8733DA4A6F228) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i32) | |
%1 = vm.and.i32 %0#1, %c1 : i32 | |
%2 = vm.select.i32 %0#0, %1, %zero : i32 | |
%ref_1 = vm.call.variadic @hal.descriptor_set_layout.create(%ref, %c1, [(%zero, %c7), (%c1, %c7), (%c2, %c7)]) {nosideeffects} : (!vm.ref<!hal.device>, i32, tuple<i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout> | |
%ref_2 = vm.call.variadic @hal.executable_layout.create(%ref, %zero, [%ref_1]) {nosideeffects} : (!vm.ref<!hal.device>, i32, !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.executable_layout> | |
vm.global.store.i32 %2, @_device_query_0 : i32 | |
vm.global.store.ref %ref_2, @_executable_layout_0 : !vm.ref<!hal.executable_layout> | |
vm.cond_br %2, ^bb1, ^bb2 | |
^bb1: // pred: ^bb0 | |
%_executable_layout_0 = vm.global.load.ref @_executable_layout_0 : !vm.ref<!hal.executable_layout> | |
%_utf8_embedded_elf_x86_64_9FD8733DA4A6F228_3 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_9FD8733DA4A6F228 : !vm.buffer | |
%simple_mul_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @simple_mul_dispatch_0_embedded_elf_x86_64 : !vm.buffer | |
%ref_4 = vm.call.variadic @hal.executable.create(%ref, %_utf8_embedded_elf_x86_64_9FD8733DA4A6F228_3, %simple_mul_dispatch_0_embedded_elf_x86_64, %null_0, [%_executable_layout_0]) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref<!hal.executable_layout> ...) -> !vm.ref<!hal.executable> | |
vm.br ^bb3(%ref_4 : !vm.ref<!hal.executable>) | |
^bb2: // pred: ^bb0 | |
vm.br ^bb3(%null : !vm.ref<!hal.executable>) | |
^bb3(%3: !vm.ref<!hal.executable>): // 2 preds: ^bb1, ^bb2 | |
vm.global.store.ref %3, @_executable_simple_mul_dispatch_0 : !vm.ref<!hal.executable> | |
vm.return | |
} | |
vm.export @__deinit | |
vm.func private @__deinit() { | |
vm.return | |
} | |
} | |
} | |
// -----// IR Dump After Canonicalizer //----- // | |
#device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]}> | |
module attributes {hal.device.targets = [#device_target_cpu], vm.toplevel} { | |
vm.module public @module { | |
vm.global.i32 private mutable @_device_query_0 : i32 | |
vm.global.ref private mutable @_executable_layout_0 : !vm.ref<!hal.executable_layout> | |
vm.global.ref private mutable @_executable_simple_mul_dispatch_0 : !vm.ref<!hal.executable> | |
vm.rodata private @simple_mul_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C4602010100000000000000000003003E000100000030130000000000004000000000000000E00900000000000000000000400038000700400014001200060000000400000040000000000000004000000000000000400000000000000088010000000000008801000000000000080000000000000001000000040000000000000000000000000000000000000000000000000000002D030000000000002D030000000000000010000000000000010000000500000030030000000000003013000000000000301300000000000031000000000000003100000000000000001000000000000001000000060000007003000000000000702300000000000070230000000000003801000000000000380100000000000000100000000000000200000006000000E803000000000000E823000000000000E823000000000000C000000000000000C000000000000000080000000000000052E57464040000007003000000000000702300000000000070230000000000003801000000000000900C000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120006005013000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000782300000000000008000000000000001003000000000000882300000000000008000000000000003013000000000000902300000000000008000000000000001003000000000000982300000000000008000000000000002C03000000000000A02300000000000008000000000000007023000000000000C02300000000000008000000000000008823000000000000C82300000000000008000000000000002803000000000000D02300000000000008000000000000009023000000000000D8230000000000000800000000000000982300000000000073696D706C655F6D756C5F64697370617463685F300000000000000000000000554889E5488B4620488B08488B5008488B40100F28010F59020F290031C05DC331C083FF02488D0D44100000480F44C1C3000000000000000000000000000000020000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
0000000000000000000000000000000100000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000D80000000000000009000000000000001800000000000000F9FFFF6F0000000009000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E10171B0EB44219110112060000022E006E0E030E3F19200B0000032E0111011206401831130000041D00311311011206580B590B570B000000590000000400000000000801000000000200070000000000000005000000301300000000000020000000020700000007000000010330130000000000002000000001562A000000042A0000003F130000000000001100000002010100006D6C6972002F0073696D706C655F6D756C5F64697370617463685F3000280000000200000000005D0000002A00000073696D706C655F6D756C5F64697370617463685F3000000000000E0000000200000000005D0000000000000000000000000014000000FFFFFFFF040008000178100C0708900100000000240000000000000030130000000000002000000000000000410E108602430D065B0C070800000000140000000000000050130000000000001100000000000000140000000000000000000000000000006F0000000000000014000000000000000000000000000000D300000000000000140000000000000000000000000000006F0000000000000014000000000000000000000000000000D300000000000000140000000000000000000000000000000900000000000000140000000000000000000000000000002800000000000000140000000000000000000000000000001E00000000000000140000000000000000000000000000002800000000000000140000000000000000000000000000006F000000000000001400000000000000000000000000000003010000000000001400000000000000000000000000000003000000000000001400000000000000000000000000000003000000000000001400000000000000000000000000000003000000000000001400000000000000000000000000000003000000000000001400000000000000000000000000000003000000000000001400000000000000000000000000000003000000000000001400000000000000000000000000000003
00000000000000140000000000000000000000000000006F0000000000000014000000000000000000000000000000B8000000000000001400000000000000000000000000000052010000000000001400000000000000000000000000000042030000000000009E00000004007F000000010101FB0E0D0001010101000000010000012F7573722F6C6F63616C2F676F6F676C652F686F6D652F62656E76616E696B2F7372632F6972656500003C756E6B6E6F776E3E0001000072756E74696D652F7372632F697265652F72756E74696D652F74657374646174612F73696D706C655F6D756C2E6D6C69720001000000000902301300000000000011040205010A4B0508AD080001014952454500000000000000000000000000000000000000000000000000002300000000020800E8230000000000000000000000000000010000001200060050130000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E74657874002E646174612E72656C2E726F002E64796E616D6963002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6672616D65002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D49430000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F0000000300000002000000000000001002000000000000100200000000000023000000000000000000000000000000010000000000000000000000000000001700000004000000020000000000000038020000000000003802000000000000D80000000000000001000000000000000800000000000000180000000000000021000000010000000200000000000000100300000000000010030000000000001D0000000000000000000000000000000800000000000000000000000000000029000000010000000600000000000000301300000000000030
0300000000000031000000000000000000000000000000100000000000000000000000000000002F0000000100000003000000000000007023000000000000700300000000000078000000000000000000000000000000100000000000000000000000000000003C000000060000000300000000000000E823000000000000E803000000000000C000000000000000030000000000000008000000000000001000000000000000450000000100000000000000000000000000000000000000A8040000000000004200000000000000000000000000000001000000000000000000000000000000530000000100000000000000000000000000000000000000EA040000000000005D000000000000000000000000000000010000000000000000000000000000005F000000010000003000000000000000000000000000000047050000000000001D000000000000000000000000000000010000000000000001000000000000006A000000010000000000000000000000000000000000000064050000000000002C000000000000000000000000000000010000000000000000000000000000007A0000000100000000000000000000000000000000000000900500000000000012000000000000000000000000000000010000000000000000000000000000008A0000000100000000000000000000000000000000000000A8050000000000005002000000000000000000000000000008000000000000000000000000000000970000000100000000000000000000000000000000000000F807000000000000A200000000000000000000000000000001000000000000000000000000000000A300000001000000300000000000000000000000000000009A080000000000000500000000000000000000000000000001000000000000000100000000000000AC0000000200000000000000000000000000000000000000A0080000000000004800000000000000130000000200000008000000000000001800000000000000B40000000300000000000000000000000000000000000000E808000000000000C600000000000000000000000000000001000000000000000000000000000000BE0000000300000000000000000000000000000000000000AE090000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3808xi8> | |
vm.rodata private @_utf8_hal_executable_format_EAB228F999C2D3A1 {alignment = 1 : i64} dense<[104, 97, 108, 46, 101, 120, 101, 99, 117, 116, 97, 98, 108, 101, 46, 102, 111, 114, 109, 97, 116]> : vector<21xi8> | |
vm.rodata private @_utf8_embedded_elf_x86_64_9FD8733DA4A6F228 {alignment = 1 : i64} dense<[101, 109, 98, 101, 100, 100, 101, 100, 45, 101, 108, 102, 45, 120, 56, 54, 95, 54, 52]> : vector<19xi8> | |
vm.import @hal.ex.shared_device() -> !vm.ref<!hal.device> attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.ex.submit_and_wait(%device : !vm.ref<!hal.device>, %command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"} | |
vm.import @hal.allocator.allocate(%allocator : !vm.ref<!hal.allocator>, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer> attributes {sym_visibility = "private"} | |
vm.import @hal.allocator.map.byte_buffer(%allocator : !vm.ref<!hal.allocator>, %try : i32, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref<!hal.buffer> attributes {sym_visibility = "private"} | |
vm.import @hal.allocator.wrap.byte_buffer(%allocator : !vm.ref<!hal.allocator>, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref<!hal.buffer> attributes {sym_visibility = "private"} | |
vm.import @hal.buffer.assert(%buffer : !vm.ref<!hal.buffer>, %message : !vm.buffer, %allocator : !vm.ref<!hal.allocator>, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) attributes {sym_visibility = "private"} | |
vm.import @hal.buffer.subspan(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i64) -> !vm.ref<!hal.buffer> attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.buffer.length(%buffer : !vm.ref<!hal.buffer>) -> i64 attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.buffer.load(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i32) -> i32 attributes {sym_visibility = "private"} | |
vm.import @hal.buffer.store(%value : i32, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i32) attributes {sym_visibility = "private"} | |
vm.import @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.buffer_view.assert(%buffer_view : !vm.ref<!hal.buffer_view>, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) attributes {sym_visibility = "private"} | |
vm.import @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.buffer_view.byte_length(%buffer_view : !vm.ref<!hal.buffer_view>) -> i64 attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.buffer_view.element_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.buffer_view.encoding_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.buffer_view.rank(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.buffer_view.dim(%buffer_view : !vm.ref<!hal.buffer_view>, %index : i32) -> i64 attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.buffer_view.trace(%key : !vm.buffer, %operands : !vm.ref<!hal.buffer_view> ...) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32) -> !vm.ref<!hal.command_buffer> attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.begin(%command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.end(%command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.begin_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>, %label : !vm.buffer) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.end_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.fill_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %pattern : i32, %pattern_length : i32) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.copy_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.push_constants(%command_buffer : !vm.ref<!hal.command_buffer>, %executable_layout : !vm.ref<!hal.executable_layout>, %offset : i32, %values : i32 ...) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.push_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %executable_layout : !vm.ref<!hal.executable_layout>, %set : i32, %bindings : tuple<i32, !vm.ref<!hal.buffer>, i64, i64> ...) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.bind_descriptor_set(%command_buffer : !vm.ref<!hal.command_buffer>, %executable_layout : !vm.ref<!hal.executable_layout>, %set : i32, %descriptor_set : !vm.ref<!hal.descriptor_set>, %dynamic_offsets : i64 ...) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.dispatch(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroup_x : i32, %workgroup_y : i32, %workgroup_z : i32) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.dispatch.indirect(%command_buffer : !vm.ref<!hal.command_buffer>, %executable : !vm.ref<!hal.executable>, %entry_point : i32, %workgroups_buffer : !vm.ref<!hal.buffer>, %workgroups_offset : i64) attributes {sym_visibility = "private"} | |
vm.import @hal.descriptor_set.create(%device : !vm.ref<!hal.device>, %set_layout : !vm.ref<!hal.descriptor_set_layout>, %bindings : tuple<i32, !vm.ref<!hal.buffer>, i64, i64> ...) -> !vm.ref<!hal.descriptor_set> attributes {sym_visibility = "private"} | |
vm.import @hal.descriptor_set_layout.create(%device : !vm.ref<!hal.device>, %usage_type : i32, %bindings : tuple<i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout> attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.device.allocator(%device : !vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.device.query.i32(%device : !vm.ref<!hal.device>, %category : !vm.buffer, %key : !vm.buffer) -> (i32, i32) attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.executable.create(%device : !vm.ref<!hal.device>, %executable_format : !vm.buffer, %executable_data : !vm.buffer, %constants : !vm.buffer, %executable_layouts : !vm.ref<!hal.executable_layout> ...) -> !vm.ref<!hal.executable> attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.executable_layout.create(%device : !vm.ref<!hal.device>, %push_constants : i32, %set_layouts : !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.executable_layout> attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.semaphore.create(%device : !vm.ref<!hal.device>, %initial_value : i64) -> !vm.ref<!hal.semaphore> attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.semaphore.query(%semaphore : !vm.ref<!hal.semaphore>) -> (i32, i64) attributes {sym_visibility = "private"} | |
vm.import @hal.semaphore.signal(%semaphore : !vm.ref<!hal.semaphore>, %new_value : i64) attributes {sym_visibility = "private"} | |
vm.import @hal.semaphore.fail(%semaphore : !vm.ref<!hal.semaphore>, %status : i32) attributes {sym_visibility = "private"} | |
vm.import @hal.semaphore.await(%semaphore : !vm.ref<!hal.semaphore>, %min_value : i64) -> i32 attributes {sym_visibility = "private"} | |
vm.rodata private @_utf8_tensor_3C6209B4FD120BDC {alignment = 1 : i64} dense<[116, 101, 110, 115, 111, 114]> : vector<6xi8> | |
vm.func private @simple_mul(%arg0: !vm.ref<!hal.buffer_view>, %arg1: !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer_view> { | |
%c13 = vm.const.i32 13 | |
%c28 = vm.const.i32 28 | |
%zero = vm.const.i64.zero | |
%c3 = vm.const.i32 3 | |
%c17 = vm.const.i32 17 | |
%c150998019 = vm.const.i32 150998019 | |
%c50 = vm.const.i32 50 | |
%c3075 = vm.const.i32 3075 | |
%c16 = vm.const.i64 16 | |
%c4 = vm.const.i64 4 | |
%c2 = vm.const.i32 2 | |
%zero_0 = vm.const.i32.zero | |
%c16_1 = vm.const.i32 16 | |
%c1 = vm.const.i32 1 | |
%c553648160 = vm.const.i32 553648160 | |
%_device_query_0 = vm.global.load.i32 @_device_query_0 : i32 | |
%_executable_layout_0 = vm.global.load.ref @_executable_layout_0 : !vm.ref<!hal.executable_layout> | |
%_executable_simple_mul_dispatch_0 = vm.global.load.ref @_executable_simple_mul_dispatch_0 : !vm.ref<!hal.executable> | |
%_utf8_tensor_3C6209B4FD120BDC = vm.const.ref.rodata @_utf8_tensor_3C6209B4FD120BDC : !vm.buffer | |
vm.call.variadic @hal.buffer_view.assert(%arg0, %_utf8_tensor_3C6209B4FD120BDC, %c553648160, %c1, [%c4]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...) | |
%ref = vm.call @hal.buffer_view.buffer(%arg0) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> | |
%ref_2 = vm.call @hal.ex.shared_device() {nosideeffects} : () -> !vm.ref<!hal.device> | |
%ref_3 = vm.call @hal.device.allocator(%ref_2) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator> | |
%_utf8_tensor_3C6209B4FD120BDC_4 = vm.const.ref.rodata @_utf8_tensor_3C6209B4FD120BDC : !vm.buffer | |
vm.call @hal.buffer.assert(%ref, %_utf8_tensor_3C6209B4FD120BDC_4, %ref_3, %c16, %c16_1, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> () | |
%_utf8_tensor_3C6209B4FD120BDC_5 = vm.const.ref.rodata @_utf8_tensor_3C6209B4FD120BDC : !vm.buffer | |
vm.call.variadic @hal.buffer_view.assert(%arg1, %_utf8_tensor_3C6209B4FD120BDC_5, %c553648160, %c1, [%c4]) : (!vm.ref<!hal.buffer_view>, !vm.buffer, i32, i32, i64 ...) | |
%ref_6 = vm.call @hal.buffer_view.buffer(%arg1) {nosideeffects} : (!vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> | |
%_utf8_tensor_3C6209B4FD120BDC_7 = vm.const.ref.rodata @_utf8_tensor_3C6209B4FD120BDC : !vm.buffer | |
vm.call @hal.buffer.assert(%ref_6, %_utf8_tensor_3C6209B4FD120BDC_7, %ref_3, %c16, %c16_1, %c3075) : (!vm.ref<!hal.buffer>, !vm.buffer, !vm.ref<!hal.allocator>, i64, i32, i32) -> () | |
%ref_8 = vm.call @hal.allocator.allocate(%ref_3, %c50, %c150998019, %c16) : (!vm.ref<!hal.allocator>, i32, i32, i64) -> !vm.ref<!hal.buffer> | |
%ref_9 = vm.call @hal.command_buffer.create(%ref_2, %c17, %c3) : (!vm.ref<!hal.device>, i32, i32) -> !vm.ref<!hal.command_buffer> | |
vm.call @hal.command_buffer.begin(%ref_9) : (!vm.ref<!hal.command_buffer>) -> () | |
vm.cond_br %_device_query_0, ^bb1, ^bb2 | |
^bb1: // pred: ^bb0 | |
vm.call.variadic @hal.command_buffer.push_descriptor_set(%ref_9, %_executable_layout_0, %zero_0, [(%zero_0, %ref, %zero, %c16), (%c1, %ref_6, %zero, %c16), (%c2, %ref_8, %zero, %c16)]) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable_layout>, i32, tuple<i32, !vm.ref<!hal.buffer>, i64, i64> ...) | |
vm.call @hal.command_buffer.dispatch(%ref_9, %_executable_simple_mul_dispatch_0, %zero_0, %c1, %c1, %c1) : (!vm.ref<!hal.command_buffer>, !vm.ref<!hal.executable>, i32, i32, i32, i32) -> () | |
vm.call @hal.command_buffer.execution_barrier(%ref_9, %c28, %c13, %zero_0) : (!vm.ref<!hal.command_buffer>, i32, i32, i32) -> () | |
vm.call @hal.command_buffer.end(%ref_9) : (!vm.ref<!hal.command_buffer>) -> () | |
vm.call @hal.ex.submit_and_wait(%ref_2, %ref_9) : (!vm.ref<!hal.device>, !vm.ref<!hal.command_buffer>) -> () | |
%ref_10 = vm.call.variadic @hal.buffer_view.create(%ref_8, %c553648160, %c1, [%c4]) {nosideeffects} : (!vm.ref<!hal.buffer>, i32, i32, i64 ...) -> !vm.ref<!hal.buffer_view> | |
vm.return %ref_10 : !vm.ref<!hal.buffer_view> | |
^bb2: // pred: ^bb0 | |
vm.fail %c2, "device not supported in the compiled configuration" | |
} | |
vm.export @simple_mul | |
vm.export @__init | |
vm.func private @__init() { | |
%null = vm.const.ref.zero : !vm.ref<!hal.executable> | |
%null_0 = vm.const.ref.zero : !vm.buffer | |
%c2 = vm.const.i32 2 | |
%c7 = vm.const.i32 7 | |
%zero = vm.const.i32.zero | |
%c1 = vm.const.i32 1 | |
%ref = vm.call @hal.ex.shared_device() {nosideeffects} : () -> !vm.ref<!hal.device> | |
%_utf8_hal_executable_format_EAB228F999C2D3A1 = vm.const.ref.rodata @_utf8_hal_executable_format_EAB228F999C2D3A1 : !vm.buffer | |
%_utf8_embedded_elf_x86_64_9FD8733DA4A6F228 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_9FD8733DA4A6F228 : !vm.buffer | |
%0:2 = vm.call @hal.device.query.i32(%ref, %_utf8_hal_executable_format_EAB228F999C2D3A1, %_utf8_embedded_elf_x86_64_9FD8733DA4A6F228) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i32) | |
%1 = vm.and.i32 %0#1, %c1 : i32 | |
%2 = vm.select.i32 %0#0, %1, %zero : i32 | |
%ref_1 = vm.call.variadic @hal.descriptor_set_layout.create(%ref, %c1, [(%zero, %c7), (%c1, %c7), (%c2, %c7)]) {nosideeffects} : (!vm.ref<!hal.device>, i32, tuple<i32, i32> ...) -> !vm.ref<!hal.descriptor_set_layout> | |
%ref_2 = vm.call.variadic @hal.executable_layout.create(%ref, %zero, [%ref_1]) {nosideeffects} : (!vm.ref<!hal.device>, i32, !vm.ref<!hal.descriptor_set_layout> ...) -> !vm.ref<!hal.executable_layout> | |
vm.global.store.i32 %2, @_device_query_0 : i32 | |
vm.global.store.ref %ref_2, @_executable_layout_0 : !vm.ref<!hal.executable_layout> | |
vm.cond_br %2, ^bb1, ^bb2 | |
^bb1: // pred: ^bb0 | |
%_executable_layout_0 = vm.global.load.ref @_executable_layout_0 : !vm.ref<!hal.executable_layout> | |
%_utf8_embedded_elf_x86_64_9FD8733DA4A6F228_3 = vm.const.ref.rodata @_utf8_embedded_elf_x86_64_9FD8733DA4A6F228 : !vm.buffer | |
%simple_mul_dispatch_0_embedded_elf_x86_64 = vm.const.ref.rodata @simple_mul_dispatch_0_embedded_elf_x86_64 : !vm.buffer | |
%ref_4 = vm.call.variadic @hal.executable.create(%ref, %_utf8_embedded_elf_x86_64_9FD8733DA4A6F228_3, %simple_mul_dispatch_0_embedded_elf_x86_64, %null_0, [%_executable_layout_0]) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer, !vm.buffer, !vm.ref<!hal.executable_layout> ...) -> !vm.ref<!hal.executable> | |
vm.br ^bb3(%ref_4 : !vm.ref<!hal.executable>) | |
^bb2: // pred: ^bb0 | |
vm.br ^bb3(%null : !vm.ref<!hal.executable>) | |
^bb3(%3: !vm.ref<!hal.executable>): // 2 preds: ^bb1, ^bb2 | |
vm.global.store.ref %3, @_executable_simple_mul_dispatch_0 : !vm.ref<!hal.executable> | |
vm.return | |
} | |
vm.export @__deinit | |
vm.func private @__deinit() { | |
vm.return | |
} | |
} | |
} | |
// -----// IR Dump After CSE //----- // | |
#device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]}> | |
module attributes {hal.device.targets = [#device_target_cpu], vm.toplevel} { | |
vm.module public @module { | |
vm.global.i32 private mutable @_device_query_0 : i32 | |
vm.global.ref private mutable @_executable_layout_0 : !vm.ref<!hal.executable_layout> | |
vm.global.ref private mutable @_executable_simple_mul_dispatch_0 : !vm.ref<!hal.executable> | |
vm.rodata private @simple_mul_dispatch_0_embedded_elf_x86_64 {alignment = 16 : i64, mime_type = "application/x-elf"} dense<"0x7F454C4602010100000000000000000003003E000100000030130000000000004000000000000000E00900000000000000000000400038000700400014001200060000000400000040000000000000004000000000000000400000000000000088010000000000008801000000000000080000000000000001000000040000000000000000000000000000000000000000000000000000002D030000000000002D030000000000000010000000000000010000000500000030030000000000003013000000000000301300000000000031000000000000003100000000000000001000000000000001000000060000007003000000000000702300000000000070230000000000003801000000000000380100000000000000100000000000000200000006000000E803000000000000E823000000000000E823000000000000C000000000000000C000000000000000080000000000000052E57464040000007003000000000000702300000000000070230000000000003801000000000000900C000000000000010000000000000051E574640600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000120006005013000000000000110000000000000002000000020000000000000001000000000000000000000000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279000000000000782300000000000008000000000000001003000000000000882300000000000008000000000000003013000000000000902300000000000008000000000000001003000000000000982300000000000008000000000000002C03000000000000A02300000000000008000000000000007023000000000000C02300000000000008000000000000008823000000000000C82300000000000008000000000000002803000000000000D02300000000000008000000000000009023000000000000D8230000000000000800000000000000982300000000000073696D706C655F6D756C5F64697370617463685F300000000000000000000000554889E5488B4620488B08488B5008488B40100F28010F59020F290031C05DC331C083FF02488D0D44100000480F44C1C3000000000000000000000000000000020000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
0000000000000000000000000000000100000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001E000000000000000800000000000000FBFFFF6F000000000100000000000000070000000000000038020000000000000800000000000000D80000000000000009000000000000001800000000000000F9FFFF6F0000000009000000000000000600000000000000C8010000000000000B000000000000001800000000000000050000000000000010020000000000000A0000000000000023000000000000000400000000000000F80100000000000000000000000000000000000000000000011101250E1305030E10171B0EB44219110112060000022E006E0E030E3F19200B0000032E0111011206401831130000041D00311311011206580B590B570B000000590000000400000000000801000000000200070000000000000005000000301300000000000020000000020700000007000000010330130000000000002000000001562A000000042A0000003F130000000000001100000002010100006D6C6972002F0073696D706C655F6D756C5F64697370617463685F3000280000000200000000005D0000002A00000073696D706C655F6D756C5F64697370617463685F3000000000000E0000000200000000005D0000000000000000000000000014000000FFFFFFFF040008000178100C0708900100000000240000000000000030130000000000002000000000000000410E108602430D065B0C070800000000140000000000000050130000000000001100000000000000140000000000000000000000000000006F0000000000000014000000000000000000000000000000D300000000000000140000000000000000000000000000006F0000000000000014000000000000000000000000000000D300000000000000140000000000000000000000000000000900000000000000140000000000000000000000000000002800000000000000140000000000000000000000000000001E00000000000000140000000000000000000000000000002800000000000000140000000000000000000000000000006F000000000000001400000000000000000000000000000003010000000000001400000000000000000000000000000003000000000000001400000000000000000000000000000003000000000000001400000000000000000000000000000003000000000000001400000000000000000000000000000003000000000000001400000000000000000000000000000003000000000000001400000000000000000000000000000003000000000000001400000000000000000000000000000003
00000000000000140000000000000000000000000000006F0000000000000014000000000000000000000000000000B8000000000000001400000000000000000000000000000052010000000000001400000000000000000000000000000042030000000000009E00000004007F000000010101FB0E0D0001010101000000010000012F7573722F6C6F63616C2F676F6F676C652F686F6D652F62656E76616E696B2F7372632F6972656500003C756E6B6E6F776E3E0001000072756E74696D652F7372632F697265652F72756E74696D652F74657374646174612F73696D706C655F6D756C2E6D6C69720001000000000902301300000000000011040205010A4B0508AD080001014952454500000000000000000000000000000000000000000000000000002300000000020800E8230000000000000000000000000000010000001200060050130000000000001100000000000000002E64796E73796D002E68617368002E64796E737472002E72656C612E64796E002E726F64617461002E74657874002E646174612E72656C2E726F002E64796E616D6963002E64656275675F616262726576002E64656275675F696E666F002E64656275675F737472002E64656275675F7075626E616D6573002E64656275675F7075627479706573002E64656275675F6672616D65002E64656275675F6C696E65002E636F6D6D656E74002E73796D746162002E7368737472746162002E7374727461620000697265655F68616C5F65786563757461626C655F6C6962726172795F7175657279005F44594E414D49430000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000B0000000200000000000000C801000000000000C801000000000000300000000000000003000000010000000800000000000000180000000000000009000000050000000200000000000000F801000000000000F80100000000000018000000000000000100000000000000040000000000000004000000000000000F0000000300000002000000000000001002000000000000100200000000000023000000000000000000000000000000010000000000000000000000000000001700000004000000020000000000000038020000000000003802000000000000D80000000000000001000000000000000800000000000000180000000000000021000000010000000200000000000000100300000000000010030000000000001D0000000000000000000000000000000800000000000000000000000000000029000000010000000600000000000000301300000000000030
0300000000000031000000000000000000000000000000100000000000000000000000000000002F0000000100000003000000000000007023000000000000700300000000000078000000000000000000000000000000100000000000000000000000000000003C000000060000000300000000000000E823000000000000E803000000000000C000000000000000030000000000000008000000000000001000000000000000450000000100000000000000000000000000000000000000A8040000000000004200000000000000000000000000000001000000000000000000000000000000530000000100000000000000000000000000000000000000EA040000000000005D000000000000000000000000000000010000000000000000000000000000005F000000010000003000000000000000000000000000000047050000000000001D000000000000000000000000000000010000000000000001000000000000006A000000010000000000000000000000000000000000000064050000000000002C000000000000000000000000000000010000000000000000000000000000007A0000000100000000000000000000000000000000000000900500000000000012000000000000000000000000000000010000000000000000000000000000008A0000000100000000000000000000000000000000000000A8050000000000005002000000000000000000000000000008000000000000000000000000000000970000000100000000000000000000000000000000000000F807000000000000A200000000000000000000000000000001000000000000000000000000000000A300000001000000300000000000000000000000000000009A080000000000000500000000000000000000000000000001000000000000000100000000000000AC0000000200000000000000000000000000000000000000A0080000000000004800000000000000130000000200000008000000000000001800000000000000B40000000300000000000000000000000000000000000000E808000000000000C600000000000000000000000000000001000000000000000000000000000000BE0000000300000000000000000000000000000000000000AE090000000000002C00000000000000000000000000000001000000000000000000000000000000"> : vector<3808xi8> | |
vm.rodata private @_utf8_hal_executable_format_EAB228F999C2D3A1 {alignment = 1 : i64} dense<[104, 97, 108, 46, 101, 120, 101, 99, 117, 116, 97, 98, 108, 101, 46, 102, 111, 114, 109, 97, 116]> : vector<21xi8> | |
vm.rodata private @_utf8_embedded_elf_x86_64_9FD8733DA4A6F228 {alignment = 1 : i64} dense<[101, 109, 98, 101, 100, 100, 101, 100, 45, 101, 108, 102, 45, 120, 56, 54, 95, 54, 52]> : vector<19xi8> | |
vm.import @hal.ex.shared_device() -> !vm.ref<!hal.device> attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.ex.submit_and_wait(%device : !vm.ref<!hal.device>, %command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"} | |
vm.import @hal.allocator.allocate(%allocator : !vm.ref<!hal.allocator>, %memory_types : i32, %buffer_usage : i32, %allocation_size : i64) -> !vm.ref<!hal.buffer> attributes {sym_visibility = "private"} | |
vm.import @hal.allocator.map.byte_buffer(%allocator : !vm.ref<!hal.allocator>, %try : i32, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref<!hal.buffer> attributes {sym_visibility = "private"} | |
vm.import @hal.allocator.wrap.byte_buffer(%allocator : !vm.ref<!hal.allocator>, %memory_types : i32, %buffer_usage : i32, %source : !vm.buffer, %offset : i64, %length : i64) -> !vm.ref<!hal.buffer> attributes {sym_visibility = "private"} | |
vm.import @hal.buffer.assert(%buffer : !vm.ref<!hal.buffer>, %message : !vm.buffer, %allocator : !vm.ref<!hal.allocator>, %minimum_length : i64, %memory_types : i32, %buffer_usage : i32) attributes {sym_visibility = "private"} | |
vm.import @hal.buffer.subspan(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i64) -> !vm.ref<!hal.buffer> attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.buffer.length(%buffer : !vm.ref<!hal.buffer>) -> i64 attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.buffer.load(%source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %length : i32) -> i32 attributes {sym_visibility = "private"} | |
vm.import @hal.buffer.store(%value : i32, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i32) attributes {sym_visibility = "private"} | |
vm.import @hal.buffer_view.create(%buffer : !vm.ref<!hal.buffer>, %element_type : i32, %encoding_type : i32, %shape : i64 ...) -> !vm.ref<!hal.buffer_view> attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.buffer_view.assert(%buffer_view : !vm.ref<!hal.buffer_view>, %message : !vm.buffer, %element_type : i32, %encoding_type : i32, %shape : i64 ...) attributes {sym_visibility = "private"} | |
vm.import @hal.buffer_view.buffer(%buffer_view : !vm.ref<!hal.buffer_view>) -> !vm.ref<!hal.buffer> attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.buffer_view.byte_length(%buffer_view : !vm.ref<!hal.buffer_view>) -> i64 attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.buffer_view.element_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.buffer_view.encoding_type(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.buffer_view.rank(%buffer_view : !vm.ref<!hal.buffer_view>) -> i32 attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.buffer_view.dim(%buffer_view : !vm.ref<!hal.buffer_view>, %index : i32) -> i64 attributes {nosideeffects, sym_visibility = "private"} | |
vm.import @hal.buffer_view.trace(%key : !vm.buffer, %operands : !vm.ref<!hal.buffer_view> ...) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.create(%device : !vm.ref<!hal.device>, %modes : i32, %command_categories : i32) -> !vm.ref<!hal.command_buffer> attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.begin(%command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.end(%command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.begin_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>, %label : !vm.buffer) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.end_debug_group(%command_buffer : !vm.ref<!hal.command_buffer>) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.execution_barrier(%command_buffer : !vm.ref<!hal.command_buffer>, %source_stage_mask : i32, %target_stage_mask : i32, %flags : i32) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.fill_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64, %pattern : i32, %pattern_length : i32) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.copy_buffer(%command_buffer : !vm.ref<!hal.command_buffer>, %source_buffer : !vm.ref<!hal.buffer>, %source_offset : i64, %target_buffer : !vm.ref<!hal.buffer>, %target_offset : i64, %length : i64) attributes {sym_visibility = "private"} | |
vm.import @hal.command_buffer.push_constants(%command_buffer : !vm.ref<!hal.command_buffer>, %executable_layout : !vm.ref<!hal.executa |
View raw
(Sorry about that, but we can’t show files that are this big right now.)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment