AmosLewis · November 20, 2024 03:43
diff --git a/error_after_fix_dynamic_end.mlir b/error_after_fix_dynamic_end.mlir
 iree-compile --iree-hal-target-backends=llvm-cpu model.linalg.mlir -o model.vmfb --dump-compilation-phases-to=./tmp/
 failed to translate executables
 failed to translate executables
 model.linalg.mlir:21:10: error: 'memref.alloca' op expected no unbounded stack allocations
    %1 = tensor.empty(%dim) : tensor<?xi64>
         ^
 model.linalg.mlir:10:3: note: called from
  func.func @main_graph(%arg0: tensor<?xi1>) -> tensor<1x1xi64> {
  ^
 model.linalg.mlir:21:10: note: see current operation: %14 = "memref.alloca"(%11) <{alignment = 64 : i64, operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<?xi64>
    %1 = tensor.empty(%dim) : tensor<?xi64>
         ^
 model.linalg.mlir:32:12: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
    %7:2 = tm_tensor.scan dimension(0) inclusive(true) ins(%2 : tensor<?xi64>) outs(%4, %6 : tensor<?xi64>, tensor<i64>) {
           ^
 model.linalg.mlir:10:3: note: called from
  func.func @main_graph(%arg0: tensor<?xi1>) -> tensor<1x1xi64> {
  ^
 model.linalg.mlir:32:12: note: see current operation: 
 "hal.executable.variant"() ({
  "hal.executable.export"() ({
  ^bb0(%arg3: !hal.device, %arg4: index):
    %24 = "arith.constant"() <{value = 1 : index}> : () -> index
    %25 = "arith.constant"() <{value = 1 : index}> : () -> index
    %26 = "arith.constant"() <{value = 1 : index}> : () -> index
    "hal.return"(%24, %25, %26) : (index, index, index) -> ()
  }) {layout = #hal.pipeline.layout<constants = 2, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index, sym_name = "main_graph_dispatch_0_scan_Dxi64_dispatch_tensor_store", workgroup_size = [1 : index, 1 : index, 1 : index]} : () -> ()
  "builtin.module"() ({
    "func.func"() <{function_type = () -> (), sym_name = "main_graph_dispatch_0_scan_Dxi64_dispatch_tensor_store"}> ({
      %0 = "arith.constant"() <{value = 1 : index}> : () -> index
      %1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
      %2 = "arith.constant"() <{value = 32 : i64}> : () -> i64
      %3 = "arith.constant"() <{value = 0 : index}> : () -> index
      %4 = "arith.constant"() <{value = 64 : index}> : () -> index
      %5 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 2, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
      %6 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 2, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
      %7 = "arith.extui"(%5) : (i32) -> i64
      %8 = "arith.extui"(%6) : (i32) -> i64
      %9 = "arith.shli"(%8, %2) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %10 = "arith.ori"(%7, %9) : (i64, i64) -> i64
      %11 = "arith.index_castui"(%10) : (i64) -> index
      %12 = "hal.interface.binding.subspan"(%3, %11) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 2, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?xi8>
      "memref.assume_alignment"(%12) <{alignment = 64 : i32}> : (memref<?xi8>) -> ()
      %13 = "hal.interface.binding.subspan"(%4, %11) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 2, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?xi64, strided<[1], offset: 8>>
      "memref.assume_alignment"(%13) <{alignment = 64 : i32}> : (memref<?xi64, strided<[1], offset: 8>>) -> ()
      %14 = "memref.alloca"(%11) <{alignment = 64 : i64, operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<?xi64>
      "scf.for"(%3, %11, %0) ({
      ^bb0(%arg2: index):
        %21 = "memref.load"(%12, %arg2) <{nontemporal = false}> : (memref<?xi8>, index) -> i8
        %22 = "arith.trunci"(%21) : (i8) -> i1
        %23 = "arith.extui"(%22) : (i1) -> i64
        "memref.store"(%23, %14, %arg2) <{nontemporal = false}> : (i64, memref<?xi64>, index) -> ()
        "scf.yield"() : () -> ()
      }) : (index, index, index) -> ()
      "scf.for"(%3, %11, %0) ({
      ^bb0(%arg1: index):
        "memref.store"(%1, %13, %arg1) <{nontemporal = false}> : (i64, memref<?xi64, strided<[1], offset: 8>>, index) -> ()
        "scf.yield"() : () -> ()
      }) : (index, index, index) -> ()
      "scf.for"(%3, %11, %0) ({
      ^bb0(%arg0: index):
        %15 = "arith.cmpi"(%arg0, %3) <{predicate = 0 : i64}> : (index, index) -> i1
        "scf.if"(%15) ({
          %20 = "memref.load"(%14, %arg0) <{nontemporal = false}> : (memref<?xi64>, index) -> i64
          "memref.store"(%20, %13, %arg0) <{nontemporal = false}> : (i64, memref<?xi64, strided<[1], offset: 8>>, index) -> ()
          "scf.yield"() : () -> ()
        }, {
          %16 = "arith.subi"(%arg0, %0) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
          %17 = "memref.load"(%13, %16) <{nontemporal = false}> : (memref<?xi64, strided<[1], offset: 8>>, index) -> i64
          %18 = "memref.load"(%14, %arg0) <{nontemporal = false}> : (memref<?xi64>, index) -> i64
          %19 = "arith.addi"(%17, %18) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
          "memref.store"(%19, %13, %arg0) <{nontemporal = false}> : (i64, memref<?xi64, strided<[1], offset: 8>>, index) -> ()
          "scf.yield"() : () -> ()
        }) : (i1) -> ()
        "scf.yield"() : () -> ()
      }) : (index, index, index) -> ()
      "func.return"() : () -> ()
    }) : () -> ()
  }) : () -> ()
  "hal.executable.variant_end"() : () -> ()
 }) {sym_name = "embedded_elf_x86_64", target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>} : () -> ()
    %7:2 = tm_tensor.scan dimension(0) inclusive(true) ins(%2 : tensor<?xi64>) outs(%4, %6 : tensor<?xi64>, tensor<i64>) {
           ^
 model.linalg.mlir:21:10: error: 'memref.alloca' op expected no unbounded stack allocations
    %1 = tensor.empty(%dim) : tensor<?xi64>
         ^
 model.linalg.mlir:10:3: note: called from
  func.func @main_graph(%arg0: tensor<?xi1>) -> tensor<1x1xi64> {
  ^
 model.linalg.mlir:21:10: note: see current operation: %21 = "memref.alloca"(%18) <{alignment = 64 : i64, operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<?xi64>
    %1 = tensor.empty(%dim) : tensor<?xi64>
         ^
 model.linalg.mlir:85:11: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
    %23 = tm_tensor.scatter {dimension_map = array<i64: 0>} unique_indices(false) ins(%21, %inserted_slice : tensor<?xi64>, tensor<?x1xi32>) outs(%17 : tensor<?xi64>) {
          ^
 model.linalg.mlir:10:3: note: called from
  func.func @main_graph(%arg0: tensor<?xi1>) -> tensor<1x1xi64> {
  ^
 model.linalg.mlir:85:11: note: see current operation: 
 "hal.executable.variant"() ({
  "hal.executable.export"() ({
  ^bb0(%arg2: !hal.device, %arg3: index):
    %25 = "arith.constant"() <{value = 1 : index}> : () -> index
    %26 = "arith.constant"() <{value = 1 : index}> : () -> index
    %27 = "arith.constant"() <{value = 1 : index}> : () -> index
    "hal.return"(%25, %26, %27) : (index, index, index) -> ()
  }) {layout = #hal.pipeline.layout<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index, sym_name = "main_graph_dispatch_5_scatter_Dxi64_dispatch_tensor_store", workgroup_size = [1 : index, 1 : index, 1 : index]} : () -> ()
  "builtin.module"() ({
    "func.func"() <{function_type = () -> (), sym_name = "main_graph_dispatch_5_scatter_Dxi64_dispatch_tensor_store"}> ({
      %0 = "arith.constant"() <{value = 1 : index}> : () -> index
      %1 = "arith.constant"() <{value = 0 : index}> : () -> index
      %2 = "arith.constant"() <{value = 32 : i64}> : () -> i64
      %3 = "arith.constant"() <{value = 0 : i64}> : () -> i64
      %4 = "arith.constant"() <{value = 64 : index}> : () -> index
      %5 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
      %6 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
      %7 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
      %8 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
      %9 = "arith.extui"(%5) : (i32) -> i64
      %10 = "arith.extui"(%6) : (i32) -> i64
      %11 = "arith.shli"(%10, %2) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %12 = "arith.ori"(%9, %11) : (i64, i64) -> i64
      %13 = "arith.index_castui"(%12) : (i64) -> index
      %14 = "arith.extui"(%7) : (i32) -> i64
      %15 = "arith.extui"(%8) : (i32) -> i64
      %16 = "arith.shli"(%15, %2) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %17 = "arith.ori"(%14, %16) : (i64, i64) -> i64
      %18 = "arith.index_castui"(%17) : (i64) -> index
      %19 = "hal.interface.binding.subspan"(%13, %18) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x1xi32, strided<[1, 1], offset: ?>>
      "memref.assume_alignment"(%19) <{alignment = 1 : i32}> : (memref<?x1xi32, strided<[1, 1], offset: ?>>) -> ()
      %20 = "hal.interface.binding.subspan"(%4, %18) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?xi64, strided<[1], offset: 8>>
      "memref.assume_alignment"(%20) <{alignment = 64 : i32}> : (memref<?xi64, strided<[1], offset: 8>>) -> ()
      %21 = "memref.alloca"(%18) <{alignment = 64 : i64, operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<?xi64>
      "scf.for"(%1, %18, %0) ({
      ^bb0(%arg1: index):
        "memref.store"(%3, %21, %arg1) <{nontemporal = false}> : (i64, memref<?xi64>, index) -> ()
        "scf.yield"() : () -> ()
      }) : (index, index, index) -> ()
      "scf.for"(%1, %18, %0) ({
      ^bb0(%arg0: index):
        %22 = "memref.load"(%21, %arg0) <{nontemporal = false}> : (memref<?xi64>, index) -> i64
        %23 = "memref.load"(%19, %arg0, %1) <{nontemporal = false}> : (memref<?x1xi32, strided<[1, 1], offset: ?>>, index, index) -> i32
        %24 = "arith.index_cast"(%23) : (i32) -> index
        "memref.store"(%22, %20, %24) <{nontemporal = false}> : (i64, memref<?xi64, strided<[1], offset: 8>>, index) -> ()
        "scf.yield"() : () -> ()
      }) : (index, index, index) -> ()
      "func.return"() : () -> ()
    }) : () -> ()
  }) : () -> ()
  "hal.executable.variant_end"() : () -> ()
 }) {sym_name = "embedded_elf_x86_64", target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>} : () -> ()
    %23 = tm_tensor.scatter {dimension_map = array<i64: 0>} unique_indices(false) ins(%21, %inserted_slice : tensor<?xi64>, tensor<?x1xi32>) outs(%17 : tensor<?xi64>) {
	iree-compile --iree-hal-target-backends=llvm-cpu model.linalg.mlir -o model.vmfb --dump-compilation-phases-to=./tmp/
	failed to translate executables
	failed to translate executables
	model.linalg.mlir:21:10: error: 'memref.alloca' op expected no unbounded stack allocations
	%1 = tensor.empty(%dim) : tensor<?xi64>
	^
	model.linalg.mlir:10:3: note: called from
	func.func @main_graph(%arg0: tensor<?xi1>) -> tensor<1x1xi64> {
	^
	model.linalg.mlir:21:10: note: see current operation: %14 = "memref.alloca"(%11) <{alignment = 64 : i64, operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<?xi64>
	%1 = tensor.empty(%dim) : tensor<?xi64>
	^
	model.linalg.mlir:32:12: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
	%7:2 = tm_tensor.scan dimension(0) inclusive(true) ins(%2 : tensor<?xi64>) outs(%4, %6 : tensor<?xi64>, tensor<i64>) {
	^
	model.linalg.mlir:10:3: note: called from
	func.func @main_graph(%arg0: tensor<?xi1>) -> tensor<1x1xi64> {
	^
	model.linalg.mlir:32:12: note: see current operation:
	"hal.executable.variant"() ({
	"hal.executable.export"() ({
	^bb0(%arg3: !hal.device, %arg4: index):
	%24 = "arith.constant"() <{value = 1 : index}> : () -> index
	%25 = "arith.constant"() <{value = 1 : index}> : () -> index
	%26 = "arith.constant"() <{value = 1 : index}> : () -> index
	"hal.return"(%24, %25, %26) : (index, index, index) -> ()
	}) {layout = #hal.pipeline.layout<constants = 2, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index, sym_name = "main_graph_dispatch_0_scan_Dxi64_dispatch_tensor_store", workgroup_size = [1 : index, 1 : index, 1 : index]} : () -> ()
	"builtin.module"() ({
	"func.func"() <{function_type = () -> (), sym_name = "main_graph_dispatch_0_scan_Dxi64_dispatch_tensor_store"}> ({
	%0 = "arith.constant"() <{value = 1 : index}> : () -> index
	%1 = "arith.constant"() <{value = 0 : i64}> : () -> i64
	%2 = "arith.constant"() <{value = 32 : i64}> : () -> i64
	%3 = "arith.constant"() <{value = 0 : index}> : () -> index
	%4 = "arith.constant"() <{value = 64 : index}> : () -> index
	%5 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 2, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
	%6 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 2, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
	%7 = "arith.extui"(%5) : (i32) -> i64
	%8 = "arith.extui"(%6) : (i32) -> i64
	%9 = "arith.shli"(%8, %2) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
	%10 = "arith.ori"(%7, %9) : (i64, i64) -> i64
	%11 = "arith.index_castui"(%10) : (i64) -> index
	%12 = "hal.interface.binding.subspan"(%3, %11) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 2, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?xi8>
	"memref.assume_alignment"(%12) <{alignment = 64 : i32}> : (memref<?xi8>) -> ()
	%13 = "hal.interface.binding.subspan"(%4, %11) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 2, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?xi64, strided<[1], offset: 8>>
	"memref.assume_alignment"(%13) <{alignment = 64 : i32}> : (memref<?xi64, strided<[1], offset: 8>>) -> ()
	%14 = "memref.alloca"(%11) <{alignment = 64 : i64, operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<?xi64>
	"scf.for"(%3, %11, %0) ({
	^bb0(%arg2: index):
	%21 = "memref.load"(%12, %arg2) <{nontemporal = false}> : (memref<?xi8>, index) -> i8
	%22 = "arith.trunci"(%21) : (i8) -> i1
	%23 = "arith.extui"(%22) : (i1) -> i64
	"memref.store"(%23, %14, %arg2) <{nontemporal = false}> : (i64, memref<?xi64>, index) -> ()
	"scf.yield"() : () -> ()
	}) : (index, index, index) -> ()
	"scf.for"(%3, %11, %0) ({
	^bb0(%arg1: index):
	"memref.store"(%1, %13, %arg1) <{nontemporal = false}> : (i64, memref<?xi64, strided<[1], offset: 8>>, index) -> ()
	"scf.yield"() : () -> ()
	}) : (index, index, index) -> ()
	"scf.for"(%3, %11, %0) ({
	^bb0(%arg0: index):
	%15 = "arith.cmpi"(%arg0, %3) <{predicate = 0 : i64}> : (index, index) -> i1
	"scf.if"(%15) ({
	%20 = "memref.load"(%14, %arg0) <{nontemporal = false}> : (memref<?xi64>, index) -> i64
	"memref.store"(%20, %13, %arg0) <{nontemporal = false}> : (i64, memref<?xi64, strided<[1], offset: 8>>, index) -> ()
	"scf.yield"() : () -> ()
	}, {
	%16 = "arith.subi"(%arg0, %0) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
	%17 = "memref.load"(%13, %16) <{nontemporal = false}> : (memref<?xi64, strided<[1], offset: 8>>, index) -> i64
	%18 = "memref.load"(%14, %arg0) <{nontemporal = false}> : (memref<?xi64>, index) -> i64
	%19 = "arith.addi"(%17, %18) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
	"memref.store"(%19, %13, %arg0) <{nontemporal = false}> : (i64, memref<?xi64, strided<[1], offset: 8>>, index) -> ()
	"scf.yield"() : () -> ()
	}) : (i1) -> ()
	"scf.yield"() : () -> ()
	}) : (index, index, index) -> ()
	"func.return"() : () -> ()
	}) : () -> ()
	}) : () -> ()
	"hal.executable.variant_end"() : () -> ()
	}) {sym_name = "embedded_elf_x86_64", target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>} : () -> ()
	%7:2 = tm_tensor.scan dimension(0) inclusive(true) ins(%2 : tensor<?xi64>) outs(%4, %6 : tensor<?xi64>, tensor<i64>) {
	^
	model.linalg.mlir:21:10: error: 'memref.alloca' op expected no unbounded stack allocations
	%1 = tensor.empty(%dim) : tensor<?xi64>
	^
	model.linalg.mlir:10:3: note: called from
	func.func @main_graph(%arg0: tensor<?xi1>) -> tensor<1x1xi64> {
	^
	model.linalg.mlir:21:10: note: see current operation: %21 = "memref.alloca"(%18) <{alignment = 64 : i64, operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<?xi64>
	%1 = tensor.empty(%dim) : tensor<?xi64>
	^
	model.linalg.mlir:85:11: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
	%23 = tm_tensor.scatter {dimension_map = array<i64: 0>} unique_indices(false) ins(%21, %inserted_slice : tensor<?xi64>, tensor<?x1xi32>) outs(%17 : tensor<?xi64>) {
	^
	model.linalg.mlir:10:3: note: called from
	func.func @main_graph(%arg0: tensor<?xi1>) -> tensor<1x1xi64> {
	^
	model.linalg.mlir:85:11: note: see current operation:
	"hal.executable.variant"() ({
	"hal.executable.export"() ({
	^bb0(%arg2: !hal.device, %arg3: index):
	%25 = "arith.constant"() <{value = 1 : index}> : () -> index
	%26 = "arith.constant"() <{value = 1 : index}> : () -> index
	%27 = "arith.constant"() <{value = 1 : index}> : () -> index
	"hal.return"(%25, %26, %27) : (index, index, index) -> ()
	}) {layout = #hal.pipeline.layout<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index, sym_name = "main_graph_dispatch_5_scatter_Dxi64_dispatch_tensor_store", workgroup_size = [1 : index, 1 : index, 1 : index]} : () -> ()
	"builtin.module"() ({
	"func.func"() <{function_type = () -> (), sym_name = "main_graph_dispatch_5_scatter_Dxi64_dispatch_tensor_store"}> ({
	%0 = "arith.constant"() <{value = 1 : index}> : () -> index
	%1 = "arith.constant"() <{value = 0 : index}> : () -> index
	%2 = "arith.constant"() <{value = 32 : i64}> : () -> i64
	%3 = "arith.constant"() <{value = 0 : i64}> : () -> i64
	%4 = "arith.constant"() <{value = 64 : index}> : () -> index
	%5 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
	%6 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
	%7 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
	%8 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
	%9 = "arith.extui"(%5) : (i32) -> i64
	%10 = "arith.extui"(%6) : (i32) -> i64
	%11 = "arith.shli"(%10, %2) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
	%12 = "arith.ori"(%9, %11) : (i64, i64) -> i64
	%13 = "arith.index_castui"(%12) : (i64) -> index
	%14 = "arith.extui"(%7) : (i32) -> i64
	%15 = "arith.extui"(%8) : (i32) -> i64
	%16 = "arith.shli"(%15, %2) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
	%17 = "arith.ori"(%14, %16) : (i64, i64) -> i64
	%18 = "arith.index_castui"(%17) : (i64) -> index
	%19 = "hal.interface.binding.subspan"(%13, %18) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?x1xi32, strided<[1, 1], offset: ?>>
	"memref.assume_alignment"(%19) <{alignment = 1 : i32}> : (memref<?x1xi32, strided<[1, 1], offset: ?>>) -> ()
	%20 = "hal.interface.binding.subspan"(%4, %18) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 4, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> memref<?xi64, strided<[1], offset: 8>>
	"memref.assume_alignment"(%20) <{alignment = 64 : i32}> : (memref<?xi64, strided<[1], offset: 8>>) -> ()
	%21 = "memref.alloca"(%18) <{alignment = 64 : i64, operandSegmentSizes = array<i32: 1, 0>}> : (index) -> memref<?xi64>
	"scf.for"(%1, %18, %0) ({
	^bb0(%arg1: index):
	"memref.store"(%3, %21, %arg1) <{nontemporal = false}> : (i64, memref<?xi64>, index) -> ()
	"scf.yield"() : () -> ()
	}) : (index, index, index) -> ()
	"scf.for"(%1, %18, %0) ({
	^bb0(%arg0: index):
	%22 = "memref.load"(%21, %arg0) <{nontemporal = false}> : (memref<?xi64>, index) -> i64
	%23 = "memref.load"(%19, %arg0, %1) <{nontemporal = false}> : (memref<?x1xi32, strided<[1, 1], offset: ?>>, index, index) -> i32
	%24 = "arith.index_cast"(%23) : (i32) -> index
	"memref.store"(%22, %20, %24) <{nontemporal = false}> : (i64, memref<?xi64, strided<[1], offset: 8>>, index) -> ()
	"scf.yield"() : () -> ()
	}) : (index, index, index) -> ()
	"func.return"() : () -> ()
	}) : () -> ()
	}) : () -> ()
	"hal.executable.variant_end"() : () -> ()
	}) {sym_name = "embedded_elf_x86_64", target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>} : () -> ()
	%23 = tm_tensor.scatter {dimension_map = array<i64: 0>} unique_indices(false) ins(%21, %inserted_slice : tensor<?xi64>, tensor<?x1xi32>) outs(%17 : tensor<?xi64>) {