wangkuiyi · February 22, 2023 17:57
diff --git a/after-fixing-12280.txt b/after-fixing-12280.txt
 09:56 $ python model.py | iree-compile -     --iree-hal-target-backends=llvm-cpu    --iree-input-type=mhlo    --iree-mhlo-demote-i64-to-i32=false > /tmp/x.vmfb
 <stdin>:1226:13: error: 'iree_linalg_ext.scatter' op indexed shape of update value dim#1 exceeds original value at dim#0 64 -9223372036854775808
    %1181 = "stablehlo.scatter"(%1180, %1165, %1178) ({
            ^
 <stdin>:21:12: note: called from
    %8:5 = call @jit__train_step_kernel$main(%arg0, %arg1, %0, %1, %2, %3, %4, %5, %6, %7) : (tensor<512x19xi64>, tensor<512xi64>, tensor<1000000xf32>, tensor<1000000xf32>, tensor<1000000xf32>, tensor<2xui32>, tensor<681022xi64>, tensor<11xi64>, tensor<256xi64>, tensor<256xi64>) -> (tensor<1000000xf32>, tensor<1000000xf32>, tensor<1000000xf32>, tensor<2xui32>, tensor<f32>)
           ^
 <stdin>:1226:13: note: see current operation:
 %19 = "iree_linalg_ext.scatter"(%0, %5, %18) ({
 ^bb0(%arg3: f32, %arg4: f32):
  %20 = "arith.addf"(%arg4, %arg3) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32
  "iree_linalg_ext.yield"(%20) : (f32) -> ()
 }) {dimension_map = array<i64: 1>, lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 1]]>, operand_segment_sizes = array<i32: 2, 1>, unique_indices = true} : (tensor<1x64x1xf32>, tensor<1x1xi32>, tensor<?x1xf32>) -> tensor<?x1xf32>
    %1181 = "stablehlo.scatter"(%1180, %1165, %1178) ({
            ^
 <stdin>:1226:13: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128", native_vector_size = 16 : index, target_triple = "arm64-unknown-unknown-eabi-elf"}>
    %1181 = "stablehlo.scatter"(%1180, %1165, %1178) ({
            ^
 <stdin>:21:12: note: called from
    %8:5 = call @jit__train_step_kernel$main(%arg0, %arg1, %0, %1, %2, %3, %4, %5, %6, %7) : (tensor<512x19xi64>, tensor<512xi64>, tensor<1000000xf32>, tensor<1000000xf32>, tensor<1000000xf32>, tensor<2xui32>, tensor<681022xi64>, tensor<11xi64>, tensor<256xi64>, tensor<256xi64>) -> (tensor<1000000xf32>, tensor<1000000xf32>, tensor<1000000xf32>, tensor<2xui32>, tensor<f32>)
           ^
 <stdin>:1226:13: note: see current operation:
 "hal.executable.variant"() ({
  "hal.executable.export"() ({
  ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
    %0 = "affine.apply"(%arg2) {map = affine_map<()[s0] -> (s0 ceildiv 64)>} : (index) -> index
    "hal.return"(%arg3, %0, %arg1) : (index, index, index) -> ()
  }) {layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>]>]>, ordinal = 0 : index, sym_name = "_train_step_dispatch_100_fill_512x501", translation_info = #iree_codegen.translation_info<CPUDefault>} : () -> ()
  "builtin.module"() ({
    "func.func"() ({
      %0 = "arith.constant"() {value = dense<-0.001953125> : tensor<1x64x1xf32>} : () -> tensor<1x64x1xf32>
      %1 = "arith.constant"() {value = 64 : index} : () -> index
      %2 = "arith.constant"() {value = 512 : index} : () -> index
      %3 = "arith.constant"() {value = 1 : index} : () -> index
      %4 = "arith.constant"() {value = 15124736 : index} : () -> index
      %5 = "arith.constant"() {value = dense<0> : tensor<1x1xi32>} : () -> tensor<1x1xi32>
      %6 = "arith.constant"() {value = 0.000000e+00 : f32} : () -> f32
      %7 = "hal.interface.binding.subspan"(%4) {alignment = 64 : index, binding = 0 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 0>, set = 0 : index} : (index) -> !flow.dispatch.tensor<writeonly:tensor<512x501xf32>>
      %8 = "hal.interface.workgroup.id"() {dimension = 0 : index} : () -> index
      %9 = "hal.interface.workgroup.count"() {dimension = 0 : index} : () -> index
      %10 = "hal.interface.workgroup.id"() {dimension = 1 : index} : () -> index
      %11 = "hal.interface.workgroup.count"() {dimension = 1 : index} : () -> index
      %12 = "hal.interface.workgroup.id"() {dimension = 2 : index} : () -> index
      %13 = "hal.interface.workgroup.count"() {dimension = 2 : index} : () -> index
      "scf.for"(%12, %3, %13) ({
      ^bb0(%arg0: index):
        %14 = "affine.apply"(%10) {map = affine_map<()[s0] -> (s0 * 64)>} : (index) -> index
        %15 = "affine.apply"(%11) {map = affine_map<()[s0] -> (s0 * 64)>} : (index) -> index
        "scf.for"(%14, %2, %15) ({
        ^bb0(%arg1: index):
          "scf.for"(%8, %3, %9) ({
          ^bb0(%arg2: index):
            %16 = "tensor.empty"() : () -> tensor<64x1xf32>
            %17 = "tensor.cast"(%16) : (tensor<64x1xf32>) -> tensor<?x1xf32>
            %18 = "linalg.fill"(%6, %17) ({
            ^bb0(%arg3: f32, %arg4: f32):
              "linalg.yield"(%arg3) : (f32) -> ()
            }) {operand_segment_sizes = array<i32: 1, 1>} : (f32, tensor<?x1xf32>) -> tensor<?x1xf32>
            %19 = "iree_linalg_ext.scatter"(%0, %5, %18) ({
            ^bb0(%arg3: f32, %arg4: f32):
              %20 = "arith.addf"(%arg4, %arg3) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32
              "iree_linalg_ext.yield"(%20) : (f32) -> ()
            }) {dimension_map = array<i64: 1>, lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 1]]>, operand_segment_sizes = array<i32: 2, 1>, unique_indices = true} : (tensor<1x64x1xf32>, tensor<1x1xi32>, tensor<?x1xf32>) -> tensor<?x1xf32>
            "flow.dispatch.tensor.store"(%19, %7, %arg1, %arg2, %1) {operand_segment_sizes = array<i32: 1, 1, 0, 2, 1, 0>, static_offsets = array<i64: -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1>, static_strides = array<i64: 1, 1>} : (tensor<?x1xf32>, !flow.dispatch.tensor<writeonly:tensor<512x501xf32>>, index, index, index) -> ()
            "scf.yield"() : () -> ()
          }) : (index, index, index) -> ()
          "scf.yield"() : () -> ()
        }) : (index, index, index) -> ()
        "scf.yield"() : () -> ()
      }) : (index, index, index) -> ()
      "func.return"() : () -> ()
    }) {function_type = () -> (), sym_name = "_train_step_dispatch_100_fill_512x501"} : () -> ()
  }) : () -> ()
  "hal.executable.variant_end"() : () -> ()
 }) {sym_name = "embedded_elf_arm_64", target = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128", native_vector_size = 16 : index, target_triple = "arm64-unknown-unknown-eabi-elf"}>} : () -> ()
    %1181 = "stablehlo.scatter"(%1180, %1165, %1178) ({
            ^
 <stdin>:1226:13: error: failed to serialize executables
    %1181 = "stablehlo.scatter"(%1180, %1165, %1178) ({
            ^
 <stdin>:21:12: note: called from
    %8:5 = call @jit__train_step_kernel$main(%arg0, %arg1, %0, %1, %2, %3, %4, %5, %6, %7) : (tensor<512x19xi64>, tensor<512xi64>, tensor<1000000xf32>, tensor<1000000xf32>, tensor<1000000xf32>, tensor<2xui32>, tensor<681022xi64>, tensor<11xi64>, tensor<256xi64>, tensor<256xi64>) -> (tensor<1000000xf32>, tensor<1000000xf32>, tensor<1000000xf32>, tensor<2xui32>, tensor<f32>)
           ^
 <stdin>:1226:13: note: see current operation:
 "hal.executable"() ({
  "hal.executable.variant"() ({
    "hal.executable.export"() ({
    ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
      %0 = "affine.apply"(%arg2) {map = affine_map<()[s0] -> (s0 ceildiv 64)>} : (index) -> index
      "hal.return"(%arg3, %0, %arg1) : (index, index, index) -> ()
    }) {layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>]>]>, ordinal = 0 : index, sym_name = "_train_step_dispatch_100_fill_512x501", translation_info = #iree_codegen.translation_info<CPUDefault>} : () -> ()
    "builtin.module"() ({
      "func.func"() ({
        %0 = "arith.constant"() {value = dense<-0.001953125> : tensor<1x64x1xf32>} : () -> tensor<1x64x1xf32>
        %1 = "arith.constant"() {value = 64 : index} : () -> index
        %2 = "arith.constant"() {value = 512 : index} : () -> index
        %3 = "arith.constant"() {value = 1 : index} : () -> index
        %4 = "arith.constant"() {value = 15124736 : index} : () -> index
        %5 = "arith.constant"() {value = dense<0> : tensor<1x1xi32>} : () -> tensor<1x1xi32>
        %6 = "arith.constant"() {value = 0.000000e+00 : f32} : () -> f32
        %7 = "hal.interface.binding.subspan"(%4) {alignment = 64 : index, binding = 0 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 0>, set = 0 : index} : (index) -> !flow.dispatch.tensor<writeonly:tensor<512x501xf32>>
        %8 = "hal.interface.workgroup.id"() {dimension = 0 : index} : () -> index
        %9 = "hal.interface.workgroup.count"() {dimension = 0 : index} : () -> index
        %10 = "hal.interface.workgroup.id"() {dimension = 1 : index} : () -> index
        %11 = "hal.interface.workgroup.count"() {dimension = 1 : index} : () -> index
        %12 = "hal.interface.workgroup.id"() {dimension = 2 : index} : () -> index
        %13 = "hal.interface.workgroup.count"() {dimension = 2 : index} : () -> index
        "scf.for"(%12, %3, %13) ({
        ^bb0(%arg0: index):
          %14 = "affine.apply"(%10) {map = affine_map<()[s0] -> (s0 * 64)>} : (index) -> index
          %15 = "affine.apply"(%11) {map = affine_map<()[s0] -> (s0 * 64)>} : (index) -> index
          "scf.for"(%14, %2, %15) ({
          ^bb0(%arg1: index):
            "scf.for"(%8, %3, %9) ({
            ^bb0(%arg2: index):
              %16 = "tensor.empty"() : () -> tensor<64x1xf32>
              %17 = "tensor.cast"(%16) : (tensor<64x1xf32>) -> tensor<?x1xf32>
              %18 = "linalg.fill"(%6, %17) ({
              ^bb0(%arg3: f32, %arg4: f32):
                "linalg.yield"(%arg3) : (f32) -> ()
              }) {operand_segment_sizes = array<i32: 1, 1>} : (f32, tensor<?x1xf32>) -> tensor<?x1xf32>
              %19 = "iree_linalg_ext.scatter"(%0, %5, %18) ({
              ^bb0(%arg3: f32, %arg4: f32):
                %20 = "arith.addf"(%arg4, %arg3) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32
                "iree_linalg_ext.yield"(%20) : (f32) -> ()
              }) {dimension_map = array<i64: 1>, lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 1]]>, operand_segment_sizes = array<i32: 2, 1>, unique_indices = true} : (tensor<1x64x1xf32>, tensor<1x1xi32>, tensor<?x1xf32>) -> tensor<?x1xf32>
              "flow.dispatch.tensor.store"(%19, %7, %arg1, %arg2, %1) {operand_segment_sizes = array<i32: 1, 1, 0, 2, 1, 0>, static_offsets = array<i64: -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1>, static_strides = array<i64: 1, 1>} : (tensor<?x1xf32>, !flow.dispatch.tensor<writeonly:tensor<512x501xf32>>, index, index, index) -> ()
              "scf.yield"() : () -> ()
            }) : (index, index, index) -> ()
            "scf.yield"() : () -> ()
          }) : (index, index, index) -> ()
          "scf.yield"() : () -> ()
        }) : (index, index, index) -> ()
        "func.return"() : () -> ()
      }) {function_type = () -> (), sym_name = "_train_step_dispatch_100_fill_512x501"} : () -> ()
    }) : () -> ()
    "hal.executable.variant_end"() : () -> ()
  }) {sym_name = "embedded_elf_arm_64", target = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128", native_vector_size = 16 : index, target_triple = "arm64-unknown-unknown-eabi-elf"}>} : () -> ()
  "hal.executable_end"() : () -> ()
 }) {sym_name = "_train_step_dispatch_100", sym_visibility = "private"} : () -> ()
    %1181 = "stablehlo.scatter"(%1180, %1165, %1178) ({
            ^
	09:56 $ python model.py | iree-compile - --iree-hal-target-backends=llvm-cpu --iree-input-type=mhlo --iree-mhlo-demote-i64-to-i32=false > /tmp/x.vmfb
	<stdin>:1226:13: error: 'iree_linalg_ext.scatter' op indexed shape of update value dim#1 exceeds original value at dim#0 64 -9223372036854775808
	%1181 = "stablehlo.scatter"(%1180, %1165, %1178) ({
	^
	<stdin>:21:12: note: called from
	%8:5 = call @jit__train_step_kernel$main(%arg0, %arg1, %0, %1, %2, %3, %4, %5, %6, %7) : (tensor<512x19xi64>, tensor<512xi64>, tensor<1000000xf32>, tensor<1000000xf32>, tensor<1000000xf32>, tensor<2xui32>, tensor<681022xi64>, tensor<11xi64>, tensor<256xi64>, tensor<256xi64>) -> (tensor<1000000xf32>, tensor<1000000xf32>, tensor<1000000xf32>, tensor<2xui32>, tensor<f32>)
	^
	<stdin>:1226:13: note: see current operation:
	%19 = "iree_linalg_ext.scatter"(%0, %5, %18) ({
	^bb0(%arg3: f32, %arg4: f32):
	%20 = "arith.addf"(%arg4, %arg3) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32
	"iree_linalg_ext.yield"(%20) : (f32) -> ()
	}) {dimension_map = array<i64: 1>, lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 1]]>, operand_segment_sizes = array<i32: 2, 1>, unique_indices = true} : (tensor<1x64x1xf32>, tensor<1x1xi32>, tensor<?x1xf32>) -> tensor<?x1xf32>
	%1181 = "stablehlo.scatter"(%1180, %1165, %1178) ({
	^
	<stdin>:1226:13: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128", native_vector_size = 16 : index, target_triple = "arm64-unknown-unknown-eabi-elf"}>
	%1181 = "stablehlo.scatter"(%1180, %1165, %1178) ({
	^
	<stdin>:21:12: note: called from
	%8:5 = call @jit__train_step_kernel$main(%arg0, %arg1, %0, %1, %2, %3, %4, %5, %6, %7) : (tensor<512x19xi64>, tensor<512xi64>, tensor<1000000xf32>, tensor<1000000xf32>, tensor<1000000xf32>, tensor<2xui32>, tensor<681022xi64>, tensor<11xi64>, tensor<256xi64>, tensor<256xi64>) -> (tensor<1000000xf32>, tensor<1000000xf32>, tensor<1000000xf32>, tensor<2xui32>, tensor<f32>)
	^
	<stdin>:1226:13: note: see current operation:
	"hal.executable.variant"() ({
	"hal.executable.export"() ({
	^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
	%0 = "affine.apply"(%arg2) {map = affine_map<()[s0] -> (s0 ceildiv 64)>} : (index) -> index
	"hal.return"(%arg3, %0, %arg1) : (index, index, index) -> ()
	}) {layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>]>]>, ordinal = 0 : index, sym_name = "_train_step_dispatch_100_fill_512x501", translation_info = #iree_codegen.translation_info<CPUDefault>} : () -> ()
	"builtin.module"() ({
	"func.func"() ({
	%0 = "arith.constant"() {value = dense<-0.001953125> : tensor<1x64x1xf32>} : () -> tensor<1x64x1xf32>
	%1 = "arith.constant"() {value = 64 : index} : () -> index
	%2 = "arith.constant"() {value = 512 : index} : () -> index
	%3 = "arith.constant"() {value = 1 : index} : () -> index
	%4 = "arith.constant"() {value = 15124736 : index} : () -> index
	%5 = "arith.constant"() {value = dense<0> : tensor<1x1xi32>} : () -> tensor<1x1xi32>
	%6 = "arith.constant"() {value = 0.000000e+00 : f32} : () -> f32
	%7 = "hal.interface.binding.subspan"(%4) {alignment = 64 : index, binding = 0 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 0>, set = 0 : index} : (index) -> !flow.dispatch.tensor<writeonly:tensor<512x501xf32>>
	%8 = "hal.interface.workgroup.id"() {dimension = 0 : index} : () -> index
	%9 = "hal.interface.workgroup.count"() {dimension = 0 : index} : () -> index
	%10 = "hal.interface.workgroup.id"() {dimension = 1 : index} : () -> index
	%11 = "hal.interface.workgroup.count"() {dimension = 1 : index} : () -> index
	%12 = "hal.interface.workgroup.id"() {dimension = 2 : index} : () -> index
	%13 = "hal.interface.workgroup.count"() {dimension = 2 : index} : () -> index
	"scf.for"(%12, %3, %13) ({
	^bb0(%arg0: index):
	%14 = "affine.apply"(%10) {map = affine_map<()[s0] -> (s0 * 64)>} : (index) -> index
	%15 = "affine.apply"(%11) {map = affine_map<()[s0] -> (s0 * 64)>} : (index) -> index
	"scf.for"(%14, %2, %15) ({
	^bb0(%arg1: index):
	"scf.for"(%8, %3, %9) ({
	^bb0(%arg2: index):
	%16 = "tensor.empty"() : () -> tensor<64x1xf32>
	%17 = "tensor.cast"(%16) : (tensor<64x1xf32>) -> tensor<?x1xf32>
	%18 = "linalg.fill"(%6, %17) ({
	^bb0(%arg3: f32, %arg4: f32):
	"linalg.yield"(%arg3) : (f32) -> ()
	}) {operand_segment_sizes = array<i32: 1, 1>} : (f32, tensor<?x1xf32>) -> tensor<?x1xf32>
	%19 = "iree_linalg_ext.scatter"(%0, %5, %18) ({
	^bb0(%arg3: f32, %arg4: f32):
	%20 = "arith.addf"(%arg4, %arg3) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32
	"iree_linalg_ext.yield"(%20) : (f32) -> ()
	}) {dimension_map = array<i64: 1>, lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 1]]>, operand_segment_sizes = array<i32: 2, 1>, unique_indices = true} : (tensor<1x64x1xf32>, tensor<1x1xi32>, tensor<?x1xf32>) -> tensor<?x1xf32>
	"flow.dispatch.tensor.store"(%19, %7, %arg1, %arg2, %1) {operand_segment_sizes = array<i32: 1, 1, 0, 2, 1, 0>, static_offsets = array<i64: -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1>, static_strides = array<i64: 1, 1>} : (tensor<?x1xf32>, !flow.dispatch.tensor<writeonly:tensor<512x501xf32>>, index, index, index) -> ()
	"scf.yield"() : () -> ()
	}) : (index, index, index) -> ()
	"scf.yield"() : () -> ()
	}) : (index, index, index) -> ()
	"scf.yield"() : () -> ()
	}) : (index, index, index) -> ()
	"func.return"() : () -> ()
	}) {function_type = () -> (), sym_name = "_train_step_dispatch_100_fill_512x501"} : () -> ()
	}) : () -> ()
	"hal.executable.variant_end"() : () -> ()
	}) {sym_name = "embedded_elf_arm_64", target = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128", native_vector_size = 16 : index, target_triple = "arm64-unknown-unknown-eabi-elf"}>} : () -> ()
	%1181 = "stablehlo.scatter"(%1180, %1165, %1178) ({
	^
	<stdin>:1226:13: error: failed to serialize executables
	%1181 = "stablehlo.scatter"(%1180, %1165, %1178) ({
	^
	<stdin>:21:12: note: called from
	%8:5 = call @jit__train_step_kernel$main(%arg0, %arg1, %0, %1, %2, %3, %4, %5, %6, %7) : (tensor<512x19xi64>, tensor<512xi64>, tensor<1000000xf32>, tensor<1000000xf32>, tensor<1000000xf32>, tensor<2xui32>, tensor<681022xi64>, tensor<11xi64>, tensor<256xi64>, tensor<256xi64>) -> (tensor<1000000xf32>, tensor<1000000xf32>, tensor<1000000xf32>, tensor<2xui32>, tensor<f32>)
	^
	<stdin>:1226:13: note: see current operation:
	"hal.executable"() ({
	"hal.executable.variant"() ({
	"hal.executable.export"() ({
	^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
	%0 = "affine.apply"(%arg2) {map = affine_map<()[s0] -> (s0 ceildiv 64)>} : (index) -> index
	"hal.return"(%arg3, %0, %arg1) : (index, index, index) -> ()
	}) {layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>]>]>, ordinal = 0 : index, sym_name = "_train_step_dispatch_100_fill_512x501", translation_info = #iree_codegen.translation_info<CPUDefault>} : () -> ()
	"builtin.module"() ({
	"func.func"() ({
	%0 = "arith.constant"() {value = dense<-0.001953125> : tensor<1x64x1xf32>} : () -> tensor<1x64x1xf32>
	%1 = "arith.constant"() {value = 64 : index} : () -> index
	%2 = "arith.constant"() {value = 512 : index} : () -> index
	%3 = "arith.constant"() {value = 1 : index} : () -> index
	%4 = "arith.constant"() {value = 15124736 : index} : () -> index
	%5 = "arith.constant"() {value = dense<0> : tensor<1x1xi32>} : () -> tensor<1x1xi32>
	%6 = "arith.constant"() {value = 0.000000e+00 : f32} : () -> f32
	%7 = "hal.interface.binding.subspan"(%4) {alignment = 64 : index, binding = 0 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 0>, set = 0 : index} : (index) -> !flow.dispatch.tensor<writeonly:tensor<512x501xf32>>
	%8 = "hal.interface.workgroup.id"() {dimension = 0 : index} : () -> index
	%9 = "hal.interface.workgroup.count"() {dimension = 0 : index} : () -> index
	%10 = "hal.interface.workgroup.id"() {dimension = 1 : index} : () -> index
	%11 = "hal.interface.workgroup.count"() {dimension = 1 : index} : () -> index
	%12 = "hal.interface.workgroup.id"() {dimension = 2 : index} : () -> index
	%13 = "hal.interface.workgroup.count"() {dimension = 2 : index} : () -> index
	"scf.for"(%12, %3, %13) ({
	^bb0(%arg0: index):
	%14 = "affine.apply"(%10) {map = affine_map<()[s0] -> (s0 * 64)>} : (index) -> index
	%15 = "affine.apply"(%11) {map = affine_map<()[s0] -> (s0 * 64)>} : (index) -> index
	"scf.for"(%14, %2, %15) ({
	^bb0(%arg1: index):
	"scf.for"(%8, %3, %9) ({
	^bb0(%arg2: index):
	%16 = "tensor.empty"() : () -> tensor<64x1xf32>
	%17 = "tensor.cast"(%16) : (tensor<64x1xf32>) -> tensor<?x1xf32>
	%18 = "linalg.fill"(%6, %17) ({
	^bb0(%arg3: f32, %arg4: f32):
	"linalg.yield"(%arg3) : (f32) -> ()
	}) {operand_segment_sizes = array<i32: 1, 1>} : (f32, tensor<?x1xf32>) -> tensor<?x1xf32>
	%19 = "iree_linalg_ext.scatter"(%0, %5, %18) ({
	^bb0(%arg3: f32, %arg4: f32):
	%20 = "arith.addf"(%arg4, %arg3) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32
	"iree_linalg_ext.yield"(%20) : (f32) -> ()
	}) {dimension_map = array<i64: 1>, lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 1]]>, operand_segment_sizes = array<i32: 2, 1>, unique_indices = true} : (tensor<1x64x1xf32>, tensor<1x1xi32>, tensor<?x1xf32>) -> tensor<?x1xf32>
	"flow.dispatch.tensor.store"(%19, %7, %arg1, %arg2, %1) {operand_segment_sizes = array<i32: 1, 1, 0, 2, 1, 0>, static_offsets = array<i64: -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: -9223372036854775808, 1>, static_strides = array<i64: 1, 1>} : (tensor<?x1xf32>, !flow.dispatch.tensor<writeonly:tensor<512x501xf32>>, index, index, index) -> ()
	"scf.yield"() : () -> ()
	}) : (index, index, index) -> ()
	"scf.yield"() : () -> ()
	}) : (index, index, index) -> ()
	"scf.yield"() : () -> ()
	}) : (index, index, index) -> ()
	"func.return"() : () -> ()
	}) {function_type = () -> (), sym_name = "_train_step_dispatch_100_fill_512x501"} : () -> ()
	}) : () -> ()
	"hal.executable.variant_end"() : () -> ()
	}) {sym_name = "embedded_elf_arm_64", target = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128", native_vector_size = 16 : index, target_triple = "arm64-unknown-unknown-eabi-elf"}>} : () -> ()
	"hal.executable_end"() : () -> ()
	}) {sym_name = "_train_step_dispatch_100", sym_visibility = "private"} : () -> ()
	%1181 = "stablehlo.scatter"(%1180, %1165, %1178) ({
	^