Skip to content

Instantly share code, notes, and snippets.

@wangkuiyi
Last active February 17, 2023 06:36
Show Gist options
  • Save wangkuiyi/b41c431e6eb70a6e70eb14fcf443f652 to your computer and use it in GitHub Desktop.
Save wangkuiyi/b41c431e6eb70a6e70eb14fcf443f652 to your computer and use it in GitHub Desktop.
Compile an embedding table lookup program
22:08 $ python model.py | iree-compile - --iree-hal-target-backends=llvm-cpu --iree-input-type=mhlo --iree-mhlo-demote-i64-to-i32=false > /tmp/x.vmfb
<stdin>:1192:13: error: 'vector.broadcast' op dimension mismatch (4 vs. 0)
%1155 = "stablehlo.gather"(%arg2, %1154) {dimension_numbers = #stablehlo.gather<collapsed_slice_dims = [0], start_index_map = [0], index_vector_dim = 2>, indices_are_sorted = false, slice_sizes = dense<1> : tensor<1xi64>} : (tensor<1000000xf32>, tensor<500x256x1xi32>) -> tensor<500x256xf32>
^
<stdin>:21:12: note: called from
%8:5 = call @jit__train_step_kernel$main(%arg0, %arg1, %0, %1, %2, %3, %4, %5, %6, %7) : (tensor<512x19xi32>, tensor<512xi32>, tensor<1000000xf32>, tensor<1000000xf32>, tensor<1000000xf32>, tensor<2xui32>, tensor<681022xi32>, tensor<11xi32>, tensor<256xi32>, tensor<256xi32>) -> (tensor<1000000xf32>, tensor<1000000xf32>, tensor<1000000xf32>, tensor<2xui32>, tensor<f32>)
^
<stdin>:1192:13: note: see current operation: %32 = "vector.broadcast"(%31) : (vector<4xindex>) -> vector<-9223372036854775808xindex>
%1155 = "stablehlo.gather"(%arg2, %1154) {dimension_numbers = #stablehlo.gather<collapsed_slice_dims = [0], start_index_map = [0], index_vector_dim = 2>, indices_are_sorted = false, slice_sizes = dense<1> : tensor<1xi64>} : (tensor<1000000xf32>, tensor<500x256x1xi32>) -> tensor<500x256xf32>
^
<stdin>:1192:13: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128", native_vector_size = 16 : index, target_triple = "arm64-unknown-unknown-eabi-elf"}>
%1155 = "stablehlo.gather"(%arg2, %1154) {dimension_numbers = #stablehlo.gather<collapsed_slice_dims = [0], start_index_map = [0], index_vector_dim = 2>, indices_are_sorted = false, slice_sizes = dense<1> : tensor<1xi64>} : (tensor<1000000xf32>, tensor<500x256x1xi32>) -> tensor<500x256xf32>
^
<stdin>:21:12: note: called from
%8:5 = call @jit__train_step_kernel$main(%arg0, %arg1, %0, %1, %2, %3, %4, %5, %6, %7) : (tensor<512x19xi32>, tensor<512xi32>, tensor<1000000xf32>, tensor<1000000xf32>, tensor<1000000xf32>, tensor<2xui32>, tensor<681022xi32>, tensor<11xi32>, tensor<256xi32>, tensor<256xi32>) -> (tensor<1000000xf32>, tensor<1000000xf32>, tensor<1000000xf32>, tensor<2xui32>, tensor<f32>)
^
<stdin>:1192:13: note: see current operation:
"hal.executable.variant"() ({
"hal.executable.export"() ({
^bb0(%arg0: !hal.device, %arg1: index):
%0 = "arith.constant"() {value = 32 : index} : () -> index
%1 = "arith.constant"() {value = 1 : index} : () -> index
"hal.return"(%0, %1, %1) : (index, index, index) -> ()
}) {layout = #hal.pipeline.layout<push_constants = 1, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>, ordinal = 0 : index, sym_name = "_train_step_dispatch_94_generic_128000", translation_info = #iree_codegen.translation_info<CPUDoubleTilingPeelingExpert>} : () -> ()
"builtin.module"() ({
"func.func"() ({
%0 = "arith.constant"() {value = dense<true> : vector<-9223372036854775808xi1>} : () -> vector<-9223372036854775808xi1>
%1 = "arith.constant"() {value = dense<0.000000e+00> : vector<-9223372036854775808xf32>} : () -> vector<-9223372036854775808xf32>
%2 = "arith.constant"() {value = 0 : i32} : () -> i32
%3 = "arith.constant"() {value = dense<true> : vector<4xi1>} : () -> vector<4xi1>
%4 = "arith.constant"() {value = dense<0.000000e+00> : vector<4xf32>} : () -> vector<4xf32>
%5 = "arith.constant"() {value = 4 : index} : () -> index
%6 = "arith.constant"() {value = 0 : index} : () -> index
%7 = "arith.constant"() {value = 128000 : index} : () -> index
%8 = "arith.constant"() {value = 12050688 : index} : () -> index
%9 = "arith.constant"() {value = 10490112 : index} : () -> index
%10 = "hal.interface.constant.load"() {index = 0 : index} : () -> i32
%11 = "arith.index_castui"(%10) {stream.alignment = 64 : index, stream.values = [64 : index, 8000000 : index]} : (i32) -> index
%12 = "hal.interface.binding.subspan"(%11) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 0>, set = 0 : index} : (index) -> !flow.dispatch.tensor<readonly:tensor<1000000xf32>>
%13 = "hal.interface.binding.subspan"(%9) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 0>, set = 0 : index} : (index) -> !flow.dispatch.tensor<readonly:tensor<128000xi32>>
%14 = "hal.interface.binding.subspan"(%8) {alignment = 64 : index, binding = 2 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 0>, set = 0 : index} : (index) -> !flow.dispatch.tensor<writeonly:tensor<128000xf32>>
%15 = "flow.dispatch.tensor.load"(%12) {operand_segment_sizes = array<i32: 1, 0, 0, 0, 0>, static_offsets = array<i64: 0>, static_sizes = array<i64: 1000000>, static_strides = array<i64: 1>} : (!flow.dispatch.tensor<readonly:tensor<1000000xf32>>) -> tensor<1000000xf32>
%16 = "hal.interface.workgroup.id"() {dimension = 0 : index} : () -> index
%17 = "hal.interface.workgroup.count"() {dimension = 0 : index} : () -> index
%18 = "affine.apply"(%16) {map = affine_map<()[s0] -> (s0 * 4096)>} : (index) -> index
%19 = "affine.apply"(%17) {map = affine_map<()[s0] -> (s0 * 4096)>} : (index) -> index
"scf.for"(%18, %7, %19) ({
^bb0(%arg0: index):
%20 = "affine.min"(%arg0) {map = affine_map<(d0) -> (-d0 + 128000, 4096)>} : (index) -> index
%21 = "flow.dispatch.tensor.load"(%14, %arg0, %20) {operand_segment_sizes = array<i32: 1, 0, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808>, static_sizes = array<i64: -9223372036854775808>, static_strides = array<i64: 1>} : (!flow.dispatch.tensor<writeonly:tensor<128000xf32>>, index, index) -> tensor<?xf32>
%22 = "flow.dispatch.tensor.load"(%13, %arg0, %20) {operand_segment_sizes = array<i32: 1, 0, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808>, static_sizes = array<i64: -9223372036854775808>, static_strides = array<i64: 1>} : (!flow.dispatch.tensor<readonly:tensor<128000xi32>>, index, index) -> tensor<?xi32>
%23 = "affine.apply"(%20) {map = affine_map<()[s0] -> ((s0 floordiv 4) * 4)>} : (index) -> index
%24 = "scf.for"(%6, %23, %5, %21) ({
^bb0(%arg1: index, %arg2: tensor<?xf32>):
%26 = "vector.transfer_read"(%22, %arg1, %2) {in_bounds = [true], operand_segment_sizes = array<i32: 1, 1, 1, 0>, permutation_map = affine_map<(d0) -> (d0)>} : (tensor<?xi32>, index, i32) -> vector<4xi32>
%27 = "arith.index_cast"(%26) : (vector<4xi32>) -> vector<4xindex>
%28 = "vector.gather"(%15, %6, %27, %3, %4) : (tensor<1000000xf32>, index, vector<4xindex>, vector<4xi1>, vector<4xf32>) -> vector<4xf32>
%29 = "vector.transfer_write"(%28, %arg2, %arg1) {in_bounds = [true], operand_segment_sizes = array<i32: 1, 1, 1, 0>, permutation_map = affine_map<(d0) -> (d0)>} : (vector<4xf32>, tensor<?xf32>, index) -> tensor<?xf32>
"scf.yield"(%29) : (tensor<?xf32>) -> ()
}) : (index, index, index, tensor<?xf32>) -> tensor<?xf32>
%25 = "scf.for"(%23, %20, %5, %24) ({
^bb0(%arg1: index, %arg2: tensor<?xf32>):
%26 = "affine.apply"(%arg1, %20) {map = affine_map<(d0)[s0] -> (-d0 + s0)>} : (index, index) -> index
%27 = "tensor.extract_slice"(%22, %arg1, %26) {operand_segment_sizes = array<i32: 1, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808>, static_sizes = array<i64: -9223372036854775808>, static_strides = array<i64: 1>} : (tensor<?xi32>, index, index) -> tensor<?xi32>
%28 = "tensor.extract_slice"(%arg2, %arg1, %26) {operand_segment_sizes = array<i32: 1, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808>, static_sizes = array<i64: -9223372036854775808>, static_strides = array<i64: 1>} : (tensor<?xf32>, index, index) -> tensor<?xf32>
%29 = "vector.create_mask"(%26) : (index) -> vector<4xi1>
%30 = "vector.transfer_read"(%27, %6, %2, %29) {in_bounds = [true], operand_segment_sizes = array<i32: 1, 1, 1, 1>, permutation_map = affine_map<(d0) -> (d0)>} : (tensor<?xi32>, index, i32, vector<4xi1>) -> vector<4xi32>
%31 = "arith.index_cast"(%30) : (vector<4xi32>) -> vector<4xindex>
%32 = "vector.broadcast"(%31) : (vector<4xindex>) -> vector<-9223372036854775808xindex>
%33 = "vector.gather"(%15, %6, %32, %0, %1) : (tensor<1000000xf32>, index, vector<-9223372036854775808xindex>, vector<-9223372036854775808xi1>, vector<-9223372036854775808xf32>) -> vector<-9223372036854775808xf32>
%34 = "vector.transfer_write"(%33, %28, %6, %29) {in_bounds = [true], operand_segment_sizes = array<i32: 1, 1, 1, 1>, permutation_map = affine_map<(d0) -> (d0)>} : (vector<-9223372036854775808xf32>, tensor<?xf32>, index, vector<4xi1>) -> tensor<?xf32>
%35 = "tensor.insert_slice"(%34, %arg2, %arg1, %26) {operand_segment_sizes = array<i32: 1, 1, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808>, static_sizes = array<i64: -9223372036854775808>, static_strides = array<i64: 1>} : (tensor<?xf32>, tensor<?xf32>, index, index) -> tensor<?xf32>
"scf.yield"(%35) : (tensor<?xf32>) -> ()
}) : (index, index, index, tensor<?xf32>) -> tensor<?xf32>
"flow.dispatch.tensor.store"(%25, %14, %arg0, %20) {operand_segment_sizes = array<i32: 1, 1, 0, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808>, static_sizes = array<i64: -9223372036854775808>, static_strides = array<i64: 1>} : (tensor<?xf32>, !flow.dispatch.tensor<writeonly:tensor<128000xf32>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"func.return"() : () -> ()
}) {function_type = () -> (), sym_name = "_train_step_dispatch_94_generic_128000"} : () -> ()
}) : () -> ()
"hal.executable.variant_end"() : () -> ()
}) {sym_name = "embedded_elf_arm_64", target = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128", native_vector_size = 16 : index, target_triple = "arm64-unknown-unknown-eabi-elf"}>} : () -> ()
%1155 = "stablehlo.gather"(%arg2, %1154) {dimension_numbers = #stablehlo.gather<collapsed_slice_dims = [0], start_index_map = [0], index_vector_dim = 2>, indices_are_sorted = false, slice_sizes = dense<1> : tensor<1xi64>} : (tensor<1000000xf32>, tensor<500x256x1xi32>) -> tensor<500x256xf32>
^
<stdin>:1192:13: error: failed to serialize executables
%1155 = "stablehlo.gather"(%arg2, %1154) {dimension_numbers = #stablehlo.gather<collapsed_slice_dims = [0], start_index_map = [0], index_vector_dim = 2>, indices_are_sorted = false, slice_sizes = dense<1> : tensor<1xi64>} : (tensor<1000000xf32>, tensor<500x256x1xi32>) -> tensor<500x256xf32>
^
<stdin>:21:12: note: called from
%8:5 = call @jit__train_step_kernel$main(%arg0, %arg1, %0, %1, %2, %3, %4, %5, %6, %7) : (tensor<512x19xi32>, tensor<512xi32>, tensor<1000000xf32>, tensor<1000000xf32>, tensor<1000000xf32>, tensor<2xui32>, tensor<681022xi32>, tensor<11xi32>, tensor<256xi32>, tensor<256xi32>) -> (tensor<1000000xf32>, tensor<1000000xf32>, tensor<1000000xf32>, tensor<2xui32>, tensor<f32>)
^
<stdin>:1192:13: note: see current operation:
"hal.executable"() ({
"hal.executable.variant"() ({
"hal.executable.export"() ({
^bb0(%arg0: !hal.device, %arg1: index):
%0 = "arith.constant"() {value = 32 : index} : () -> index
%1 = "arith.constant"() {value = 1 : index} : () -> index
"hal.return"(%0, %1, %1) : (index, index, index) -> ()
}) {layout = #hal.pipeline.layout<push_constants = 1, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>, ordinal = 0 : index, sym_name = "_train_step_dispatch_94_generic_128000", translation_info = #iree_codegen.translation_info<CPUDoubleTilingPeelingExpert>} : () -> ()
"builtin.module"() ({
"func.func"() ({
%0 = "arith.constant"() {value = dense<true> : vector<-9223372036854775808xi1>} : () -> vector<-9223372036854775808xi1>
%1 = "arith.constant"() {value = dense<0.000000e+00> : vector<-9223372036854775808xf32>} : () -> vector<-9223372036854775808xf32>
%2 = "arith.constant"() {value = 0 : i32} : () -> i32
%3 = "arith.constant"() {value = dense<true> : vector<4xi1>} : () -> vector<4xi1>
%4 = "arith.constant"() {value = dense<0.000000e+00> : vector<4xf32>} : () -> vector<4xf32>
%5 = "arith.constant"() {value = 4 : index} : () -> index
%6 = "arith.constant"() {value = 0 : index} : () -> index
%7 = "arith.constant"() {value = 128000 : index} : () -> index
%8 = "arith.constant"() {value = 12050688 : index} : () -> index
%9 = "arith.constant"() {value = 10490112 : index} : () -> index
%10 = "hal.interface.constant.load"() {index = 0 : index} : () -> i32
%11 = "arith.index_castui"(%10) {stream.alignment = 64 : index, stream.values = [64 : index, 8000000 : index]} : (i32) -> index
%12 = "hal.interface.binding.subspan"(%11) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 0>, set = 0 : index} : (index) -> !flow.dispatch.tensor<readonly:tensor<1000000xf32>>
%13 = "hal.interface.binding.subspan"(%9) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 0>, set = 0 : index} : (index) -> !flow.dispatch.tensor<readonly:tensor<128000xi32>>
%14 = "hal.interface.binding.subspan"(%8) {alignment = 64 : index, binding = 2 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 0>, set = 0 : index} : (index) -> !flow.dispatch.tensor<writeonly:tensor<128000xf32>>
%15 = "flow.dispatch.tensor.load"(%12) {operand_segment_sizes = array<i32: 1, 0, 0, 0, 0>, static_offsets = array<i64: 0>, static_sizes = array<i64: 1000000>, static_strides = array<i64: 1>} : (!flow.dispatch.tensor<readonly:tensor<1000000xf32>>) -> tensor<1000000xf32>
%16 = "hal.interface.workgroup.id"() {dimension = 0 : index} : () -> index
%17 = "hal.interface.workgroup.count"() {dimension = 0 : index} : () -> index
%18 = "affine.apply"(%16) {map = affine_map<()[s0] -> (s0 * 4096)>} : (index) -> index
%19 = "affine.apply"(%17) {map = affine_map<()[s0] -> (s0 * 4096)>} : (index) -> index
"scf.for"(%18, %7, %19) ({
^bb0(%arg0: index):
%20 = "affine.min"(%arg0) {map = affine_map<(d0) -> (-d0 + 128000, 4096)>} : (index) -> index
%21 = "flow.dispatch.tensor.load"(%14, %arg0, %20) {operand_segment_sizes = array<i32: 1, 0, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808>, static_sizes = array<i64: -9223372036854775808>, static_strides = array<i64: 1>} : (!flow.dispatch.tensor<writeonly:tensor<128000xf32>>, index, index) -> tensor<?xf32>
%22 = "flow.dispatch.tensor.load"(%13, %arg0, %20) {operand_segment_sizes = array<i32: 1, 0, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808>, static_sizes = array<i64: -9223372036854775808>, static_strides = array<i64: 1>} : (!flow.dispatch.tensor<readonly:tensor<128000xi32>>, index, index) -> tensor<?xi32>
%23 = "affine.apply"(%20) {map = affine_map<()[s0] -> ((s0 floordiv 4) * 4)>} : (index) -> index
%24 = "scf.for"(%6, %23, %5, %21) ({
^bb0(%arg1: index, %arg2: tensor<?xf32>):
%26 = "vector.transfer_read"(%22, %arg1, %2) {in_bounds = [true], operand_segment_sizes = array<i32: 1, 1, 1, 0>, permutation_map = affine_map<(d0) -> (d0)>} : (tensor<?xi32>, index, i32) -> vector<4xi32>
%27 = "arith.index_cast"(%26) : (vector<4xi32>) -> vector<4xindex>
%28 = "vector.gather"(%15, %6, %27, %3, %4) : (tensor<1000000xf32>, index, vector<4xindex>, vector<4xi1>, vector<4xf32>) -> vector<4xf32>
%29 = "vector.transfer_write"(%28, %arg2, %arg1) {in_bounds = [true], operand_segment_sizes = array<i32: 1, 1, 1, 0>, permutation_map = affine_map<(d0) -> (d0)>} : (vector<4xf32>, tensor<?xf32>, index) -> tensor<?xf32>
"scf.yield"(%29) : (tensor<?xf32>) -> ()
}) : (index, index, index, tensor<?xf32>) -> tensor<?xf32>
%25 = "scf.for"(%23, %20, %5, %24) ({
^bb0(%arg1: index, %arg2: tensor<?xf32>):
%26 = "affine.apply"(%arg1, %20) {map = affine_map<(d0)[s0] -> (-d0 + s0)>} : (index, index) -> index
%27 = "tensor.extract_slice"(%22, %arg1, %26) {operand_segment_sizes = array<i32: 1, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808>, static_sizes = array<i64: -9223372036854775808>, static_strides = array<i64: 1>} : (tensor<?xi32>, index, index) -> tensor<?xi32>
%28 = "tensor.extract_slice"(%arg2, %arg1, %26) {operand_segment_sizes = array<i32: 1, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808>, static_sizes = array<i64: -9223372036854775808>, static_strides = array<i64: 1>} : (tensor<?xf32>, index, index) -> tensor<?xf32>
%29 = "vector.create_mask"(%26) : (index) -> vector<4xi1>
%30 = "vector.transfer_read"(%27, %6, %2, %29) {in_bounds = [true], operand_segment_sizes = array<i32: 1, 1, 1, 1>, permutation_map = affine_map<(d0) -> (d0)>} : (tensor<?xi32>, index, i32, vector<4xi1>) -> vector<4xi32>
%31 = "arith.index_cast"(%30) : (vector<4xi32>) -> vector<4xindex>
%32 = "vector.broadcast"(%31) : (vector<4xindex>) -> vector<-9223372036854775808xindex>
%33 = "vector.gather"(%15, %6, %32, %0, %1) : (tensor<1000000xf32>, index, vector<-9223372036854775808xindex>, vector<-9223372036854775808xi1>, vector<-9223372036854775808xf32>) -> vector<-9223372036854775808xf32>
%34 = "vector.transfer_write"(%33, %28, %6, %29) {in_bounds = [true], operand_segment_sizes = array<i32: 1, 1, 1, 1>, permutation_map = affine_map<(d0) -> (d0)>} : (vector<-9223372036854775808xf32>, tensor<?xf32>, index, vector<4xi1>) -> tensor<?xf32>
%35 = "tensor.insert_slice"(%34, %arg2, %arg1, %26) {operand_segment_sizes = array<i32: 1, 1, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808>, static_sizes = array<i64: -9223372036854775808>, static_strides = array<i64: 1>} : (tensor<?xf32>, tensor<?xf32>, index, index) -> tensor<?xf32>
"scf.yield"(%35) : (tensor<?xf32>) -> ()
}) : (index, index, index, tensor<?xf32>) -> tensor<?xf32>
"flow.dispatch.tensor.store"(%25, %14, %arg0, %20) {operand_segment_sizes = array<i32: 1, 1, 0, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808>, static_sizes = array<i64: -9223372036854775808>, static_strides = array<i64: 1>} : (tensor<?xf32>, !flow.dispatch.tensor<writeonly:tensor<128000xf32>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"func.return"() : () -> ()
}) {function_type = () -> (), sym_name = "_train_step_dispatch_94_generic_128000"} : () -> ()
}) : () -> ()
"hal.executable.variant_end"() : () -> ()
}) {sym_name = "embedded_elf_arm_64", target = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128", native_vector_size = 16 : index, target_triple = "arm64-unknown-unknown-eabi-elf"}>} : () -> ()
"hal.executable_end"() : () -> ()
}) {sym_name = "_train_step_dispatch_94", sym_visibility = "private"} : () -> ()
%1155 = "stablehlo.gather"(%arg2, %1154) {dimension_numbers = #stablehlo.gather<collapsed_slice_dims = [0], start_index_map = [0], index_vector_dim = 2>, indices_are_sorted = false, slice_sizes = dense<1> : tensor<1xi64>} : (tensor<1000000xf32>, tensor<500x256x1xi32>) -> tensor<500x256xf32>
^
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment