Skip to content

Instantly share code, notes, and snippets.

@AmosLewis
Created August 8, 2024 22:49
Show Gist options
  • Save AmosLewis/35ed28904fd6e82de0c66546b18579df to your computer and use it in GitHub Desktop.
failed to translate executables
dpn68_vaiq.default.onnx.linalg.mlir:1243:12: error: One or more operations with large vector sizes (8192 bytes) were found:
%180 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%179 : tensor<1x64x56x56xf32>) outs(%107 : tensor<1x64x56x56xi8>) {
^
dpn68_vaiq.default.onnx.linalg.mlir:9:3: note: called from
func.func @main_graph(%arg0: tensor<1x3x224x224xf32>) -> tensor<1x1000xf32> {
^
<unknown>:0: note: %cst_3 = arith.constant dense<1.562500e-02> : vector<200704xf32>
dpn68_vaiq.default.onnx.linalg.mlir:1014:12: note: %5 = vector.transfer_read %3[%c0], %c0_i8 {in_bounds = [true]} : tensor<200704xi8>, vector<200704xi8>
%144 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%141 : tensor<1x64x56x56xi8>) outs(%115 : tensor<1x64x56x56xf32>) {
^
dpn68_vaiq.default.onnx.linalg.mlir:1016:15: note: %6 = arith.extsi %5 : vector<200704xi8> to vector<200704xi32>
%1072 = arith.extsi %in : i8 to i32
^
dpn68_vaiq.default.onnx.linalg.mlir:1017:15: note: %7 = arith.sitofp %6 : vector<200704xi32> to vector<200704xf32>
%1073 = arith.sitofp %1072 : i32 to f32
^
dpn68_vaiq.default.onnx.linalg.mlir:1018:15: note: %8 = arith.mulf %7, %cst_3 : vector<200704xf32>
%1074 = arith.mulf %1073, %cst_31 : f32
^
dpn68_vaiq.default.onnx.linalg.mlir:1018:15: note: %9 = vector.transfer_write %8, %4[%c0] {in_bounds = [true]} : vector<200704xf32>, tensor<200704xf32>
dpn68_vaiq.default.onnx.linalg.mlir:1243:12: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
%180 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%179 : tensor<1x64x56x56xf32>) outs(%107 : tensor<1x64x56x56xi8>) {
^
dpn68_vaiq.default.onnx.linalg.mlir:9:3: note: called from
func.func @main_graph(%arg0: tensor<1x3x224x224xf32>) -> tensor<1x1000xf32> {
^
dpn68_vaiq.default.onnx.linalg.mlir:1243:12: note: see current operation:
"hal.executable.variant"() ({
"hal.executable.export"() ({
^bb0(%arg8: !hal.device):
%62 = "arith.constant"() <{value = 2 : index}> : () -> index
%63 = "arith.constant"() <{value = 8 : index}> : () -> index
%64 = "arith.constant"() <{value = 1 : index}> : () -> index
"hal.return"(%62, %63, %64) : (index, index, index) -> ()
}) {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>], layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>, ordinal = 0 : index, sym_name = "main_graph_dispatch_34_elementwise_64x56x56_f32xf32xi8"} : () -> ()
"builtin.module"() ({
"func.func"() <{function_type = () -> (), sym_name = "main_graph_dispatch_34_elementwise_64x56x56_f32xf32xi8"}> ({
%0 = "arith.constant"() <{value = dense<1.270000e+02> : vector<1x1x4xf32>}> : () -> vector<1x1x4xf32>
%1 = "arith.constant"() <{value = dense<-1.280000e+02> : vector<1x1x4xf32>}> : () -> vector<1x1x4xf32>
%2 = "arith.constant"() <{value = dense<0.000000e+00> : vector<1x1x4xf32>}> : () -> vector<1x1x4xf32>
%3 = "arith.constant"() <{value = dense<1.562500e-02> : vector<1x1x4xf32>}> : () -> vector<1x1x4xf32>
%4 = "arith.constant"() <{value = dense<1.562500e-02> : vector<200704xf32>}> : () -> vector<200704xf32>
%5 = "arith.constant"() <{value = 0 : i8}> : () -> i8
%6 = "arith.constant"() <{value = 4 : index}> : () -> index
%7 = "arith.constant"() <{value = 1 : index}> : () -> index
%8 = "arith.constant"() <{value = 28 : index}> : () -> index
%9 = "arith.constant"() <{value = 8 : index}> : () -> index
%10 = "arith.constant"() <{value = 0 : index}> : () -> index
%11 = "arith.constant"() <{value = 64 : index}> : () -> index
%12 = "arith.constant"() <{value = 56 : index}> : () -> index
%13 = "arith.constant"() <{value = 0.000000e+00 : f32}> : () -> f32
%14 = "arith.constant"() <{value = 2408448 : index}> : () -> index
%15 = "arith.constant"() <{value = 2207744 : index}> : () -> index
%16 = "arith.constant"() <{value = 802816 : index}> : () -> index
%17 = "hal.interface.binding.subspan"(%14) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 1 : i32, layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>, operandSegmentSizes = array<i32: 1, 0>, set = 0 : index} : (index) -> !flow.dispatch.tensor<readonly:tensor<1x80x56x56xf32>>
%18 = "hal.interface.binding.subspan"(%15) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 1 : i32, layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>, operandSegmentSizes = array<i32: 1, 0>, set = 0 : index} : (index) -> !flow.dispatch.tensor<readonly:tensor<200704xi8>>
%19 = "hal.interface.binding.subspan"(%16) {alignment = 64 : index, binding = 1 : index, layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>, operandSegmentSizes = array<i32: 1, 0>, set = 0 : index} : (index) -> !flow.dispatch.tensor<writeonly:tensor<64x56x56xi8>>
%20 = "flow.dispatch.tensor.load"(%18) <{operandSegmentSizes = array<i32: 1, 0, 0, 0, 0>, static_offsets = array<i64: 0>, static_sizes = array<i64: 200704>, static_strides = array<i64: 1>}> : (!flow.dispatch.tensor<readonly:tensor<200704xi8>>) -> tensor<200704xi8>
%21 = "tensor.empty"() : () -> tensor<200704xf32>
%22 = "vector.transfer_read"(%20, %10, %5) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 1, 0>, permutation_map = affine_map<(d0) -> (d0)>}> : (tensor<200704xi8>, index, i8) -> vector<200704xi8>
%23 = "arith.extsi"(%22) : (vector<200704xi8>) -> vector<200704xi32>
%24 = "arith.sitofp"(%23) : (vector<200704xi32>) -> vector<200704xf32>
%25 = "arith.mulf"(%24, %4) <{fastmath = #arith.fastmath<none>}> : (vector<200704xf32>, vector<200704xf32>) -> vector<200704xf32>
%26 = "vector.transfer_write"(%25, %21, %10) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 1, 0>, permutation_map = affine_map<(d0) -> (d0)>}> : (vector<200704xf32>, tensor<200704xf32>, index) -> tensor<200704xf32>
%27 = "tensor.expand_shape"(%26) <{reassociation = [[0, 1, 2]], static_output_shape = array<i64: 64, 56, 56>}> : (tensor<200704xf32>) -> tensor<64x56x56xf32>
%28 = "hal.interface.workgroup.id"() {dimension = 0 : index} : () -> index
%29 = "hal.interface.workgroup.count"() {dimension = 0 : index} : () -> index
%30 = "hal.interface.workgroup.id"() {dimension = 1 : index} : () -> index
%31 = "hal.interface.workgroup.count"() {dimension = 1 : index} : () -> index
%32 = "affine.apply"(%30) <{map = affine_map<()[s0] -> (s0 * 8)>}> : (index) -> index
%33 = "affine.apply"(%31) <{map = affine_map<()[s0] -> (s0 * 8)>}> : (index) -> index
%34 = "affine.apply"(%28) <{map = affine_map<()[s0] -> (s0 * 28)>}> : (index) -> index
%35 = "affine.apply"(%29) <{map = affine_map<()[s0] -> (s0 * 28)>}> : (index) -> index
"scf.for"(%32, %11, %33) ({
^bb0(%arg0: index):
"scf.for"(%34, %12, %35) ({
^bb0(%arg1: index):
%36 = "flow.dispatch.tensor.load"(%19, %arg0, %arg1) <{operandSegmentSizes = array<i32: 1, 0, 2, 0, 0>, static_offsets = array<i64: -9223372036854775808, -9223372036854775808, 0>, static_sizes = array<i64: 8, 28, 56>, static_strides = array<i64: 1, 1, 1>}> : (!flow.dispatch.tensor<writeonly:tensor<64x56x56xi8>>, index, index) -> tensor<8x28x56xi8>
%37 = "flow.dispatch.tensor.load"(%17, %arg0, %arg1) <{operandSegmentSizes = array<i32: 1, 0, 2, 0, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, 0>, static_sizes = array<i64: 1, 8, 28, 56>, static_strides = array<i64: 1, 1, 1, 1>}> : (!flow.dispatch.tensor<readonly:tensor<1x80x56x56xf32>>, index, index) -> tensor<8x28x56xf32>
%38 = "scf.for"(%10, %9, %7, %36) ({
^bb0(%arg2: index, %arg3: tensor<8x28x56xi8>):
%39 = "scf.for"(%10, %8, %7, %arg3) ({
^bb0(%arg4: index, %arg5: tensor<8x28x56xi8>):
%40 = "scf.for"(%10, %12, %6, %arg5) ({
^bb0(%arg6: index, %arg7: tensor<8x28x56xi8>):
%41 = "arith.addi"(%arg2, %arg0) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
%42 = "arith.addi"(%arg4, %arg1) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
%43 = "vector.transfer_read"(%27, %41, %42, %arg6, %13) <{in_bounds = [true, true, true], operandSegmentSizes = array<i32: 1, 3, 1, 0>, permutation_map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>}> : (tensor<64x56x56xf32>, index, index, index, f32) -> vector<1x1x4xf32>
%44 = "vector.transfer_read"(%37, %arg2, %arg4, %arg6, %13) <{in_bounds = [true, true, true], operandSegmentSizes = array<i32: 1, 3, 1, 0>, permutation_map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>}> : (tensor<8x28x56xf32>, index, index, index, f32) -> vector<1x1x4xf32>
%45 = "arith.divf"(%44, %3) <{fastmath = #arith.fastmath<none>}> : (vector<1x1x4xf32>, vector<1x1x4xf32>) -> vector<1x1x4xf32>
%46 = "math.roundeven"(%45) <{fastmath = #arith.fastmath<none>}> : (vector<1x1x4xf32>) -> vector<1x1x4xf32>
%47 = "arith.addf"(%46, %2) <{fastmath = #arith.fastmath<none>}> : (vector<1x1x4xf32>, vector<1x1x4xf32>) -> vector<1x1x4xf32>
%48 = "arith.maximumf"(%47, %1) <{fastmath = #arith.fastmath<none>}> : (vector<1x1x4xf32>, vector<1x1x4xf32>) -> vector<1x1x4xf32>
%49 = "arith.minimumf"(%48, %0) <{fastmath = #arith.fastmath<none>}> : (vector<1x1x4xf32>, vector<1x1x4xf32>) -> vector<1x1x4xf32>
%50 = "arith.fptosi"(%49) : (vector<1x1x4xf32>) -> vector<1x1x4xi8>
%51 = "arith.extsi"(%50) : (vector<1x1x4xi8>) -> vector<1x1x4xi32>
%52 = "arith.sitofp"(%51) : (vector<1x1x4xi32>) -> vector<1x1x4xf32>
%53 = "arith.mulf"(%52, %3) <{fastmath = #arith.fastmath<none>}> : (vector<1x1x4xf32>, vector<1x1x4xf32>) -> vector<1x1x4xf32>
%54 = "arith.addf"(%43, %53) <{fastmath = #arith.fastmath<none>}> : (vector<1x1x4xf32>, vector<1x1x4xf32>) -> vector<1x1x4xf32>
%55 = "arith.divf"(%54, %3) <{fastmath = #arith.fastmath<none>}> : (vector<1x1x4xf32>, vector<1x1x4xf32>) -> vector<1x1x4xf32>
%56 = "math.roundeven"(%55) <{fastmath = #arith.fastmath<none>}> : (vector<1x1x4xf32>) -> vector<1x1x4xf32>
%57 = "arith.addf"(%56, %2) <{fastmath = #arith.fastmath<none>}> : (vector<1x1x4xf32>, vector<1x1x4xf32>) -> vector<1x1x4xf32>
%58 = "arith.maximumf"(%57, %1) <{fastmath = #arith.fastmath<none>}> : (vector<1x1x4xf32>, vector<1x1x4xf32>) -> vector<1x1x4xf32>
%59 = "arith.minimumf"(%58, %0) <{fastmath = #arith.fastmath<none>}> : (vector<1x1x4xf32>, vector<1x1x4xf32>) -> vector<1x1x4xf32>
%60 = "arith.fptosi"(%59) : (vector<1x1x4xf32>) -> vector<1x1x4xi8>
%61 = "vector.transfer_write"(%60, %arg7, %arg2, %arg4, %arg6) <{in_bounds = [true, true, true], operandSegmentSizes = array<i32: 1, 1, 3, 0>, permutation_map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>}> : (vector<1x1x4xi8>, tensor<8x28x56xi8>, index, index, index) -> tensor<8x28x56xi8>
"scf.yield"(%61) : (tensor<8x28x56xi8>) -> ()
}) : (index, index, index, tensor<8x28x56xi8>) -> tensor<8x28x56xi8>
"scf.yield"(%40) : (tensor<8x28x56xi8>) -> ()
}) : (index, index, index, tensor<8x28x56xi8>) -> tensor<8x28x56xi8>
"scf.yield"(%39) : (tensor<8x28x56xi8>) -> ()
}) : (index, index, index, tensor<8x28x56xi8>) -> tensor<8x28x56xi8>
"flow.dispatch.tensor.store"(%38, %19, %arg0, %arg1) <{operandSegmentSizes = array<i32: 1, 1, 0, 2, 0, 0>, static_offsets = array<i64: -9223372036854775808, -9223372036854775808, 0>, static_sizes = array<i64: 8, 28, 56>, static_strides = array<i64: 1, 1, 1>}> : (tensor<8x28x56xi8>, !flow.dispatch.tensor<writeonly:tensor<64x56x56xi8>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"func.return"() : () -> ()
}) {translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert>} : () -> ()
}) : () -> ()
"hal.executable.variant_end"() : () -> ()
}) {sym_name = "embedded_elf_x86_64", target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>} : () -> ()
%180 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%179 : tensor<1x64x56x56xf32>) outs(%107 : tensor<1x64x56x56xi8>) {
^
dpn68_vaiq.default.onnx.linalg.mlir:1497:12: error: One or more operations with large vector sizes (8192 bytes) were found:
%215 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%214 : tensor<1x64x56x56xi8>) outs(%115 : tensor<1x64x56x56xf32>) {
^
dpn68_vaiq.default.onnx.linalg.mlir:9:3: note: called from
func.func @main_graph(%arg0: tensor<1x3x224x224xf32>) -> tensor<1x1000xf32> {
^
<unknown>:0: note: %cst_3 = arith.constant dense<1.562500e-02> : vector<200704xf32>
dpn68_vaiq.default.onnx.linalg.mlir:1263:12: note: %5 = vector.transfer_read %3[%c0], %c0_i8 {in_bounds = [true]} : tensor<200704xi8>, vector<200704xi8>
%182 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%180 : tensor<1x64x56x56xi8>) outs(%115 : tensor<1x64x56x56xf32>) {
^
dpn68_vaiq.default.onnx.linalg.mlir:1265:15: note: %6 = arith.extsi %5 : vector<200704xi8> to vector<200704xi32>
%1072 = arith.extsi %in : i8 to i32
^
dpn68_vaiq.default.onnx.linalg.mlir:1266:15: note: %7 = arith.sitofp %6 : vector<200704xi32> to vector<200704xf32>
%1073 = arith.sitofp %1072 : i32 to f32
^
dpn68_vaiq.default.onnx.linalg.mlir:1267:15: note: %8 = arith.mulf %7, %cst_3 : vector<200704xf32>
%1074 = arith.mulf %1073, %cst_31 : f32
^
dpn68_vaiq.default.onnx.linalg.mlir:1267:15: note: %9 = vector.transfer_write %8, %4[%c0] {in_bounds = [true]} : vector<200704xf32>, tensor<200704xf32>
dpn68_vaiq.default.onnx.linalg.mlir:1497:12: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
%215 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%214 : tensor<1x64x56x56xi8>) outs(%115 : tensor<1x64x56x56xf32>) {
^
dpn68_vaiq.default.onnx.linalg.mlir:9:3: note: called from
func.func @main_graph(%arg0: tensor<1x3x224x224xf32>) -> tensor<1x1000xf32> {
^
dpn68_vaiq.default.onnx.linalg.mlir:1497:12: note: see current operation:
"hal.executable.variant"() ({
"hal.executable.export"() ({
^bb0(%arg8: !hal.device):
%65 = "arith.constant"() <{value = 2 : index}> : () -> index
%66 = "arith.constant"() <{value = 8 : index}> : () -> index
%67 = "arith.constant"() <{value = 1 : index}> : () -> index
"hal.return"(%65, %66, %67) : (index, index, index) -> ()
}) {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>], layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>, ordinal = 0 : index, sym_name = "main_graph_dispatch_47_elementwise_64x56x56_f32"} : () -> ()
"builtin.module"() ({
"func.func"() <{function_type = () -> (), sym_name = "main_graph_dispatch_47_elementwise_64x56x56_f32"}> ({
%0 = "arith.constant"() <{value = dense<1.270000e+02> : vector<1x1x4xf32>}> : () -> vector<1x1x4xf32>
%1 = "arith.constant"() <{value = dense<-1.280000e+02> : vector<1x1x4xf32>}> : () -> vector<1x1x4xf32>
%2 = "arith.constant"() <{value = dense<0.000000e+00> : vector<1x1x4xf32>}> : () -> vector<1x1x4xf32>
%3 = "arith.constant"() <{value = dense<1.562500e-02> : vector<1x1x4xf32>}> : () -> vector<1x1x4xf32>
%4 = "arith.constant"() <{value = dense<1.562500e-02> : vector<200704xf32>}> : () -> vector<200704xf32>
%5 = "arith.constant"() <{value = 0 : i8}> : () -> i8
%6 = "arith.constant"() <{value = 4 : index}> : () -> index
%7 = "arith.constant"() <{value = 1 : index}> : () -> index
%8 = "arith.constant"() <{value = 28 : index}> : () -> index
%9 = "arith.constant"() <{value = 8 : index}> : () -> index
%10 = "arith.constant"() <{value = 0 : index}> : () -> index
%11 = "arith.constant"() <{value = 64 : index}> : () -> index
%12 = "arith.constant"() <{value = 56 : index}> : () -> index
%13 = "arith.constant"() <{value = 0.000000e+00 : f32}> : () -> f32
%14 = "arith.constant"() <{value = 2007040 : index}> : () -> index
%15 = "arith.constant"() <{value = 802816 : index}> : () -> index
%16 = "arith.constant"() <{value = 1003520 : index}> : () -> index
%17 = "hal.interface.binding.subspan"(%14) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 1 : i32, layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>, operandSegmentSizes = array<i32: 1, 0>, set = 0 : index} : (index) -> !flow.dispatch.tensor<readonly:tensor<1x80x56x56xf32>>
%18 = "hal.interface.binding.subspan"(%15) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 1 : i32, layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>, operandSegmentSizes = array<i32: 1, 0>, set = 0 : index} : (index) -> !flow.dispatch.tensor<readonly:tensor<200704xi8>>
%19 = "hal.interface.binding.subspan"(%16) {alignment = 64 : index, binding = 1 : index, layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>, operandSegmentSizes = array<i32: 1, 0>, set = 0 : index} : (index) -> !flow.dispatch.tensor<writeonly:tensor<64x56x56xf32>>
%20 = "flow.dispatch.tensor.load"(%18) <{operandSegmentSizes = array<i32: 1, 0, 0, 0, 0>, static_offsets = array<i64: 0>, static_sizes = array<i64: 200704>, static_strides = array<i64: 1>}> : (!flow.dispatch.tensor<readonly:tensor<200704xi8>>) -> tensor<200704xi8>
%21 = "tensor.empty"() : () -> tensor<200704xf32>
%22 = "vector.transfer_read"(%20, %10, %5) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 1, 0>, permutation_map = affine_map<(d0) -> (d0)>}> : (tensor<200704xi8>, index, i8) -> vector<200704xi8>
%23 = "arith.extsi"(%22) : (vector<200704xi8>) -> vector<200704xi32>
%24 = "arith.sitofp"(%23) : (vector<200704xi32>) -> vector<200704xf32>
%25 = "arith.mulf"(%24, %4) <{fastmath = #arith.fastmath<none>}> : (vector<200704xf32>, vector<200704xf32>) -> vector<200704xf32>
%26 = "vector.transfer_write"(%25, %21, %10) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 1, 0>, permutation_map = affine_map<(d0) -> (d0)>}> : (vector<200704xf32>, tensor<200704xf32>, index) -> tensor<200704xf32>
%27 = "tensor.expand_shape"(%26) <{reassociation = [[0, 1, 2]], static_output_shape = array<i64: 64, 56, 56>}> : (tensor<200704xf32>) -> tensor<64x56x56xf32>
%28 = "hal.interface.workgroup.id"() {dimension = 0 : index} : () -> index
%29 = "hal.interface.workgroup.count"() {dimension = 0 : index} : () -> index
%30 = "hal.interface.workgroup.id"() {dimension = 1 : index} : () -> index
%31 = "hal.interface.workgroup.count"() {dimension = 1 : index} : () -> index
%32 = "affine.apply"(%30) <{map = affine_map<()[s0] -> (s0 * 8)>}> : (index) -> index
%33 = "affine.apply"(%31) <{map = affine_map<()[s0] -> (s0 * 8)>}> : (index) -> index
%34 = "affine.apply"(%28) <{map = affine_map<()[s0] -> (s0 * 28)>}> : (index) -> index
%35 = "affine.apply"(%29) <{map = affine_map<()[s0] -> (s0 * 28)>}> : (index) -> index
"scf.for"(%32, %11, %33) ({
^bb0(%arg0: index):
"scf.for"(%34, %12, %35) ({
^bb0(%arg1: index):
%36 = "flow.dispatch.tensor.load"(%19, %arg0, %arg1) <{operandSegmentSizes = array<i32: 1, 0, 2, 0, 0>, static_offsets = array<i64: -9223372036854775808, -9223372036854775808, 0>, static_sizes = array<i64: 8, 28, 56>, static_strides = array<i64: 1, 1, 1>}> : (!flow.dispatch.tensor<writeonly:tensor<64x56x56xf32>>, index, index) -> tensor<8x28x56xf32>
%37 = "flow.dispatch.tensor.load"(%17, %arg0, %arg1) <{operandSegmentSizes = array<i32: 1, 0, 2, 0, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, 0>, static_sizes = array<i64: 1, 8, 28, 56>, static_strides = array<i64: 1, 1, 1, 1>}> : (!flow.dispatch.tensor<readonly:tensor<1x80x56x56xf32>>, index, index) -> tensor<8x28x56xf32>
%38 = "scf.for"(%10, %9, %7, %36) ({
^bb0(%arg2: index, %arg3: tensor<8x28x56xf32>):
%39 = "scf.for"(%10, %8, %7, %arg3) ({
^bb0(%arg4: index, %arg5: tensor<8x28x56xf32>):
%40 = "scf.for"(%10, %12, %6, %arg5) ({
^bb0(%arg6: index, %arg7: tensor<8x28x56xf32>):
%41 = "arith.addi"(%arg2, %arg0) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
%42 = "arith.addi"(%arg4, %arg1) <{overflowFlags = #arith.overflow<none>}> : (index, index) -> index
%43 = "vector.transfer_read"(%27, %41, %42, %arg6, %13) <{in_bounds = [true, true, true], operandSegmentSizes = array<i32: 1, 3, 1, 0>, permutation_map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>}> : (tensor<64x56x56xf32>, index, index, index, f32) -> vector<1x1x4xf32>
%44 = "vector.transfer_read"(%37, %arg2, %arg4, %arg6, %13) <{in_bounds = [true, true, true], operandSegmentSizes = array<i32: 1, 3, 1, 0>, permutation_map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>}> : (tensor<8x28x56xf32>, index, index, index, f32) -> vector<1x1x4xf32>
%45 = "arith.divf"(%44, %3) <{fastmath = #arith.fastmath<none>}> : (vector<1x1x4xf32>, vector<1x1x4xf32>) -> vector<1x1x4xf32>
%46 = "math.roundeven"(%45) <{fastmath = #arith.fastmath<none>}> : (vector<1x1x4xf32>) -> vector<1x1x4xf32>
%47 = "arith.addf"(%46, %2) <{fastmath = #arith.fastmath<none>}> : (vector<1x1x4xf32>, vector<1x1x4xf32>) -> vector<1x1x4xf32>
%48 = "arith.maximumf"(%47, %1) <{fastmath = #arith.fastmath<none>}> : (vector<1x1x4xf32>, vector<1x1x4xf32>) -> vector<1x1x4xf32>
%49 = "arith.minimumf"(%48, %0) <{fastmath = #arith.fastmath<none>}> : (vector<1x1x4xf32>, vector<1x1x4xf32>) -> vector<1x1x4xf32>
%50 = "arith.fptosi"(%49) : (vector<1x1x4xf32>) -> vector<1x1x4xi8>
%51 = "arith.extsi"(%50) : (vector<1x1x4xi8>) -> vector<1x1x4xi32>
%52 = "arith.sitofp"(%51) : (vector<1x1x4xi32>) -> vector<1x1x4xf32>
%53 = "arith.mulf"(%52, %3) <{fastmath = #arith.fastmath<none>}> : (vector<1x1x4xf32>, vector<1x1x4xf32>) -> vector<1x1x4xf32>
%54 = "arith.addf"(%43, %53) <{fastmath = #arith.fastmath<none>}> : (vector<1x1x4xf32>, vector<1x1x4xf32>) -> vector<1x1x4xf32>
%55 = "arith.divf"(%54, %3) <{fastmath = #arith.fastmath<none>}> : (vector<1x1x4xf32>, vector<1x1x4xf32>) -> vector<1x1x4xf32>
%56 = "math.roundeven"(%55) <{fastmath = #arith.fastmath<none>}> : (vector<1x1x4xf32>) -> vector<1x1x4xf32>
%57 = "arith.addf"(%56, %2) <{fastmath = #arith.fastmath<none>}> : (vector<1x1x4xf32>, vector<1x1x4xf32>) -> vector<1x1x4xf32>
%58 = "arith.maximumf"(%57, %1) <{fastmath = #arith.fastmath<none>}> : (vector<1x1x4xf32>, vector<1x1x4xf32>) -> vector<1x1x4xf32>
%59 = "arith.minimumf"(%58, %0) <{fastmath = #arith.fastmath<none>}> : (vector<1x1x4xf32>, vector<1x1x4xf32>) -> vector<1x1x4xf32>
%60 = "arith.fptosi"(%59) : (vector<1x1x4xf32>) -> vector<1x1x4xi8>
%61 = "arith.extsi"(%60) : (vector<1x1x4xi8>) -> vector<1x1x4xi32>
%62 = "arith.sitofp"(%61) : (vector<1x1x4xi32>) -> vector<1x1x4xf32>
%63 = "arith.mulf"(%62, %3) <{fastmath = #arith.fastmath<none>}> : (vector<1x1x4xf32>, vector<1x1x4xf32>) -> vector<1x1x4xf32>
%64 = "vector.transfer_write"(%63, %arg7, %arg2, %arg4, %arg6) <{in_bounds = [true, true, true], operandSegmentSizes = array<i32: 1, 1, 3, 0>, permutation_map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>}> : (vector<1x1x4xf32>, tensor<8x28x56xf32>, index, index, index) -> tensor<8x28x56xf32>
"scf.yield"(%64) : (tensor<8x28x56xf32>) -> ()
}) : (index, index, index, tensor<8x28x56xf32>) -> tensor<8x28x56xf32>
"scf.yield"(%40) : (tensor<8x28x56xf32>) -> ()
}) : (index, index, index, tensor<8x28x56xf32>) -> tensor<8x28x56xf32>
"scf.yield"(%39) : (tensor<8x28x56xf32>) -> ()
}) : (index, index, index, tensor<8x28x56xf32>) -> tensor<8x28x56xf32>
"flow.dispatch.tensor.store"(%38, %19, %arg0, %arg1) <{operandSegmentSizes = array<i32: 1, 1, 0, 2, 0, 0>, static_offsets = array<i64: -9223372036854775808, -9223372036854775808, 0>, static_sizes = array<i64: 8, 28, 56>, static_strides = array<i64: 1, 1, 1>}> : (tensor<8x28x56xf32>, !flow.dispatch.tensor<writeonly:tensor<64x56x56xf32>>, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"func.return"() : () -> ()
}) {translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert>} : () -> ()
}) : () -> ()
"hal.executable.variant_end"() : () -> ()
}) {sym_name = "embedded_elf_x86_64", target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>} : () -> ()
%215 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%214 : tensor<1x64x56x56xi8>) outs(%115 : tensor<1x64x56x56xf32>) {
^
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment