AmosLewis · June 12, 2024 03:35
diff --git a/ConvNext_vaiq_int8.iree-compile.log b/ConvNext_vaiq_int8.iree-compile.log
 failed to translate executables
 failed to translate executables
 ConvNeXt_vaiq_int8.default.onnx.linalg.mlir:979:12: error: 'func.func' op exceeded stack allocation limit of 32768 bytes for function. Got 401408 bytes
    %106 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%105 : tensor<1x56x56x512xf32>) outs(%98 : tensor<1x56x56x512xi8>) {
           ^
 ConvNeXt_vaiq_int8.default.onnx.linalg.mlir:24:3: note: called from
  func.func @torch_jit(%arg0: tensor<1x3x224x224xf32>) -> tensor<1x1000xf32> {
  ^
 ConvNeXt_vaiq_int8.default.onnx.linalg.mlir:979:12: note: see current operation: 
 "func.func"() <{function_type = () -> (), sym_name = "torch_jit_dispatch_13_quantized_batch_matmul_56x56x512x128_i8xi8xi32xi32xi32"}> ({
  %0 = "arith.constant"() <{value = 3.750000e+00 : f32}> : () -> f32
  %1 = "arith.constant"() <{value = 2.000000e+00 : f32}> : () -> f32
  %2 = "arith.constant"() <{value = 8.000000e-01 : f32}> : () -> f32
  %3 = "arith.constant"() <{value = 0.0821908935 : f32}> : () -> f32
  %4 = "arith.constant"() <{value = -0.583389878 : f32}> : () -> f32
  %5 = "arith.constant"() <{value = 1.62705934 : f32}> : () -> f32
  %6 = "arith.constant"() <{value = -2.0606916 : f32}> : () -> f32
  %7 = "arith.constant"() <{value = 0.0572442785 : f32}> : () -> f32
  %8 = "arith.constant"() <{value = -0.0883462652 : f32}> : () -> f32
  %9 = "arith.constant"() <{value = 0.448369086 : f32}> : () -> f32
  %10 = "arith.constant"() <{value = -3.276070e-01 : f32}> : () -> f32
  %11 = "arith.constant"() <{value = 0.0739796459 : f32}> : () -> f32
  %12 = "arith.constant"() <{value = -0.131808966 : f32}> : () -> f32
  %13 = "arith.constant"() <{value = 0.519230127 : f32}> : () -> f32
  %14 = "arith.constant"() <{value = -0.463513821 : f32}> : () -> f32
  %15 = "arith.constant"() <{value = -1.71048032E-5 : f32}> : () -> f32
  %16 = "arith.constant"() <{value = 2.53447099E-4 : f32}> : () -> f32
  %17 = "arith.constant"() <{value = -0.00141373626 : f32}> : () -> f32
  %18 = "arith.constant"() <{value = 0.00351961935 : f32}> : () -> f32
  %19 = "arith.constant"() <{value = -0.00330093061 : f32}> : () -> f32
  %20 = "arith.constant"() <{value = 0.0370645523 : f32}> : () -> f32
  %21 = "arith.constant"() <{value = 0.118407398 : f32}> : () -> f32
  %22 = "arith.constant"() <{value = -0.364721417 : f32}> : () -> f32
  %23 = "arith.constant"() <{value = 1.12750685 : f32}> : () -> f32
  %24 = "arith.constant"() <{value = 0.0258146804 : f32}> : () -> f32
  %25 = "arith.constant"() <{value = 0.209741712 : f32}> : () -> f32
  %26 = "arith.constant"() <{value = -0.523018539 : f32}> : () -> f32
  %27 = "arith.constant"() <{value = 1.12837911 : f32}> : () -> f32
  %28 = "arith.constant"() <{value = 128 : index}> : () -> index
  %29 = "arith.constant"() <{value = 64 : index}> : () -> index
  %30 = "arith.constant"() <{value = 1 : index}> : () -> index
  %31 = "arith.constant"() <{value = 28 : index}> : () -> index
  %32 = "arith.constant"() <{value = 0 : index}> : () -> index
  %33 = "arith.constant"() <{value = 86213696 : index}> : () -> index
  %34 = "arith.constant"() <{value = 2420992 : index}> : () -> index
  %35 = "arith.constant"() <{value = 2019584 : index}> : () -> index
  %36 = "arith.constant"() <{value = 5.000000e-01 : f32}> : () -> f32
  %37 = "arith.constant"() <{value = 1.000000e+00 : f32}> : () -> f32
  %38 = "arith.constant"() <{value = 1.41421354 : f32}> : () -> f32
  %39 = "arith.constant"() <{value = 1.270000e+02 : f32}> : () -> f32
  %40 = "arith.constant"() <{value = -1.280000e+02 : f32}> : () -> f32
  %41 = "arith.constant"() <{value = 0.000000e+00 : f32}> : () -> f32
  %42 = "arith.constant"() <{value = 1.250000e-01 : f32}> : () -> f32
  %43 = "arith.constant"() <{value = 9.765625E-4 : f32}> : () -> f32
  %44 = "arith.constant"() <{value = 0 : i32}> : () -> i32
  %45 = "arith.constant"() <{value = 56 : index}> : () -> index
  %46 = "arith.constant"() <{value = 512 : index}> : () -> index
  %47 = "memref.alloca"() <{alignment = 64 : i64, operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<28x56x64xi32>
  %48 = "hal.interface.binding.subspan"(%35) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operandSegmentSizes = array<i32: 1, 0>, set = 0 : index} : (index) -> memref<56x56x128xi8, strided<[7168, 128, 1], offset: 2019584>>
  "memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<56x56x128xi8, strided<[7168, 128, 1], offset: 2019584>>) -> ()
  %49 = "hal.interface.binding.subspan"(%34) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operandSegmentSizes = array<i32: 1, 0>, set = 0 : index} : (index) -> memref<56x128x512xi8, strided<[65536, 512, 1], offset: 2420992>>
  "memref.assume_alignment"(%49) <{alignment = 64 : i32}> : (memref<56x128x512xi8, strided<[65536, 512, 1], offset: 2420992>>) -> ()
  %50 = "hal.interface.binding.subspan"(%33) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operandSegmentSizes = array<i32: 1, 0>, set = 0 : index} : (index) -> memref<512xf32, strided<[1], offset: 21553424>>
  "memref.assume_alignment"(%50) <{alignment = 64 : i32}> : (memref<512xf32, strided<[1], offset: 21553424>>) -> ()
  %51 = "hal.interface.binding.subspan"(%32) {alignment = 64 : index, binding = 2 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operandSegmentSizes = array<i32: 1, 0>, set = 0 : index} : (index) -> memref<56x56x512xi8>
  "memref.assume_alignment"(%51) <{alignment = 64 : i32}> : (memref<56x56x512xi8>) -> ()
  %52 = "hal.interface.workgroup.id"() {dimension = 0 : index} : () -> index
  %53 = "hal.interface.workgroup.count"() {dimension = 0 : index} : () -> index
  %54 = "hal.interface.workgroup.id"() {dimension = 1 : index} : () -> index
  %55 = "hal.interface.workgroup.count"() {dimension = 1 : index} : () -> index
  %56 = "affine.apply"(%54) <{map = affine_map<()[s0] -> (s0 * 28)>}> : (index) -> index
  %57 = "affine.apply"(%55) <{map = affine_map<()[s0] -> (s0 * 28)>}> : (index) -> index
  "scf.for"(%56, %45, %57) ({
  ^bb0(%arg0: index):
    %58 = "affine.apply"(%52) <{map = affine_map<()[s0] -> (s0 * 64)>}> : (index) -> index
    %59 = "affine.apply"(%53) <{map = affine_map<()[s0] -> (s0 * 64)>}> : (index) -> index
    "scf.for"(%58, %46, %59) ({
    ^bb0(%arg1: index):
      %60 = "memref.subview"(%51, %arg0, %arg1) <{operandSegmentSizes = array<i32: 1, 2, 0, 0>, static_offsets = array<i64: -9223372036854775808, 0, -9223372036854775808>, static_sizes = array<i64: 28, 56, 64>, static_strides = array<i64: 1, 1, 1>}> : (memref<56x56x512xi8>, index, index) -> memref<28x56x64xi8, strided<[28672, 512, 1], offset: ?>>
      %61 = "memref.subview"(%50, %arg1) <{operandSegmentSizes = array<i32: 1, 1, 0, 0>, static_offsets = array<i64: -9223372036854775808>, static_sizes = array<i64: 64>, static_strides = array<i64: 1>}> : (memref<512xf32, strided<[1], offset: 21553424>>, index) -> memref<64xf32, strided<[1], offset: ?>>
      %62 = "memref.subview"(%48, %arg0) <{operandSegmentSizes = array<i32: 1, 1, 0, 0>, static_offsets = array<i64: -9223372036854775808, 0, 0>, static_sizes = array<i64: 28, 56, 128>, static_strides = array<i64: 1, 1, 1>}> : (memref<56x56x128xi8, strided<[7168, 128, 1], offset: 2019584>>, index) -> memref<28x56x128xi8, strided<[7168, 128, 1], offset: ?>>
      %63 = "memref.subview"(%49, %arg0, %arg1) <{operandSegmentSizes = array<i32: 1, 2, 0, 0>, static_offsets = array<i64: -9223372036854775808, 0, -9223372036854775808>, static_sizes = array<i64: 28, 128, 64>, static_strides = array<i64: 1, 1, 1>}> : (memref<56x128x512xi8, strided<[65536, 512, 1], offset: 2420992>>, index, index) -> memref<28x128x64xi8, strided<[65536, 512, 1], offset: ?>>
      "scf.for"(%32, %31, %30) ({
      ^bb0(%arg9: index):
        "scf.for"(%32, %45, %30) ({
        ^bb0(%arg10: index):
          "scf.for"(%32, %29, %30) ({
          ^bb0(%arg11: index):
            "memref.store"(%44, %47, %arg9, %arg10, %arg11) <{nontemporal = false}> : (i32, memref<28x56x64xi32>, index, index, index) -> ()
            "scf.yield"() : () -> ()
          }) : (index, index, index) -> ()
          "scf.yield"() : () -> ()
        }) : (index, index, index) -> ()
        "scf.yield"() : () -> ()
      }) : (index, index, index) -> ()
      "scf.for"(%32, %31, %30) ({
      ^bb0(%arg5: index):
        "scf.for"(%32, %45, %30) ({
        ^bb0(%arg6: index):
          "scf.for"(%32, %29, %30) ({
          ^bb0(%arg7: index):
            "scf.for"(%32, %28, %30) ({
            ^bb0(%arg8: index):
              %129 = "memref.load"(%62, %arg5, %arg6, %arg8) <{nontemporal = false}> : (memref<28x56x128xi8, strided<[7168, 128, 1], offset: ?>>, index, index, index) -> i8
              %130 = "memref.load"(%63, %arg5, %arg8, %arg7) <{nontemporal = false}> : (memref<28x128x64xi8, strided<[65536, 512, 1], offset: ?>>, index, index, index) -> i8
              %131 = "memref.load"(%47, %arg5, %arg6, %arg7) <{nontemporal = false}> : (memref<28x56x64xi32>, index, index, index) -> i32
              %132 = "arith.extsi"(%129) : (i8) -> i32
              %133 = "arith.extsi"(%130) : (i8) -> i32
              %134 = "arith.muli"(%132, %133) <{overflowFlags = #arith.overflow<none>}> : (i32, i32) -> i32
              %135 = "arith.addi"(%131, %134) <{overflowFlags = #arith.overflow<none>}> : (i32, i32) -> i32
              "memref.store"(%135, %47, %arg5, %arg6, %arg7) <{nontemporal = false}> : (i32, memref<28x56x64xi32>, index, index, index) -> ()
              "scf.yield"() : () -> ()
            }) : (index, index, index) -> ()
            "scf.yield"() : () -> ()
          }) : (index, index, index) -> ()
          "scf.yield"() : () -> ()
        }) : (index, index, index) -> ()
        "scf.yield"() : () -> ()
      }) : (index, index, index) -> ()
      "scf.for"(%32, %31, %30) ({
      ^bb0(%arg2: index):
        "scf.for"(%32, %45, %30) ({
        ^bb0(%arg3: index):
          "scf.for"(%32, %29, %30) ({
          ^bb0(%arg4: index):
            %64 = "memref.load"(%61, %arg4) <{nontemporal = false}> : (memref<64xf32, strided<[1], offset: ?>>, index) -> f32
            %65 = "memref.load"(%47, %arg2, %arg3, %arg4) <{nontemporal = false}> : (memref<28x56x64xi32>, index, index, index) -> i32
            %66 = "arith.sitofp"(%65) : (i32) -> f32
            %67 = "arith.mulf"(%66, %43) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
            %68 = "arith.addf"(%64, %67) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
            %69 = "arith.divf"(%68, %42) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
            %70 = "math.round"(%69) <{fastmath = #arith.fastmath<none>}> : (f32) -> f32
            %71 = "arith.addf"(%70, %41) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
            %72 = "arith.cmpf"(%71, %40) <{fastmath = #arith.fastmath<none>, predicate = 11 : i64}> : (f32, f32) -> i1
            %73 = "arith.cmpf"(%71, %39) <{fastmath = #arith.fastmath<none>, predicate = 9 : i64}> : (f32, f32) -> i1
            %74 = "arith.select"(%72, %40, %71) : (i1, f32, f32) -> f32
            %75 = "arith.select"(%73, %39, %74) : (i1, f32, f32) -> f32
            %76 = "arith.fptosi"(%75) : (f32) -> i8
            %77 = "arith.extsi"(%76) : (i8) -> i32
            %78 = "arith.sitofp"(%77) : (i32) -> f32
            %79 = "arith.mulf"(%78, %42) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
            %80 = "arith.divf"(%79, %38) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
            %81 = "arith.cmpf"(%80, %41) <{fastmath = #arith.fastmath<none>, predicate = 4 : i64}> : (f32, f32) -> i1
            %82 = "arith.negf"(%80) <{fastmath = #arith.fastmath<none>}> : (f32) -> f32
            %83 = "arith.select"(%81, %82, %80) : (i1, f32, f32) -> f32
            %84 = "arith.cmpf"(%83, %2) <{fastmath = #arith.fastmath<none>, predicate = 4 : i64}> : (f32, f32) -> i1
            %85 = "arith.select"(%84, %27, %23) : (i1, f32, f32) -> f32
            %86 = "arith.select"(%84, %14, %10) : (i1, f32, f32) -> f32
            %87 = "arith.select"(%84, %26, %22) : (i1, f32, f32) -> f32
            %88 = "arith.select"(%84, %13, %9) : (i1, f32, f32) -> f32
            %89 = "arith.select"(%84, %25, %21) : (i1, f32, f32) -> f32
            %90 = "arith.select"(%84, %12, %8) : (i1, f32, f32) -> f32
            %91 = "arith.select"(%84, %24, %20) : (i1, f32, f32) -> f32
            %92 = "arith.select"(%84, %11, %7) : (i1, f32, f32) -> f32
            %93 = "arith.cmpf"(%83, %1) <{fastmath = #arith.fastmath<none>, predicate = 4 : i64}> : (f32, f32) -> i1
            %94 = "arith.select"(%93, %41, %19) : (i1, f32, f32) -> f32
            %95 = "arith.select"(%93, %85, %18) : (i1, f32, f32) -> f32
            %96 = "arith.select"(%93, %86, %6) : (i1, f32, f32) -> f32
            %97 = "arith.select"(%93, %87, %17) : (i1, f32, f32) -> f32
            %98 = "arith.select"(%93, %88, %5) : (i1, f32, f32) -> f32
            %99 = "arith.select"(%93, %89, %16) : (i1, f32, f32) -> f32
            %100 = "arith.select"(%93, %90, %4) : (i1, f32, f32) -> f32
            %101 = "arith.select"(%93, %91, %15) : (i1, f32, f32) -> f32
            %102 = "arith.select"(%93, %92, %3) : (i1, f32, f32) -> f32
            %103 = "arith.select"(%93, %41, %37) : (i1, f32, f32) -> f32
            %104 = "arith.cmpf"(%83, %0) <{fastmath = #arith.fastmath<none>, predicate = 11 : i64}> : (f32, f32) -> i1
            %105 = "math.fma"(%83, %101, %99) <{fastmath = #arith.fastmath<none>}> : (f32, f32, f32) -> f32
            %106 = "math.fma"(%83, %105, %97) <{fastmath = #arith.fastmath<none>}> : (f32, f32, f32) -> f32
            %107 = "math.fma"(%83, %106, %95) <{fastmath = #arith.fastmath<none>}> : (f32, f32, f32) -> f32
            %108 = "math.fma"(%83, %107, %94) <{fastmath = #arith.fastmath<none>}> : (f32, f32, f32) -> f32
            %109 = "math.fma"(%83, %102, %100) <{fastmath = #arith.fastmath<none>}> : (f32, f32, f32) -> f32
            %110 = "math.fma"(%83, %109, %98) <{fastmath = #arith.fastmath<none>}> : (f32, f32, f32) -> f32
            %111 = "math.fma"(%83, %110, %96) <{fastmath = #arith.fastmath<none>}> : (f32, f32, f32) -> f32
            %112 = "math.fma"(%83, %111, %37) <{fastmath = #arith.fastmath<none>}> : (f32, f32, f32) -> f32
            %113 = "arith.divf"(%108, %112) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
            %114 = "arith.addf"(%103, %113) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
            %115 = "arith.select"(%104, %114, %37) : (i1, f32, f32) -> f32
            %116 = "arith.negf"(%115) <{fastmath = #arith.fastmath<none>}> : (f32) -> f32
            %117 = "arith.select"(%81, %116, %115) : (i1, f32, f32) -> f32
            %118 = "arith.addf"(%117, %37) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
            %119 = "arith.mulf"(%79, %118) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
            %120 = "arith.mulf"(%119, %36) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
            %121 = "arith.divf"(%120, %42) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
            %122 = "math.round"(%121) <{fastmath = #arith.fastmath<none>}> : (f32) -> f32
            %123 = "arith.addf"(%122, %41) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
            %124 = "arith.cmpf"(%123, %40) <{fastmath = #arith.fastmath<none>, predicate = 11 : i64}> : (f32, f32) -> i1
            %125 = "arith.cmpf"(%123, %39) <{fastmath = #arith.fastmath<none>, predicate = 9 : i64}> : (f32, f32) -> i1
            %126 = "arith.select"(%124, %40, %123) : (i1, f32, f32) -> f32
            %127 = "arith.select"(%125, %39, %126) : (i1, f32, f32) -> f32
            %128 = "arith.fptosi"(%127) : (f32) -> i8
            "memref.store"(%128, %60, %arg2, %arg3, %arg4) <{nontemporal = false}> : (i8, memref<28x56x64xi8, strided<[28672, 512, 1], offset: ?>>, index, index, index) -> ()
            "scf.yield"() : () -> ()
          }) : (index, index, index) -> ()
          "scf.yield"() : () -> ()
        }) : (index, index, index) -> ()
        "scf.yield"() : () -> ()
      }) : (index, index, index) -> ()
      "scf.yield"() : () -> ()
    }) : (index, index, index) -> ()
    "scf.yield"() : () -> ()
  }) : (index, index, index) -> ()
  "func.return"() : () -> ()
 }) {translation_info = #iree_codegen.translation_info<CPUDefault>} : () -> ()
    %106 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%105 : tensor<1x56x56x512xf32>) outs(%98 : tensor<1x56x56x512xi8>) {
           ^
 ConvNeXt_vaiq_int8.default.onnx.linalg.mlir:979:12: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
    %106 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%105 : tensor<1x56x56x512xf32>) outs(%98 : tensor<1x56x56x512xi8>) {
           ^
 ConvNeXt_vaiq_int8.default.onnx.linalg.mlir:24:3: note: called from
  func.func @torch_jit(%arg0: tensor<1x3x224x224xf32>) -> tensor<1x1000xf32> {
  ^
 ConvNeXt_vaiq_int8.default.onnx.linalg.mlir:979:12: note: see current operation: 
 "hal.executable.variant"() ({
  "hal.executable.export"() ({
  ^bb0(%arg12: !hal.device):
    %136 = "arith.constant"() <{value = 8 : index}> : () -> index
    %137 = "arith.constant"() <{value = 2 : index}> : () -> index
    %138 = "arith.constant"() <{value = 1 : index}> : () -> index
    "hal.return"(%136, %137, %138) : (index, index, index) -> ()
  }) {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>], layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>, ordinal = 0 : index, sym_name = "torch_jit_dispatch_13_quantized_batch_matmul_56x56x512x128_i8xi8xi32xi32xi32"} : () -> ()
  "builtin.module"() ({
    "func.func"() <{function_type = () -> (), sym_name = "torch_jit_dispatch_13_quantized_batch_matmul_56x56x512x128_i8xi8xi32xi32xi32"}> ({
      %0 = "arith.constant"() <{value = 3.750000e+00 : f32}> : () -> f32
      %1 = "arith.constant"() <{value = 2.000000e+00 : f32}> : () -> f32
      %2 = "arith.constant"() <{value = 8.000000e-01 : f32}> : () -> f32
      %3 = "arith.constant"() <{value = 0.0821908935 : f32}> : () -> f32
      %4 = "arith.constant"() <{value = -0.583389878 : f32}> : () -> f32
      %5 = "arith.constant"() <{value = 1.62705934 : f32}> : () -> f32
      %6 = "arith.constant"() <{value = -2.0606916 : f32}> : () -> f32
      %7 = "arith.constant"() <{value = 0.0572442785 : f32}> : () -> f32
      %8 = "arith.constant"() <{value = -0.0883462652 : f32}> : () -> f32
      %9 = "arith.constant"() <{value = 0.448369086 : f32}> : () -> f32
      %10 = "arith.constant"() <{value = -3.276070e-01 : f32}> : () -> f32
      %11 = "arith.constant"() <{value = 0.0739796459 : f32}> : () -> f32
      %12 = "arith.constant"() <{value = -0.131808966 : f32}> : () -> f32
      %13 = "arith.constant"() <{value = 0.519230127 : f32}> : () -> f32
      %14 = "arith.constant"() <{value = -0.463513821 : f32}> : () -> f32
      %15 = "arith.constant"() <{value = -1.71048032E-5 : f32}> : () -> f32
      %16 = "arith.constant"() <{value = 2.53447099E-4 : f32}> : () -> f32
      %17 = "arith.constant"() <{value = -0.00141373626 : f32}> : () -> f32
      %18 = "arith.constant"() <{value = 0.00351961935 : f32}> : () -> f32
      %19 = "arith.constant"() <{value = -0.00330093061 : f32}> : () -> f32
      %20 = "arith.constant"() <{value = 0.0370645523 : f32}> : () -> f32
      %21 = "arith.constant"() <{value = 0.118407398 : f32}> : () -> f32
      %22 = "arith.constant"() <{value = -0.364721417 : f32}> : () -> f32
      %23 = "arith.constant"() <{value = 1.12750685 : f32}> : () -> f32
      %24 = "arith.constant"() <{value = 0.0258146804 : f32}> : () -> f32
      %25 = "arith.constant"() <{value = 0.209741712 : f32}> : () -> f32
      %26 = "arith.constant"() <{value = -0.523018539 : f32}> : () -> f32
      %27 = "arith.constant"() <{value = 1.12837911 : f32}> : () -> f32
      %28 = "arith.constant"() <{value = 128 : index}> : () -> index
      %29 = "arith.constant"() <{value = 64 : index}> : () -> index
      %30 = "arith.constant"() <{value = 1 : index}> : () -> index
      %31 = "arith.constant"() <{value = 28 : index}> : () -> index
      %32 = "arith.constant"() <{value = 0 : index}> : () -> index
      %33 = "arith.constant"() <{value = 86213696 : index}> : () -> index
      %34 = "arith.constant"() <{value = 2420992 : index}> : () -> index
      %35 = "arith.constant"() <{value = 2019584 : index}> : () -> index
      %36 = "arith.constant"() <{value = 5.000000e-01 : f32}> : () -> f32
      %37 = "arith.constant"() <{value = 1.000000e+00 : f32}> : () -> f32
      %38 = "arith.constant"() <{value = 1.41421354 : f32}> : () -> f32
      %39 = "arith.constant"() <{value = 1.270000e+02 : f32}> : () -> f32
      %40 = "arith.constant"() <{value = -1.280000e+02 : f32}> : () -> f32
      %41 = "arith.constant"() <{value = 0.000000e+00 : f32}> : () -> f32
      %42 = "arith.constant"() <{value = 1.250000e-01 : f32}> : () -> f32
      %43 = "arith.constant"() <{value = 9.765625E-4 : f32}> : () -> f32
      %44 = "arith.constant"() <{value = 0 : i32}> : () -> i32
      %45 = "arith.constant"() <{value = 56 : index}> : () -> index
      %46 = "arith.constant"() <{value = 512 : index}> : () -> index
      %47 = "memref.alloca"() <{alignment = 64 : i64, operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<28x56x64xi32>
      %48 = "hal.interface.binding.subspan"(%35) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operandSegmentSizes = array<i32: 1, 0>, set = 0 : index} : (index) -> memref<56x56x128xi8, strided<[7168, 128, 1], offset: 2019584>>
      "memref.assume_alignment"(%48) <{alignment = 64 : i32}> : (memref<56x56x128xi8, strided<[7168, 128, 1], offset: 2019584>>) -> ()
      %49 = "hal.interface.binding.subspan"(%34) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operandSegmentSizes = array<i32: 1, 0>, set = 0 : index} : (index) -> memref<56x128x512xi8, strided<[65536, 512, 1], offset: 2420992>>
      "memref.assume_alignment"(%49) <{alignment = 64 : i32}> : (memref<56x128x512xi8, strided<[65536, 512, 1], offset: 2420992>>) -> ()
      %50 = "hal.interface.binding.subspan"(%33) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operandSegmentSizes = array<i32: 1, 0>, set = 0 : index} : (index) -> memref<512xf32, strided<[1], offset: 21553424>>
      "memref.assume_alignment"(%50) <{alignment = 64 : i32}> : (memref<512xf32, strided<[1], offset: 21553424>>) -> ()
      %51 = "hal.interface.binding.subspan"(%32) {alignment = 64 : index, binding = 2 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operandSegmentSizes = array<i32: 1, 0>, set = 0 : index} : (index) -> memref<56x56x512xi8>
      "memref.assume_alignment"(%51) <{alignment = 64 : i32}> : (memref<56x56x512xi8>) -> ()
      %52 = "hal.interface.workgroup.id"() {dimension = 0 : index} : () -> index
      %53 = "hal.interface.workgroup.count"() {dimension = 0 : index} : () -> index
      %54 = "hal.interface.workgroup.id"() {dimension = 1 : index} : () -> index
      %55 = "hal.interface.workgroup.count"() {dimension = 1 : index} : () -> index
      %56 = "affine.apply"(%54) <{map = affine_map<()[s0] -> (s0 * 28)>}> : (index) -> index
      %57 = "affine.apply"(%55) <{map = affine_map<()[s0] -> (s0 * 28)>}> : (index) -> index
      "scf.for"(%56, %45, %57) ({
      ^bb0(%arg0: index):
        %58 = "affine.apply"(%52) <{map = affine_map<()[s0] -> (s0 * 64)>}> : (index) -> index
        %59 = "affine.apply"(%53) <{map = affine_map<()[s0] -> (s0 * 64)>}> : (index) -> index
        "scf.for"(%58, %46, %59) ({
        ^bb0(%arg1: index):
          %60 = "memref.subview"(%51, %arg0, %arg1) <{operandSegmentSizes = array<i32: 1, 2, 0, 0>, static_offsets = array<i64: -9223372036854775808, 0, -9223372036854775808>, static_sizes = array<i64: 28, 56, 64>, static_strides = array<i64: 1, 1, 1>}> : (memref<56x56x512xi8>, index, index) -> memref<28x56x64xi8, strided<[28672, 512, 1], offset: ?>>
          %61 = "memref.subview"(%50, %arg1) <{operandSegmentSizes = array<i32: 1, 1, 0, 0>, static_offsets = array<i64: -9223372036854775808>, static_sizes = array<i64: 64>, static_strides = array<i64: 1>}> : (memref<512xf32, strided<[1], offset: 21553424>>, index) -> memref<64xf32, strided<[1], offset: ?>>
          %62 = "memref.subview"(%48, %arg0) <{operandSegmentSizes = array<i32: 1, 1, 0, 0>, static_offsets = array<i64: -9223372036854775808, 0, 0>, static_sizes = array<i64: 28, 56, 128>, static_strides = array<i64: 1, 1, 1>}> : (memref<56x56x128xi8, strided<[7168, 128, 1], offset: 2019584>>, index) -> memref<28x56x128xi8, strided<[7168, 128, 1], offset: ?>>
          %63 = "memref.subview"(%49, %arg0, %arg1) <{operandSegmentSizes = array<i32: 1, 2, 0, 0>, static_offsets = array<i64: -9223372036854775808, 0, -9223372036854775808>, static_sizes = array<i64: 28, 128, 64>, static_strides = array<i64: 1, 1, 1>}> : (memref<56x128x512xi8, strided<[65536, 512, 1], offset: 2420992>>, index, index) -> memref<28x128x64xi8, strided<[65536, 512, 1], offset: ?>>
          "scf.for"(%32, %31, %30) ({
          ^bb0(%arg9: index):
            "scf.for"(%32, %45, %30) ({
            ^bb0(%arg10: index):
              "scf.for"(%32, %29, %30) ({
              ^bb0(%arg11: index):
                "memref.store"(%44, %47, %arg9, %arg10, %arg11) <{nontemporal = false}> : (i32, memref<28x56x64xi32>, index, index, index) -> ()
                "scf.yield"() : () -> ()
              }) : (index, index, index) -> ()
              "scf.yield"() : () -> ()
            }) : (index, index, index) -> ()
            "scf.yield"() : () -> ()
          }) : (index, index, index) -> ()
          "scf.for"(%32, %31, %30) ({
          ^bb0(%arg5: index):
            "scf.for"(%32, %45, %30) ({
            ^bb0(%arg6: index):
              "scf.for"(%32, %29, %30) ({
              ^bb0(%arg7: index):
                "scf.for"(%32, %28, %30) ({
                ^bb0(%arg8: index):
                  %129 = "memref.load"(%62, %arg5, %arg6, %arg8) <{nontemporal = false}> : (memref<28x56x128xi8, strided<[7168, 128, 1], offset: ?>>, index, index, index) -> i8
                  %130 = "memref.load"(%63, %arg5, %arg8, %arg7) <{nontemporal = false}> : (memref<28x128x64xi8, strided<[65536, 512, 1], offset: ?>>, index, index, index) -> i8
                  %131 = "memref.load"(%47, %arg5, %arg6, %arg7) <{nontemporal = false}> : (memref<28x56x64xi32>, index, index, index) -> i32
                  %132 = "arith.extsi"(%129) : (i8) -> i32
                  %133 = "arith.extsi"(%130) : (i8) -> i32
                  %134 = "arith.muli"(%132, %133) <{overflowFlags = #arith.overflow<none>}> : (i32, i32) -> i32
                  %135 = "arith.addi"(%131, %134) <{overflowFlags = #arith.overflow<none>}> : (i32, i32) -> i32
                  "memref.store"(%135, %47, %arg5, %arg6, %arg7) <{nontemporal = false}> : (i32, memref<28x56x64xi32>, index, index, index) -> ()
                  "scf.yield"() : () -> ()
                }) : (index, index, index) -> ()
                "scf.yield"() : () -> ()
              }) : (index, index, index) -> ()
              "scf.yield"() : () -> ()
            }) : (index, index, index) -> ()
            "scf.yield"() : () -> ()
          }) : (index, index, index) -> ()
          "scf.for"(%32, %31, %30) ({
          ^bb0(%arg2: index):
            "scf.for"(%32, %45, %30) ({
            ^bb0(%arg3: index):
              "scf.for"(%32, %29, %30) ({
              ^bb0(%arg4: index):
                %64 = "memref.load"(%61, %arg4) <{nontemporal = false}> : (memref<64xf32, strided<[1], offset: ?>>, index) -> f32
                %65 = "memref.load"(%47, %arg2, %arg3, %arg4) <{nontemporal = false}> : (memref<28x56x64xi32>, index, index, index) -> i32
                %66 = "arith.sitofp"(%65) : (i32) -> f32
                %67 = "arith.mulf"(%66, %43) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
                %68 = "arith.addf"(%64, %67) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
                %69 = "arith.divf"(%68, %42) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
                %70 = "math.round"(%69) <{fastmath = #arith.fastmath<none>}> : (f32) -> f32
                %71 = "arith.addf"(%70, %41) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
                %72 = "arith.cmpf"(%71, %40) <{fastmath = #arith.fastmath<none>, predicate = 11 : i64}> : (f32, f32) -> i1
                %73 = "arith.cmpf"(%71, %39) <{fastmath = #arith.fastmath<none>, predicate = 9 : i64}> : (f32, f32) -> i1
                %74 = "arith.select"(%72, %40, %71) : (i1, f32, f32) -> f32
                %75 = "arith.select"(%73, %39, %74) : (i1, f32, f32) -> f32
                %76 = "arith.fptosi"(%75) : (f32) -> i8
                %77 = "arith.extsi"(%76) : (i8) -> i32
                %78 = "arith.sitofp"(%77) : (i32) -> f32
                %79 = "arith.mulf"(%78, %42) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
                %80 = "arith.divf"(%79, %38) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
                %81 = "arith.cmpf"(%80, %41) <{fastmath = #arith.fastmath<none>, predicate = 4 : i64}> : (f32, f32) -> i1
                %82 = "arith.negf"(%80) <{fastmath = #arith.fastmath<none>}> : (f32) -> f32
                %83 = "arith.select"(%81, %82, %80) : (i1, f32, f32) -> f32
                %84 = "arith.cmpf"(%83, %2) <{fastmath = #arith.fastmath<none>, predicate = 4 : i64}> : (f32, f32) -> i1
                %85 = "arith.select"(%84, %27, %23) : (i1, f32, f32) -> f32
                %86 = "arith.select"(%84, %14, %10) : (i1, f32, f32) -> f32
                %87 = "arith.select"(%84, %26, %22) : (i1, f32, f32) -> f32
                %88 = "arith.select"(%84, %13, %9) : (i1, f32, f32) -> f32
                %89 = "arith.select"(%84, %25, %21) : (i1, f32, f32) -> f32
                %90 = "arith.select"(%84, %12, %8) : (i1, f32, f32) -> f32
                %91 = "arith.select"(%84, %24, %20) : (i1, f32, f32) -> f32
                %92 = "arith.select"(%84, %11, %7) : (i1, f32, f32) -> f32
                %93 = "arith.cmpf"(%83, %1) <{fastmath = #arith.fastmath<none>, predicate = 4 : i64}> : (f32, f32) -> i1
                %94 = "arith.select"(%93, %41, %19) : (i1, f32, f32) -> f32
                %95 = "arith.select"(%93, %85, %18) : (i1, f32, f32) -> f32
                %96 = "arith.select"(%93, %86, %6) : (i1, f32, f32) -> f32
                %97 = "arith.select"(%93, %87, %17) : (i1, f32, f32) -> f32
                %98 = "arith.select"(%93, %88, %5) : (i1, f32, f32) -> f32
                %99 = "arith.select"(%93, %89, %16) : (i1, f32, f32) -> f32
                %100 = "arith.select"(%93, %90, %4) : (i1, f32, f32) -> f32
                %101 = "arith.select"(%93, %91, %15) : (i1, f32, f32) -> f32
                %102 = "arith.select"(%93, %92, %3) : (i1, f32, f32) -> f32
                %103 = "arith.select"(%93, %41, %37) : (i1, f32, f32) -> f32
                %104 = "arith.cmpf"(%83, %0) <{fastmath = #arith.fastmath<none>, predicate = 11 : i64}> : (f32, f32) -> i1
                %105 = "math.fma"(%83, %101, %99) <{fastmath = #arith.fastmath<none>}> : (f32, f32, f32) -> f32
                %106 = "math.fma"(%83, %105, %97) <{fastmath = #arith.fastmath<none>}> : (f32, f32, f32) -> f32
                %107 = "math.fma"(%83, %106, %95) <{fastmath = #arith.fastmath<none>}> : (f32, f32, f32) -> f32
                %108 = "math.fma"(%83, %107, %94) <{fastmath = #arith.fastmath<none>}> : (f32, f32, f32) -> f32
                %109 = "math.fma"(%83, %102, %100) <{fastmath = #arith.fastmath<none>}> : (f32, f32, f32) -> f32
                %110 = "math.fma"(%83, %109, %98) <{fastmath = #arith.fastmath<none>}> : (f32, f32, f32) -> f32
                %111 = "math.fma"(%83, %110, %96) <{fastmath = #arith.fastmath<none>}> : (f32, f32, f32) -> f32
                %112 = "math.fma"(%83, %111, %37) <{fastmath = #arith.fastmath<none>}> : (f32, f32, f32) -> f32
                %113 = "arith.divf"(%108, %112) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
                %114 = "arith.addf"(%103, %113) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
                %115 = "arith.select"(%104, %114, %37) : (i1, f32, f32) -> f32
                %116 = "arith.negf"(%115) <{fastmath = #arith.fastmath<none>}> : (f32) -> f32
                %117 = "arith.select"(%81, %116, %115) : (i1, f32, f32) -> f32
                %118 = "arith.addf"(%117, %37) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
                %119 = "arith.mulf"(%79, %118) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
                %120 = "arith.mulf"(%119, %36) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
                %121 = "arith.divf"(%120, %42) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
                %122 = "math.round"(%121) <{fastmath = #arith.fastmath<none>}> : (f32) -> f32
                %123 = "arith.addf"(%122, %41) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
                %124 = "arith.cmpf"(%123, %40) <{fastmath = #arith.fastmath<none>, predicate = 11 : i64}> : (f32, f32) -> i1
                %125 = "arith.cmpf"(%123, %39) <{fastmath = #arith.fastmath<none>, predicate = 9 : i64}> : (f32, f32) -> i1
                %126 = "arith.select"(%124, %40, %123) : (i1, f32, f32) -> f32
                %127 = "arith.select"(%125, %39, %126) : (i1, f32, f32) -> f32
                %128 = "arith.fptosi"(%127) : (f32) -> i8
                "memref.store"(%128, %60, %arg2, %arg3, %arg4) <{nontemporal = false}> : (i8, memref<28x56x64xi8, strided<[28672, 512, 1], offset: ?>>, index, index, index) -> ()
                "scf.yield"() : () -> ()
              }) : (index, index, index) -> ()
              "scf.yield"() : () -> ()
            }) : (index, index, index) -> ()
            "scf.yield"() : () -> ()
          }) : (index, index, index) -> ()
          "scf.yield"() : () -> ()
        }) : (index, index, index) -> ()
        "scf.yield"() : () -> ()
      }) : (index, index, index) -> ()
      "func.return"() : () -> ()
    }) {translation_info = #iree_codegen.translation_info<CPUDefault>} : () -> ()
  }) : () -> ()
  "hal.executable.variant_end"() : () -> ()
 }) {sym_name = "embedded_elf_x86_64", target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>} : () -> ()
    %106 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%105 : tensor<1x56x56x512xf32>) outs(%98 : tensor<1x56x56x512xi8>) {
           ^
 ConvNeXt_vaiq_int8.default.onnx.linalg.mlir:1434:12: error: 'func.func' op exceeded stack allocation limit of 32768 bytes for function. Got 401408 bytes
    %174 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%173 : tensor<1x56x56x512xf32>) outs(%98 : tensor<1x56x56x512xi8>) {
           ^
 ConvNeXt_vaiq_int8.default.onnx.linalg.mlir:24:3: note: called from
  func.func @torch_jit(%arg0: tensor<1x3x224x224xf32>) -> tensor<1x1000xf32> {
  ^
 ConvNeXt_vaiq_int8.default.onnx.linalg.mlir:1434:12: note: see current operation: 
 "func.func"() <{function_type = () -> (), sym_name = "torch_jit_dispatch_24_quantized_batch_matmul_56x56x512x128_i8xi8xi32xi32xi32"}> ({
  %0 = "arith.constant"() <{value = 3.750000e+00 : f32}> : () -> f32
  %1 = "arith.constant"() <{value = 2.000000e+00 : f32}> : () -> f32
  %2 = "arith.constant"() <{value = 8.000000e-01 : f32}> : () -> f32
  %3 = "arith.constant"() <{value = 0.0821908935 : f32}> : () -> f32
  %4 = "arith.constant"() <{value = -0.583389878 : f32}> : () -> f32
  %5 = "arith.constant"() <{value = 1.62705934 : f32}> : () -> f32
  %6 = "arith.constant"() <{value = -2.0606916 : f32}> : () -> f32
  %7 = "arith.constant"() <{value = 0.0572442785 : f32}> : () -> f32
  %8 = "arith.constant"() <{value = -0.0883462652 : f32}> : () -> f32
  %9 = "arith.constant"() <{value = 0.448369086 : f32}> : () -> f32
  %10 = "arith.constant"() <{value = -3.276070e-01 : f32}> : () -> f32
  %11 = "arith.constant"() <{value = 0.0739796459 : f32}> : () -> f32
  %12 = "arith.constant"() <{value = -0.131808966 : f32}> : () -> f32
  %13 = "arith.constant"() <{value = 0.519230127 : f32}> : () -> f32
  %14 = "arith.constant"() <{value = -0.463513821 : f32}> : () -> f32
  %15 = "arith.constant"() <{value = -1.71048032E-5 : f32}> : () -> f32
  %16 = "arith.constant"() <{value = 2.53447099E-4 : f32}> : () -> f32
  %17 = "arith.constant"() <{value = -0.00141373626 : f32}> : () -> f32
  %18 = "arith.constant"() <{value = 0.00351961935 : f32}> : () -> f32
  %19 = "arith.constant"() <{value = -0.00330093061 : f32}> : () -> f32
  %20 = "arith.constant"() <{value = 0.0370645523 : f32}> : () -> f32
  %21 = "arith.constant"() <{value = 0.118407398 : f32}> : () -> f32
  %22 = "arith.constant"() <{value = -0.364721417 : f32}> : () -> f32
  %23 = "arith.constant"() <{value = 1.12750685 : f32}> : () -> f32
  %24 = "arith.constant"() <{value = 0.0258146804 : f32}> : () -> f32
  %25 = "arith.constant"() <{value = 0.209741712 : f32}> : () -> f32
  %26 = "arith.constant"() <{value = -0.523018539 : f32}> : () -> f32
  %27 = "arith.constant"() <{value = 1.12837911 : f32}> : () -> f32
  %28 = "arith.constant"() <{value = 128 : index}> : () -> index
  %29 = "arith.constant"() <{value = 64 : index}> : () -> index
  %30 = "arith.constant"() <{value = 1 : index}> : () -> index
  %31 = "arith.constant"() <{value = 28 : index}> : () -> index
  %32 = "arith.constant"() <{value = 0 : index}> : () -> index
  %33 = "arith.constant"() <{value = 802816 : index}> : () -> index
  %34 = "arith.constant"() <{value = 86217280 : index}> : () -> index
  %35 = "arith.constant"() <{value = 2408448 : index}> : () -> index
  %36 = "arith.constant"() <{value = 401408 : index}> : () -> index
  %37 = "arith.constant"() <{value = 3.125000e-02 : f32}> : () -> f32
  %38 = "arith.constant"() <{value = 5.000000e-01 : f32}> : () -> f32
  %39 = "arith.constant"() <{value = 1.000000e+00 : f32}> : () -> f32
  %40 = "arith.constant"() <{value = 1.41421354 : f32}> : () -> f32
  %41 = "arith.constant"() <{value = 1.270000e+02 : f32}> : () -> f32
  %42 = "arith.constant"() <{value = -1.280000e+02 : f32}> : () -> f32
  %43 = "arith.constant"() <{value = 0.000000e+00 : f32}> : () -> f32
  %44 = "arith.constant"() <{value = 6.250000e-02 : f32}> : () -> f32
  %45 = "arith.constant"() <{value = 4.8828125E-4 : f32}> : () -> f32
  %46 = "arith.constant"() <{value = 0 : i32}> : () -> i32
  %47 = "arith.constant"() <{value = 56 : index}> : () -> index
  %48 = "arith.constant"() <{value = 512 : index}> : () -> index
  %49 = "memref.alloca"() <{alignment = 64 : i64, operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<28x56x64xi32>
  %50 = "hal.interface.binding.subspan"(%36) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operandSegmentSizes = array<i32: 1, 0>, set = 0 : index} : (index) -> memref<56x56x128xi8, strided<[7168, 128, 1], offset: 401408>>
  "memref.assume_alignment"(%50) <{alignment = 64 : i32}> : (memref<56x56x128xi8, strided<[7168, 128, 1], offset: 401408>>) -> ()
  %51 = "hal.interface.binding.subspan"(%35) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operandSegmentSizes = array<i32: 1, 0>, set = 0 : index} : (index) -> memref<56x128x512xi8, strided<[65536, 512, 1], offset: 2408448>>
  "memref.assume_alignment"(%51) <{alignment = 64 : i32}> : (memref<56x128x512xi8, strided<[65536, 512, 1], offset: 2408448>>) -> ()
  %52 = "hal.interface.binding.subspan"(%34) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operandSegmentSizes = array<i32: 1, 0>, set = 0 : index} : (index) -> memref<512xf32, strided<[1], offset: 21554320>>
  "memref.assume_alignment"(%52) <{alignment = 64 : i32}> : (memref<512xf32, strided<[1], offset: 21554320>>) -> ()
  %53 = "hal.interface.binding.subspan"(%33) {alignment = 64 : index, binding = 2 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operandSegmentSizes = array<i32: 1, 0>, set = 0 : index} : (index) -> memref<56x56x512xi8, strided<[28672, 512, 1], offset: 802816>>
  "memref.assume_alignment"(%53) <{alignment = 64 : i32}> : (memref<56x56x512xi8, strided<[28672, 512, 1], offset: 802816>>) -> ()
  %54 = "hal.interface.workgroup.id"() {dimension = 0 : index} : () -> index
  %55 = "hal.interface.workgroup.count"() {dimension = 0 : index} : () -> index
  %56 = "hal.interface.workgroup.id"() {dimension = 1 : index} : () -> index
  %57 = "hal.interface.workgroup.count"() {dimension = 1 : index} : () -> index
  %58 = "affine.apply"(%56) <{map = affine_map<()[s0] -> (s0 * 28)>}> : (index) -> index
  %59 = "affine.apply"(%57) <{map = affine_map<()[s0] -> (s0 * 28)>}> : (index) -> index
  "scf.for"(%58, %47, %59) ({
  ^bb0(%arg0: index):
    %60 = "affine.apply"(%54) <{map = affine_map<()[s0] -> (s0 * 64)>}> : (index) -> index
    %61 = "affine.apply"(%55) <{map = affine_map<()[s0] -> (s0 * 64)>}> : (index) -> index
    "scf.for"(%60, %48, %61) ({
    ^bb0(%arg1: index):
      %62 = "memref.subview"(%53, %arg0, %arg1) <{operandSegmentSizes = array<i32: 1, 2, 0, 0>, static_offsets = array<i64: -9223372036854775808, 0, -9223372036854775808>, static_sizes = array<i64: 28, 56, 64>, static_strides = array<i64: 1, 1, 1>}> : (memref<56x56x512xi8, strided<[28672, 512, 1], offset: 802816>>, index, index) -> memref<28x56x64xi8, strided<[28672, 512, 1], offset: ?>>
      %63 = "memref.subview"(%52, %arg1) <{operandSegmentSizes = array<i32: 1, 1, 0, 0>, static_offsets = array<i64: -9223372036854775808>, static_sizes = array<i64: 64>, static_strides = array<i64: 1>}> : (memref<512xf32, strided<[1], offset: 21554320>>, index) -> memref<64xf32, strided<[1], offset: ?>>
      %64 = "memref.subview"(%50, %arg0) <{operandSegmentSizes = array<i32: 1, 1, 0, 0>, static_offsets = array<i64: -9223372036854775808, 0, 0>, static_sizes = array<i64: 28, 56, 128>, static_strides = array<i64: 1, 1, 1>}> : (memref<56x56x128xi8, strided<[7168, 128, 1], offset: 401408>>, index) -> memref<28x56x128xi8, strided<[7168, 128, 1], offset: ?>>
      %65 = "memref.subview"(%51, %arg0, %arg1) <{operandSegmentSizes = array<i32: 1, 2, 0, 0>, static_offsets = array<i64: -9223372036854775808, 0, -9223372036854775808>, static_sizes = array<i64: 28, 128, 64>, static_strides = array<i64: 1, 1, 1>}> : (memref<56x128x512xi8, strided<[65536, 512, 1], offset: 2408448>>, index, index) -> memref<28x128x64xi8, strided<[65536, 512, 1], offset: ?>>
      "scf.for"(%32, %31, %30) ({
      ^bb0(%arg9: index):
        "scf.for"(%32, %47, %30) ({
        ^bb0(%arg10: index):
          "scf.for"(%32, %29, %30) ({
          ^bb0(%arg11: index):
            "memref.store"(%46, %49, %arg9, %arg10, %arg11) <{nontemporal = false}> : (i32, memref<28x56x64xi32>, index, index, index) -> ()
            "scf.yield"() : () -> ()
          }) : (index, index, index) -> ()
          "scf.yield"() : () -> ()
        }) : (index, index, index) -> ()
        "scf.yield"() : () -> ()
      }) : (index, index, index) -> ()
      "scf.for"(%32, %31, %30) ({
      ^bb0(%arg5: index):
        "scf.for"(%32, %47, %30) ({
        ^bb0(%arg6: index):
          "scf.for"(%32, %29, %30) ({
          ^bb0(%arg7: index):
            "scf.for"(%32, %28, %30) ({
            ^bb0(%arg8: index):
              %131 = "memref.load"(%64, %arg5, %arg6, %arg8) <{nontemporal = false}> : (memref<28x56x128xi8, strided<[7168, 128, 1], offset: ?>>, index, index, index) -> i8
              %132 = "memref.load"(%65, %arg5, %arg8, %arg7) <{nontemporal = false}> : (memref<28x128x64xi8, strided<[65536, 512, 1], offset: ?>>, index, index, index) -> i8
              %133 = "memref.load"(%49, %arg5, %arg6, %arg7) <{nontemporal = false}> : (memref<28x56x64xi32>, index, index, index) -> i32
              %134 = "arith.extsi"(%131) : (i8) -> i32
              %135 = "arith.extsi"(%132) : (i8) -> i32
              %136 = "arith.muli"(%134, %135) <{overflowFlags = #arith.overflow<none>}> : (i32, i32) -> i32
              %137 = "arith.addi"(%133, %136) <{overflowFlags = #arith.overflow<none>}> : (i32, i32) -> i32
              "memref.store"(%137, %49, %arg5, %arg6, %arg7) <{nontemporal = false}> : (i32, memref<28x56x64xi32>, index, index, index) -> ()
              "scf.yield"() : () -> ()
            }) : (index, index, index) -> ()
            "scf.yield"() : () -> ()
          }) : (index, index, index) -> ()
          "scf.yield"() : () -> ()
        }) : (index, index, index) -> ()
        "scf.yield"() : () -> ()
      }) : (index, index, index) -> ()
      "scf.for"(%32, %31, %30) ({
      ^bb0(%arg2: index):
        "scf.for"(%32, %47, %30) ({
        ^bb0(%arg3: index):
          "scf.for"(%32, %29, %30) ({
          ^bb0(%arg4: index):
            %66 = "memref.load"(%63, %arg4) <{nontemporal = false}> : (memref<64xf32, strided<[1], offset: ?>>, index) -> f32
            %67 = "memref.load"(%49, %arg2, %arg3, %arg4) <{nontemporal = false}> : (memref<28x56x64xi32>, index, index, index) -> i32
            %68 = "arith.sitofp"(%67) : (i32) -> f32
            %69 = "arith.mulf"(%68, %45) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
            %70 = "arith.addf"(%66, %69) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
            %71 = "arith.divf"(%70, %44) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
            %72 = "math.round"(%71) <{fastmath = #arith.fastmath<none>}> : (f32) -> f32
            %73 = "arith.addf"(%72, %43) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
            %74 = "arith.cmpf"(%73, %42) <{fastmath = #arith.fastmath<none>, predicate = 11 : i64}> : (f32, f32) -> i1
            %75 = "arith.cmpf"(%73, %41) <{fastmath = #arith.fastmath<none>, predicate = 9 : i64}> : (f32, f32) -> i1
            %76 = "arith.select"(%74, %42, %73) : (i1, f32, f32) -> f32
            %77 = "arith.select"(%75, %41, %76) : (i1, f32, f32) -> f32
            %78 = "arith.fptosi"(%77) : (f32) -> i8
            %79 = "arith.extsi"(%78) : (i8) -> i32
            %80 = "arith.sitofp"(%79) : (i32) -> f32
            %81 = "arith.mulf"(%80, %44) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
            %82 = "arith.divf"(%81, %40) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
            %83 = "arith.cmpf"(%82, %43) <{fastmath = #arith.fastmath<none>, predicate = 4 : i64}> : (f32, f32) -> i1
            %84 = "arith.negf"(%82) <{fastmath = #arith.fastmath<none>}> : (f32) -> f32
            %85 = "arith.select"(%83, %84, %82) : (i1, f32, f32) -> f32
            %86 = "arith.cmpf"(%85, %2) <{fastmath = #arith.fastmath<none>, predicate = 4 : i64}> : (f32, f32) -> i1
            %87 = "arith.select"(%86, %27, %23) : (i1, f32, f32) -> f32
            %88 = "arith.select"(%86, %14, %10) : (i1, f32, f32) -> f32
            %89 = "arith.select"(%86, %26, %22) : (i1, f32, f32) -> f32
            %90 = "arith.select"(%86, %13, %9) : (i1, f32, f32) -> f32
            %91 = "arith.select"(%86, %25, %21) : (i1, f32, f32) -> f32
            %92 = "arith.select"(%86, %12, %8) : (i1, f32, f32) -> f32
            %93 = "arith.select"(%86, %24, %20) : (i1, f32, f32) -> f32
            %94 = "arith.select"(%86, %11, %7) : (i1, f32, f32) -> f32
            %95 = "arith.cmpf"(%85, %1) <{fastmath = #arith.fastmath<none>, predicate = 4 : i64}> : (f32, f32) -> i1
            %96 = "arith.select"(%95, %43, %19) : (i1, f32, f32) -> f32
            %97 = "arith.select"(%95, %87, %18) : (i1, f32, f32) -> f32
            %98 = "arith.select"(%95, %88, %6) : (i1, f32, f32) -> f32
            %99 = "arith.select"(%95, %89, %17) : (i1, f32, f32) -> f32
            %100 = "arith.select"(%95, %90, %5) : (i1, f32, f32) -> f32
            %101 = "arith.select"(%95, %91, %16) : (i1, f32, f32) -> f32
            %102 = "arith.select"(%95, %92, %4) : (i1, f32, f32) -> f32
            %103 = "arith.select"(%95, %93, %15) : (i1, f32, f32) -> f32
            %104 = "arith.select"(%95, %94, %3) : (i1, f32, f32) -> f32
            %105 = "arith.select"(%95, %43, %39) : (i1, f32, f32) -> f32
            %106 = "arith.cmpf"(%85, %0) <{fastmath = #arith.fastmath<none>, predicate = 11 : i64}> : (f32, f32) -> i1
            %107 = "math.fma"(%85, %103, %101) <{fastmath = #arith.fastmath<none>}> : (f32, f32, f32) -> f32
            %108 = "math.fma"(%85, %107, %99) <{fastmath = #arith.fastmath<none>}> : (f32, f32, f32) -> f32
            %109 = "math.fma"(%85, %108, %97) <{fastmath = #arith.fastmath<none>}> : (f32, f32, f32) -> f32
            %110 = "math.fma"(%85, %109, %96) <{fastmath = #arith.fastmath<none>}> : (f32, f32, f32) -> f32
            %111 = "math.fma"(%85, %104, %102) <{fastmath = #arith.fastmath<none>}> : (f32, f32, f32) -> f32
            %112 = "math.fma"(%85, %111, %100) <{fastmath = #arith.fastmath<none>}> : (f32, f32, f32) -> f32
            %113 = "math.fma"(%85, %112, %98) <{fastmath = #arith.fastmath<none>}> : (f32, f32, f32) -> f32
            %114 = "math.fma"(%85, %113, %39) <{fastmath = #arith.fastmath<none>}> : (f32, f32, f32) -> f32
            %115 = "arith.divf"(%110, %114) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
            %116 = "arith.addf"(%105, %115) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
            %117 = "arith.select"(%106, %116, %39) : (i1, f32, f32) -> f32
            %118 = "arith.negf"(%117) <{fastmath = #arith.fastmath<none>}> : (f32) -> f32
            %119 = "arith.select"(%83, %118, %117) : (i1, f32, f32) -> f32
            %120 = "arith.addf"(%119, %39) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
            %121 = "arith.mulf"(%81, %120) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
            %122 = "arith.mulf"(%121, %38) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
            %123 = "arith.divf"(%122, %37) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
            %124 = "math.round"(%123) <{fastmath = #arith.fastmath<none>}> : (f32) -> f32
            %125 = "arith.addf"(%124, %43) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
            %126 = "arith.cmpf"(%125, %42) <{fastmath = #arith.fastmath<none>, predicate = 11 : i64}> : (f32, f32) -> i1
            %127 = "arith.cmpf"(%125, %41) <{fastmath = #arith.fastmath<none>, predicate = 9 : i64}> : (f32, f32) -> i1
            %128 = "arith.select"(%126, %42, %125) : (i1, f32, f32) -> f32
            %129 = "arith.select"(%127, %41, %128) : (i1, f32, f32) -> f32
            %130 = "arith.fptosi"(%129) : (f32) -> i8
            "memref.store"(%130, %62, %arg2, %arg3, %arg4) <{nontemporal = false}> : (i8, memref<28x56x64xi8, strided<[28672, 512, 1], offset: ?>>, index, index, index) -> ()
            "scf.yield"() : () -> ()
          }) : (index, index, index) -> ()
          "scf.yield"() : () -> ()
        }) : (index, index, index) -> ()
        "scf.yield"() : () -> ()
      }) : (index, index, index) -> ()
      "scf.yield"() : () -> ()
    }) : (index, index, index) -> ()
    "scf.yield"() : () -> ()
  }) : (index, index, index) -> ()
  "func.return"() : () -> ()
 }) {translation_info = #iree_codegen.translation_info<CPUDefault>} : () -> ()
    %174 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%173 : tensor<1x56x56x512xf32>) outs(%98 : tensor<1x56x56x512xi8>) {
           ^
 ConvNeXt_vaiq_int8.default.onnx.linalg.mlir:1434:12: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
    %174 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%173 : tensor<1x56x56x512xf32>) outs(%98 : tensor<1x56x56x512xi8>) {
           ^
 ConvNeXt_vaiq_int8.default.onnx.linalg.mlir:24:3: note: called from
  func.func @torch_jit(%arg0: tensor<1x3x224x224xf32>) -> tensor<1x1000xf32> {
  ^
 ConvNeXt_vaiq_int8.default.onnx.linalg.mlir:1434:12: note: see current operation: 
 "hal.executable.variant"() ({
  "hal.executable.export"() ({
  ^bb0(%arg12: !hal.device):
    %138 = "arith.constant"() <{value = 8 : index}> : () -> index
    %139 = "arith.constant"() <{value = 2 : index}> : () -> index
    %140 = "arith.constant"() <{value = 1 : index}> : () -> index
    "hal.return"(%138, %139, %140) : (index, index, index) -> ()
  }) {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>], layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>, ordinal = 0 : index, sym_name = "torch_jit_dispatch_24_quantized_batch_matmul_56x56x512x128_i8xi8xi32xi32xi32"} : () -> ()
  "builtin.module"() ({
    "func.func"() <{function_type = () -> (), sym_name = "torch_jit_dispatch_24_quantized_batch_matmul_56x56x512x128_i8xi8xi32xi32xi32"}> ({
      %0 = "arith.constant"() <{value = 3.750000e+00 : f32}> : () -> f32
      %1 = "arith.constant"() <{value = 2.000000e+00 : f32}> : () -> f32
      %2 = "arith.constant"() <{value = 8.000000e-01 : f32}> : () -> f32
      %3 = "arith.constant"() <{value = 0.0821908935 : f32}> : () -> f32
      %4 = "arith.constant"() <{value = -0.583389878 : f32}> : () -> f32
      %5 = "arith.constant"() <{value = 1.62705934 : f32}> : () -> f32
      %6 = "arith.constant"() <{value = -2.0606916 : f32}> : () -> f32
      %7 = "arith.constant"() <{value = 0.0572442785 : f32}> : () -> f32
      %8 = "arith.constant"() <{value = -0.0883462652 : f32}> : () -> f32
      %9 = "arith.constant"() <{value = 0.448369086 : f32}> : () -> f32
      %10 = "arith.constant"() <{value = -3.276070e-01 : f32}> : () -> f32
      %11 = "arith.constant"() <{value = 0.0739796459 : f32}> : () -> f32
      %12 = "arith.constant"() <{value = -0.131808966 : f32}> : () -> f32
      %13 = "arith.constant"() <{value = 0.519230127 : f32}> : () -> f32
      %14 = "arith.constant"() <{value = -0.463513821 : f32}> : () -> f32
      %15 = "arith.constant"() <{value = -1.71048032E-5 : f32}> : () -> f32
      %16 = "arith.constant"() <{value = 2.53447099E-4 : f32}> : () -> f32
      %17 = "arith.constant"() <{value = -0.00141373626 : f32}> : () -> f32
      %18 = "arith.constant"() <{value = 0.00351961935 : f32}> : () -> f32
      %19 = "arith.constant"() <{value = -0.00330093061 : f32}> : () -> f32
      %20 = "arith.constant"() <{value = 0.0370645523 : f32}> : () -> f32
      %21 = "arith.constant"() <{value = 0.118407398 : f32}> : () -> f32
      %22 = "arith.constant"() <{value = -0.364721417 : f32}> : () -> f32
      %23 = "arith.constant"() <{value = 1.12750685 : f32}> : () -> f32
      %24 = "arith.constant"() <{value = 0.0258146804 : f32}> : () -> f32
      %25 = "arith.constant"() <{value = 0.209741712 : f32}> : () -> f32
      %26 = "arith.constant"() <{value = -0.523018539 : f32}> : () -> f32
      %27 = "arith.constant"() <{value = 1.12837911 : f32}> : () -> f32
      %28 = "arith.constant"() <{value = 128 : index}> : () -> index
      %29 = "arith.constant"() <{value = 64 : index}> : () -> index
      %30 = "arith.constant"() <{value = 1 : index}> : () -> index
      %31 = "arith.constant"() <{value = 28 : index}> : () -> index
      %32 = "arith.constant"() <{value = 0 : index}> : () -> index
      %33 = "arith.constant"() <{value = 802816 : index}> : () -> index
      %34 = "arith.constant"() <{value = 86217280 : index}> : () -> index
      %35 = "arith.constant"() <{value = 2408448 : index}> : () -> index
      %36 = "arith.constant"() <{value = 401408 : index}> : () -> index
      %37 = "arith.constant"() <{value = 3.125000e-02 : f32}> : () -> f32
      %38 = "arith.constant"() <{value = 5.000000e-01 : f32}> : () -> f32
      %39 = "arith.constant"() <{value = 1.000000e+00 : f32}> : () -> f32
      %40 = "arith.constant"() <{value = 1.41421354 : f32}> : () -> f32
      %41 = "arith.constant"() <{value = 1.270000e+02 : f32}> : () -> f32
      %42 = "arith.constant"() <{value = -1.280000e+02 : f32}> : () -> f32
      %43 = "arith.constant"() <{value = 0.000000e+00 : f32}> : () -> f32
      %44 = "arith.constant"() <{value = 6.250000e-02 : f32}> : () -> f32
      %45 = "arith.constant"() <{value = 4.8828125E-4 : f32}> : () -> f32
      %46 = "arith.constant"() <{value = 0 : i32}> : () -> i32
      %47 = "arith.constant"() <{value = 56 : index}> : () -> index
      %48 = "arith.constant"() <{value = 512 : index}> : () -> index
      %49 = "memref.alloca"() <{alignment = 64 : i64, operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<28x56x64xi32>
      %50 = "hal.interface.binding.subspan"(%36) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operandSegmentSizes = array<i32: 1, 0>, set = 0 : index} : (index) -> memref<56x56x128xi8, strided<[7168, 128, 1], offset: 401408>>
      "memref.assume_alignment"(%50) <{alignment = 64 : i32}> : (memref<56x56x128xi8, strided<[7168, 128, 1], offset: 401408>>) -> ()
      %51 = "hal.interface.binding.subspan"(%35) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operandSegmentSizes = array<i32: 1, 0>, set = 0 : index} : (index) -> memref<56x128x512xi8, strided<[65536, 512, 1], offset: 2408448>>
      "memref.assume_alignment"(%51) <{alignment = 64 : i32}> : (memref<56x128x512xi8, strided<[65536, 512, 1], offset: 2408448>>) -> ()
      %52 = "hal.interface.binding.subspan"(%34) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operandSegmentSizes = array<i32: 1, 0>, set = 0 : index} : (index) -> memref<512xf32, strided<[1], offset: 21554320>>
      "memref.assume_alignment"(%52) <{alignment = 64 : i32}> : (memref<512xf32, strided<[1], offset: 21554320>>) -> ()
      %53 = "hal.interface.binding.subspan"(%33) {alignment = 64 : index, binding = 2 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operandSegmentSizes = array<i32: 1, 0>, set = 0 : index} : (index) -> memref<56x56x512xi8, strided<[28672, 512, 1], offset: 802816>>
      "memref.assume_alignment"(%53) <{alignment = 64 : i32}> : (memref<56x56x512xi8, strided<[28672, 512, 1], offset: 802816>>) -> ()
      %54 = "hal.interface.workgroup.id"() {dimension = 0 : index} : () -> index
      %55 = "hal.interface.workgroup.count"() {dimension = 0 : index} : () -> index
      %56 = "hal.interface.workgroup.id"() {dimension = 1 : index} : () -> index
      %57 = "hal.interface.workgroup.count"() {dimension = 1 : index} : () -> index
      %58 = "affine.apply"(%56) <{map = affine_map<()[s0] -> (s0 * 28)>}> : (index) -> index
      %59 = "affine.apply"(%57) <{map = affine_map<()[s0] -> (s0 * 28)>}> : (index) -> index
      "scf.for"(%58, %47, %59) ({
      ^bb0(%arg0: index):
        %60 = "affine.apply"(%54) <{map = affine_map<()[s0] -> (s0 * 64)>}> : (index) -> index
        %61 = "affine.apply"(%55) <{map = affine_map<()[s0] -> (s0 * 64)>}> : (index) -> index
        "scf.for"(%60, %48, %61) ({
        ^bb0(%arg1: index):
          %62 = "memref.subview"(%53, %arg0, %arg1) <{operandSegmentSizes = array<i32: 1, 2, 0, 0>, static_offsets = array<i64: -9223372036854775808, 0, -9223372036854775808>, static_sizes = array<i64: 28, 56, 64>, static_strides = array<i64: 1, 1, 1>}> : (memref<56x56x512xi8, strided<[28672, 512, 1], offset: 802816>>, index, index) -> memref<28x56x64xi8, strided<[28672, 512, 1], offset: ?>>
          %63 = "memref.subview"(%52, %arg1) <{operandSegmentSizes = array<i32: 1, 1, 0, 0>, static_offsets = array<i64: -9223372036854775808>, static_sizes = array<i64: 64>, static_strides = array<i64: 1>}> : (memref<512xf32, strided<[1], offset: 21554320>>, index) -> memref<64xf32, strided<[1], offset: ?>>
          %64 = "memref.subview"(%50, %arg0) <{operandSegmentSizes = array<i32: 1, 1, 0, 0>, static_offsets = array<i64: -9223372036854775808, 0, 0>, static_sizes = array<i64: 28, 56, 128>, static_strides = array<i64: 1, 1, 1>}> : (memref<56x56x128xi8, strided<[7168, 128, 1], offset: 401408>>, index) -> memref<28x56x128xi8, strided<[7168, 128, 1], offset: ?>>
          %65 = "memref.subview"(%51, %arg0, %arg1) <{operandSegmentSizes = array<i32: 1, 2, 0, 0>, static_offsets = array<i64: -9223372036854775808, 0, -9223372036854775808>, static_sizes = array<i64: 28, 128, 64>, static_strides = array<i64: 1, 1, 1>}> : (memref<56x128x512xi8, strided<[65536, 512, 1], offset: 2408448>>, index, index) -> memref<28x128x64xi8, strided<[65536, 512, 1], offset: ?>>
          "scf.for"(%32, %31, %30) ({
          ^bb0(%arg9: index):
            "scf.for"(%32, %47, %30) ({
            ^bb0(%arg10: index):
              "scf.for"(%32, %29, %30) ({
              ^bb0(%arg11: index):
                "memref.store"(%46, %49, %arg9, %arg10, %arg11) <{nontemporal = false}> : (i32, memref<28x56x64xi32>, index, index, index) -> ()
                "scf.yield"() : () -> ()
              }) : (index, index, index) -> ()
              "scf.yield"() : () -> ()
            }) : (index, index, index) -> ()
            "scf.yield"() : () -> ()
          }) : (index, index, index) -> ()
          "scf.for"(%32, %31, %30) ({
          ^bb0(%arg5: index):
            "scf.for"(%32, %47, %30) ({
            ^bb0(%arg6: index):
              "scf.for"(%32, %29, %30) ({
              ^bb0(%arg7: index):
                "scf.for"(%32, %28, %30) ({
                ^bb0(%arg8: index):
                  %131 = "memref.load"(%64, %arg5, %arg6, %arg8) <{nontemporal = false}> : (memref<28x56x128xi8, strided<[7168, 128, 1], offset: ?>>, index, index, index) -> i8
                  %132 = "memref.load"(%65, %arg5, %arg8, %arg7) <{nontemporal = false}> : (memref<28x128x64xi8, strided<[65536, 512, 1], offset: ?>>, index, index, index) -> i8
                  %133 = "memref.load"(%49, %arg5, %arg6, %arg7) <{nontemporal = false}> : (memref<28x56x64xi32>, index, index, index) -> i32
                  %134 = "arith.extsi"(%131) : (i8) -> i32
                  %135 = "arith.extsi"(%132) : (i8) -> i32
                  %136 = "arith.muli"(%134, %135) <{overflowFlags = #arith.overflow<none>}> : (i32, i32) -> i32
                  %137 = "arith.addi"(%133, %136) <{overflowFlags = #arith.overflow<none>}> : (i32, i32) -> i32
                  "memref.store"(%137, %49, %arg5, %arg6, %arg7) <{nontemporal = false}> : (i32, memref<28x56x64xi32>, index, index, index) -> ()
                  "scf.yield"() : () -> ()
                }) : (index, index, index) -> ()
                "scf.yield"() : () -> ()
              }) : (index, index, index) -> ()
              "scf.yield"() : () -> ()
            }) : (index, index, index) -> ()
            "scf.yield"() : () -> ()
          }) : (index, index, index) -> ()
          "scf.for"(%32, %31, %30) ({
          ^bb0(%arg2: index):
            "scf.for"(%32, %47, %30) ({
            ^bb0(%arg3: index):
              "scf.for"(%32, %29, %30) ({
              ^bb0(%arg4: index):
                %66 = "memref.load"(%63, %arg4) <{nontemporal = false}> : (memref<64xf32, strided<[1], offset: ?>>, index) -> f32
                %67 = "memref.load"(%49, %arg2, %arg3, %arg4) <{nontemporal = false}> : (memref<28x56x64xi32>, index, index, index) -> i32
                %68 = "arith.sitofp"(%67) : (i32) -> f32
                %69 = "arith.mulf"(%68, %45) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
                %70 = "arith.addf"(%66, %69) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
                %71 = "arith.divf"(%70, %44) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
                %72 = "math.round"(%71) <{fastmath = #arith.fastmath<none>}> : (f32) -> f32
                %73 = "arith.addf"(%72, %43) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
                %74 = "arith.cmpf"(%73, %42) <{fastmath = #arith.fastmath<none>, predicate = 11 : i64}> : (f32, f32) -> i1
                %75 = "arith.cmpf"(%73, %41) <{fastmath = #arith.fastmath<none>, predicate = 9 : i64}> : (f32, f32) -> i1
                %76 = "arith.select"(%74, %42, %73) : (i1, f32, f32) -> f32
                %77 = "arith.select"(%75, %41, %76) : (i1, f32, f32) -> f32
                %78 = "arith.fptosi"(%77) : (f32) -> i8
                %79 = "arith.extsi"(%78) : (i8) -> i32
                %80 = "arith.sitofp"(%79) : (i32) -> f32
                %81 = "arith.mulf"(%80, %44) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
                %82 = "arith.divf"(%81, %40) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
                %83 = "arith.cmpf"(%82, %43) <{fastmath = #arith.fastmath<none>, predicate = 4 : i64}> : (f32, f32) -> i1
                %84 = "arith.negf"(%82) <{fastmath = #arith.fastmath<none>}> : (f32) -> f32
                %85 = "arith.select"(%83, %84, %82) : (i1, f32, f32) -> f32
                %86 = "arith.cmpf"(%85, %2) <{fastmath = #arith.fastmath<none>, predicate = 4 : i64}> : (f32, f32) -> i1
                %87 = "arith.select"(%86, %27, %23) : (i1, f32, f32) -> f32
                %88 = "arith.select"(%86, %14, %10) : (i1, f32, f32) -> f32
                %89 = "arith.select"(%86, %26, %22) : (i1, f32, f32) -> f32
                %90 = "arith.select"(%86, %13, %9) : (i1, f32, f32) -> f32
                %91 = "arith.select"(%86, %25, %21) : (i1, f32, f32) -> f32
                %92 = "arith.select"(%86, %12, %8) : (i1, f32, f32) -> f32
                %93 = "arith.select"(%86, %24, %20) : (i1, f32, f32) -> f32
                %94 = "arith.select"(%86, %11, %7) : (i1, f32, f32) -> f32
                %95 = "arith.cmpf"(%85, %1) <{fastmath = #arith.fastmath<none>, predicate = 4 : i64}> : (f32, f32) -> i1
                %96 = "arith.select"(%95, %43, %19) : (i1, f32, f32) -> f32
                %97 = "arith.select"(%95, %87, %18) : (i1, f32, f32) -> f32
                %98 = "arith.select"(%95, %88, %6) : (i1, f32, f32) -> f32
                %99 = "arith.select"(%95, %89, %17) : (i1, f32, f32) -> f32
                %100 = "arith.select"(%95, %90, %5) : (i1, f32, f32) -> f32
                %101 = "arith.select"(%95, %91, %16) : (i1, f32, f32) -> f32
                %102 = "arith.select"(%95, %92, %4) : (i1, f32, f32) -> f32
                %103 = "arith.select"(%95, %93, %15) : (i1, f32, f32) -> f32
                %104 = "arith.select"(%95, %94, %3) : (i1, f32, f32) -> f32
                %105 = "arith.select"(%95, %43, %39) : (i1, f32, f32) -> f32
                %106 = "arith.cmpf"(%85, %0) <{fastmath = #arith.fastmath<none>, predicate = 11 : i64}> : (f32, f32) -> i1
                %107 = "math.fma"(%85, %103, %101) <{fastmath = #arith.fastmath<none>}> : (f32, f32, f32) -> f32
                %108 = "math.fma"(%85, %107, %99) <{fastmath = #arith.fastmath<none>}> : (f32, f32, f32) -> f32
                %109 = "math.fma"(%85, %108, %97) <{fastmath = #arith.fastmath<none>}> : (f32, f32, f32) -> f32
                %110 = "math.fma"(%85, %109, %96) <{fastmath = #arith.fastmath<none>}> : (f32, f32, f32) -> f32
                %111 = "math.fma"(%85, %104, %102) <{fastmath = #arith.fastmath<none>}> : (f32, f32, f32) -> f32
                %112 = "math.fma"(%85, %111, %100) <{fastmath = #arith.fastmath<none>}> : (f32, f32, f32) -> f32
                %113 = "math.fma"(%85, %112, %98) <{fastmath = #arith.fastmath<none>}> : (f32, f32, f32) -> f32
                %114 = "math.fma"(%85, %113, %39) <{fastmath = #arith.fastmath<none>}> : (f32, f32, f32) -> f32
                %115 = "arith.divf"(%110, %114) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
                %116 = "arith.addf"(%105, %115) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
                %117 = "arith.select"(%106, %116, %39) : (i1, f32, f32) -> f32
                %118 = "arith.negf"(%117) <{fastmath = #arith.fastmath<none>}> : (f32) -> f32
                %119 = "arith.select"(%83, %118, %117) : (i1, f32, f32) -> f32
                %120 = "arith.addf"(%119, %39) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
                %121 = "arith.mulf"(%81, %120) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
                %122 = "arith.mulf"(%121, %38) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
                %123 = "arith.divf"(%122, %37) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
                %124 = "math.round"(%123) <{fastmath = #arith.fastmath<none>}> : (f32) -> f32
                %125 = "arith.addf"(%124, %43) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
                %126 = "arith.cmpf"(%125, %42) <{fastmath = #arith.fastmath<none>, predicate = 11 : i64}> : (f32, f32) -> i1
                %127 = "arith.cmpf"(%125, %41) <{fastmath = #arith.fastmath<none>, predicate = 9 : i64}> : (f32, f32) -> i1
                %128 = "arith.select"(%126, %42, %125) : (i1, f32, f32) -> f32
                %129 = "arith.select"(%127, %41, %128) : (i1, f32, f32) -> f32
                %130 = "arith.fptosi"(%129) : (f32) -> i8
                "memref.store"(%130, %62, %arg2, %arg3, %arg4) <{nontemporal = false}> : (i8, memref<28x56x64xi8, strided<[28672, 512, 1], offset: ?>>, index, index, index) -> ()
                "scf.yield"() : () -> ()
              }) : (index, index, index) -> ()
              "scf.yield"() : () -> ()
            }) : (index, index, index) -> ()
            "scf.yield"() : () -> ()
          }) : (index, index, index) -> ()
          "scf.yield"() : () -> ()
        }) : (index, index, index) -> ()
        "scf.yield"() : () -> ()
      }) : (index, index, index) -> ()
      "func.return"() : () -> ()
    }) {translation_info = #iree_codegen.translation_info<CPUDefault>} : () -> ()
  }) : () -> ()
  "hal.executable.variant_end"() : () -> ()
 }) {sym_name = "embedded_elf_x86_64", target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>} : () -> ()
    %174 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%173 : tensor<1x56x56x512xf32>) outs(%98 : tensor<1x56x56x512xi8>) {
           ^
No results found