pashu123 · November 22, 2024 13:50
diff --git a/err.txt b/err.txt
 failed to translate executables
 prefill_8b_tp8.mlir:9903:13: error: One or more operations with large vector sizes (8192 bytes) were found:

    %3425 = torch.aten.transpose.int %3417#0, %int1_1244, %int2_1245 : !torch.vtensor<[4,4,?,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[4,?,4,128],f16>
            ^
 prefill_8b_tp8.mlir:9857:15: note:   %67 = vector.transfer_read %extracted_slice_8[%c0, %c0, %c0, %c0, %c0, %c0], %cst_3, %66 {in_bounds = [true, true, true, true, true, true]} : tensor<4x1x?x1x1x128xf16>, vector<4x1x8x1x1x128xf16>

    %3417:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%3393, %3401, %3409, %float0.000000e00, %true_1213, %none_1214, %none_1215) : (!torch.vtensor<[4,4,?,128],f16>, !torch.vtensor<[4,4,?,128],f16>, !torch.vtensor<[4,4,?,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[4,4,?,128],f16>, !torch.vtensor<[4,4,?],f32>) 
              ^
 prefill_8b_tp8.mlir:9857:15: note:   %69 = arith.extf %67 : vector<4x1x8x1x1x128xf16> to vector<4x1x8x1x1x128xf32>

 prefill_8b_tp8.mlir:9857:15: note:   %108 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d4, d5, d6, d3)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d4, d5, d6)>], iterator_types = ["parallel", "parallel", "parallel", "reduction", "parallel", "parallel", "parallel"], kind = #vector.kind<add>} %52, %69, %68 : vector<4x1x1x128xf32>, vector<4x1x8x1x1x128xf32> into vector<4x1x1x8x1x1xf32>

 prefill_8b_tp8.mlir:9903:13: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
    %3425 = torch.aten.transpose.int %3417#0, %int1_1244, %int2_1245 : !torch.vtensor<[4,4,?,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[4,?,4,128],f16>
            ^
 prefill_8b_tp8.mlir:9903:13: note: see current operation: 
 "hal.executable.variant"() ({
  "hal.executable.export"() ({
  ^bb0(%arg16: !hal.device, %arg17: index):
    %136:3 = "flow.dispatch.workgroup_count_from_slice"(%arg17) : (index) -> (index, index, index)
    "hal.return"(%136#0, %136#1, %136#2) : (index, index, index) -> ()
  }) {layout = #hal.pipeline.layout<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index, sym_name = "prefill_bs4$async_dispatch_122_transpose_4x4xDx128_f16"} : () -> ()
  "builtin.module"() ({
    "func.func"() <{function_type = () -> (), sym_name = "prefill_bs4$async_dispatch_122_transpose_4x4xDx128_f16"}> ({
      %0 = "arith.constant"() <{value = dense<1.000000e+00> : vector<8x4x1x1xf32>}> : () -> vector<8x4x1x1xf32>
      %1 = "arith.constant"() <{value = dense<1.274410e-01> : vector<4x1x1x128xf16>}> : () -> vector<4x1x1x128xf16>
      %2 = "arith.constant"() <{value = dense<1.44269502> : vector<4x1x1x8x1x1xf32>}> : () -> vector<4x1x1x8x1x1xf32>
      %3 = "arith.constant"() <{value = dense<0.000000e+00> : vector<4x1x1x8x1x1xf32>}> : () -> vector<4x1x1x8x1x1xf32>
      %4 = "arith.constant"() <{value = 0.000000e+00 : f16}> : () -> f16
      %5 = "arith.constant"() <{value = 128 : index}> : () -> index
      %6 = "arith.constant"() <{value = dense<0.000000e+00> : vector<4x1x1xf32>}> : () -> vector<4x1x1xf32>
      %7 = "arith.constant"() <{value = dense<-3.40282347E+38> : vector<4x1x1xf32>}> : () -> vector<4x1x1xf32>
      %8 = "arith.constant"() <{value = dense<0.000000e+00> : vector<4x1x1x8xf32>}> : () -> vector<4x1x1x8xf32>
      %9 = "arith.constant"() <{value = 4 : index}> : () -> index
      %10 = "arith.constant"() <{value = 0.000000e+00 : f32}> : () -> f32
      %11 = "arith.constant"() <{value = 8 : index}> : () -> index
      %12 = "arith.constant"() <{value = 32 : index}> : () -> index
      %13 = "arith.constant"() <{value = 1 : index}> : () -> index
      %14 = "arith.constant"() <{value = 32 : i64}> : () -> i64
      %15 = "arith.constant"() <{value = 0 : index}> : () -> index
      %16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
      %17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
      %18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
      %19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
      %20 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
      %21 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
      %22 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
      %23 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
      %24 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
      %25 = "arith.extui"(%16) : (i32) -> i64
      %26 = "arith.extui"(%17) : (i32) -> i64
      %27 = "arith.shli"(%26, %14) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
      %29 = "arith.index_castui"(%28) : (i64) -> index
      %30 = "arith.extui"(%18) : (i32) -> i64
      %31 = "arith.extui"(%19) : (i32) -> i64
      %32 = "arith.shli"(%31, %14) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
      %34 = "arith.index_castui"(%33) : (i64) -> index
      %35 = "arith.extui"(%20) : (i32) -> i64
      %36 = "arith.extui"(%21) : (i32) -> i64
      %37 = "arith.shli"(%36, %14) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %38 = "arith.ori"(%35, %37) : (i64, i64) -> i64
      %39 = "arith.index_castui"(%38) : (i64) -> index
      %40 = "arith.extui"(%22) : (i32) -> i64
      %41 = "arith.extui"(%23) : (i32) -> i64
      %42 = "arith.shli"(%41, %14) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
      %43 = "arith.ori"(%40, %42) : (i64, i64) -> i64
      %44 = "arith.index_castui"(%43) : (i64) -> index
      %45 = "arith.index_castui"(%24) : (i32) -> index
      %46 = "util.assume.int"(%45) <{assumptions = [[#util.int.assumption<umin = 16, umax = 131056, udiv = 16>]]}> : (index) -> index
      %47 = "flow.dispatch.workload.ordinal"(%46) <{ordinal = 0 : index}> : (index) -> index
      %48 = "hal.interface.binding.subspan"(%29, %47) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> !flow.dispatch.tensor<readonly:tensor<4x4x?x128xf16>>
      %49 = "hal.interface.binding.subspan"(%34, %47) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> !flow.dispatch.tensor<readonly:tensor<4x4x?x1x1x128xf16>>
      %50 = "hal.interface.binding.subspan"(%39, %47) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> !flow.dispatch.tensor<readonly:tensor<4x4x?x1x1x128xf16>>
      %51 = "hal.interface.binding.subspan"(%15, %47, %47) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 2>} : (index, index, index) -> !flow.dispatch.tensor<readonly:tensor<4x4x?x?x1x1xf16>>
      %52 = "hal.interface.binding.subspan"(%44, %47) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> !flow.dispatch.tensor<writeonly:tensor<4x?x4x128xf16>>
      %53 = "flow.dispatch.tensor.load"(%48, %47, %47) <{operandSegmentSizes = array<i32: 1, 1, 0, 1, 0>, static_offsets = array<i64: 0, 0, 0, 0>, static_sizes = array<i64: 4, 4, -9223372036854775808, 128>, static_strides = array<i64: 1, 1, 1, 1>}> : (!flow.dispatch.tensor<readonly:tensor<4x4x?x128xf16>>, index, index) -> tensor<4x4x?x128xf16>
      %54 = "flow.dispatch.tensor.load"(%49, %47, %47) <{operandSegmentSizes = array<i32: 1, 1, 0, 1, 0>, static_offsets = array<i64: 0, 0, 0, 0, 0, 0>, static_sizes = array<i64: 4, 4, -9223372036854775808, 1, 1, 128>, static_strides = array<i64: 1, 1, 1, 1, 1, 1>}> : (!flow.dispatch.tensor<readonly:tensor<4x4x?x1x1x128xf16>>, index, index) -> tensor<4x4x?x1x1x128xf16>
      %55 = "flow.dispatch.tensor.load"(%50, %47, %47) <{operandSegmentSizes = array<i32: 1, 1, 0, 1, 0>, static_offsets = array<i64: 0, 0, 0, 0, 0, 0>, static_sizes = array<i64: 4, 4, -9223372036854775808, 1, 1, 128>, static_strides = array<i64: 1, 1, 1, 1, 1, 1>}> : (!flow.dispatch.tensor<readonly:tensor<4x4x?x1x1x128xf16>>, index, index) -> tensor<4x4x?x1x1x128xf16>
      %56 = "flow.dispatch.tensor.load"(%51, %47, %47, %47, %47) <{operandSegmentSizes = array<i32: 1, 2, 0, 2, 0>, static_offsets = array<i64: 0, 0, 0, 0, 0, 0>, static_sizes = array<i64: 4, 4, -9223372036854775808, -9223372036854775808, 1, 1>, static_strides = array<i64: 1, 1, 1, 1, 1, 1>}> : (!flow.dispatch.tensor<readonly:tensor<4x4x?x?x1x1xf16>>, index, index, index, index) -> tensor<4x4x?x?x1x1xf16>
      %57 = "tensor.empty"(%47) : (index) -> tensor<4x?x4x128xf16>
      %58 = "scf.forall"(%47, %57) <{mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 1>, staticLowerBound = array<i64: 0, 0, 0>, staticStep = array<i64: 1, 64, 32>, staticUpperBound = array<i64: 4, -9223372036854775808, 128>}> ({
      ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: tensor<4x?x4x128xf16>):
        %59 = "affine.min"(%arg1, %47) <{map = affine_map<(d0)[s0] -> (-d0 + s0, 64)>}> : (index, index) -> index
        %60 = "tensor.empty"(%59) : (index) -> tensor<4x1x?x32xf16>
        %61 = "tensor.empty"() : () -> tensor<4x1x1xf32>
        %62 = "vector.transfer_write"(%7, %61, %15, %15, %15) <{in_bounds = [true, true, true], operandSegmentSizes = array<i32: 1, 1, 3, 0>, permutation_map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>}> : (vector<4x1x1xf32>, tensor<4x1x1xf32>, index, index, index) -> tensor<4x1x1xf32>
        %63 = "scf.for"(%15, %59, %13, %60) ({
        ^bb0(%arg8: index, %arg9: tensor<4x1x?x32xf16>):
          %70 = "affine.apply"(%arg8, %arg1) <{map = affine_map<()[s0, s1] -> (s0 + s1)>}> : (index, index) -> index
          %71 = "vector.transfer_read"(%53, %15, %arg0, %70, %15, %4) <{in_bounds = [true, true, true, true], operandSegmentSizes = array<i32: 1, 4, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>}> : (tensor<4x4x?x128xf16>, index, index, index, index, f16) -> vector<4x1x1x128xf16>
          %72 = "arith.mulf"(%71, %1) <{fastmath = #arith.fastmath<none>}> : (vector<4x1x1x128xf16>, vector<4x1x1x128xf16>) -> vector<4x1x1x128xf16>
          %73 = "arith.extf"(%72) : (vector<4x1x1x128xf16>) -> vector<4x1x1x128xf32>
          %74 = "scf.for"(%15, %12, %11, %arg9) ({
          ^bb0(%arg10: index, %arg11: tensor<4x1x?x32xf16>):
            %75 = "affine.apply"(%arg10, %arg2) <{map = affine_map<()[s0, s1] -> (s0 + s1)>}> : (index, index) -> index
            %76:3 = "scf.for"(%15, %47, %11, %62, %8, %6) ({
            ^bb0(%arg12: index, %arg13: tensor<4x1x1xf32>, %arg14: vector<4x1x1x8xf32>, %arg15: vector<4x1x1xf32>):
              %83 = "affine.min"(%arg12, %47) <{map = affine_map<(d0)[s0] -> (-d0 + s0, 8)>}> : (index, index) -> index
              %84 = "tensor.extract_slice"(%54, %arg0, %arg12, %83) <{operandSegmentSizes = array<i32: 1, 2, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, 0, 0, 0>, static_sizes = array<i64: 4, 1, -9223372036854775808, 1, 1, 128>, static_strides = array<i64: 1, 1, 1, 1, 1, 1>}> : (tensor<4x4x?x1x1x128xf16>, index, index, index) -> tensor<4x1x?x1x1x128xf16>
              %85 = "tensor.extract_slice"(%55, %arg0, %arg12, %75, %83) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, 0, 0, -9223372036854775808>, static_sizes = array<i64: 4, 1, -9223372036854775808, 1, 1, 8>, static_strides = array<i64: 1, 1, 1, 1, 1, 1>}> : (tensor<4x4x?x1x1x128xf16>, index, index, index, index) -> tensor<4x1x?x1x1x8xf16>
              %86 = "tensor.extract_slice"(%56, %arg0, %70, %arg12, %83) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808, 0, 0>, static_sizes = array<i64: 4, 1, 1, -9223372036854775808, 1, 1>, static_strides = array<i64: 1, 1, 1, 1, 1, 1>}> : (tensor<4x4x?x?x1x1xf16>, index, index, index, index) -> tensor<4x1x1x?x1x1xf16>
              %87 = "tensor.empty"(%83) : (index) -> tensor<4x1x1x?x1x1xf32>
              %88 = "vector.create_mask"(%9, %13, %13, %83, %13, %13) : (index, index, index, index, index, index) -> vector<4x1x1x8x1x1xi1>
              %89 = "vector.transfer_write"(%3, %87, %15, %15, %15, %15, %15, %15, %88) <{in_bounds = [true, true, true, true, true, true], operandSegmentSizes = array<i32: 1, 1, 6, 1>, permutation_map = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>}> : (vector<4x1x1x8x1x1xf32>, tensor<4x1x1x?x1x1xf32>, index, index, index, index, index, index, vector<4x1x1x8x1x1xi1>) -> tensor<4x1x1x?x1x1xf32>
              %90 = "vector.create_mask"(%9, %13, %83, %13, %13, %5) : (index, index, index, index, index, index) -> vector<4x1x8x1x1x128xi1>
              %91 = "vector.transfer_read"(%84, %15, %15, %15, %15, %15, %15, %4, %90) <{in_bounds = [true, true, true, true, true, true], operandSegmentSizes = array<i32: 1, 6, 1, 1>, permutation_map = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>}> : (tensor<4x1x?x1x1x128xf16>, index, index, index, index, index, index, f16, vector<4x1x8x1x1x128xi1>) -> vector<4x1x8x1x1x128xf16>
              %92 = "vector.transfer_read"(%89, %15, %15, %15, %15, %15, %15, %10, %88) <{in_bounds = [true, true, true, true, true, true], operandSegmentSizes = array<i32: 1, 6, 1, 1>, permutation_map = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>}> : (tensor<4x1x1x?x1x1xf32>, index, index, index, index, index, index, f32, vector<4x1x1x8x1x1xi1>) -> vector<4x1x1x8x1x1xf32>
              %93 = "arith.extf"(%91) : (vector<4x1x8x1x1x128xf16>) -> vector<4x1x8x1x1x128xf32>
              %94 = "vector.create_mask"(%9, %13, %13, %5, %83, %13, %13) : (index, index, index, index, index, index, index) -> vector<4x1x1x128x8x1x1xi1>
              %95 = "vector.mask"(%94) ({
                %135 = "vector.contract"(%73, %93, %92) <{indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d4, d5, d6, d3)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d4, d5, d6)>], iterator_types = [#vector.iterator_type<parallel>, #vector.iterator_type<parallel>, #vector.iterator_type<parallel>, #vector.iterator_type<reduction>, #vector.iterator_type<parallel>, #vector.iterator_type<parallel>, #vector.iterator_type<parallel>], kind = #vector.kind<add>}> : (vector<4x1x1x128xf32>, vector<4x1x8x1x1x128xf32>, vector<4x1x1x8x1x1xf32>) -> vector<4x1x1x8x1x1xf32>
                "vector.yield"(%135) : (vector<4x1x1x8x1x1xf32>) -> ()
              }) : (vector<4x1x1x128x8x1x1xi1>) -> vector<4x1x1x8x1x1xf32>
              %96 = "vector.transfer_write"(%95, %89, %15, %15, %15, %15, %15, %15, %88) <{in_bounds = [true, true, true, true, true, true], operandSegmentSizes = array<i32: 1, 1, 6, 1>, permutation_map = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>}> : (vector<4x1x1x8x1x1xf32>, tensor<4x1x1x?x1x1xf32>, index, index, index, index, index, index, vector<4x1x1x8x1x1xi1>) -> tensor<4x1x1x?x1x1xf32>
              %97 = "vector.transfer_read"(%96, %15, %15, %15, %15, %15, %15, %10, %88) <{in_bounds = [true, true, true, true, true, true], operandSegmentSizes = array<i32: 1, 6, 1, 1>, permutation_map = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>}> : (tensor<4x1x1x?x1x1xf32>, index, index, index, index, index, index, f32, vector<4x1x1x8x1x1xi1>) -> vector<4x1x1x8x1x1xf32>
              %98 = "vector.transfer_write"(%97, %96, %15, %15, %15, %15, %15, %15, %88) <{in_bounds = [true, true, true, true, true, true], operandSegmentSizes = array<i32: 1, 1, 6, 1>, permutation_map = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>}> : (vector<4x1x1x8x1x1xf32>, tensor<4x1x1x?x1x1xf32>, index, index, index, index, index, index, vector<4x1x1x8x1x1xi1>) -> tensor<4x1x1x?x1x1xf32>
              %99 = "vector.transfer_read"(%86, %15, %15, %15, %15, %15, %15, %4, %88) <{in_bounds = [true, true, true, true, true, true], operandSegmentSizes = array<i32: 1, 6, 1, 1>, permutation_map = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>}> : (tensor<4x1x1x?x1x1xf16>, index, index, index, index, index, index, f16, vector<4x1x1x8x1x1xi1>) -> vector<4x1x1x8x1x1xf16>
              %100 = "vector.transfer_read"(%98, %15, %15, %15, %15, %15, %15, %10, %88) <{in_bounds = [true, true, true, true, true, true], operandSegmentSizes = array<i32: 1, 6, 1, 1>, permutation_map = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>}> : (tensor<4x1x1x?x1x1xf32>, index, index, index, index, index, index, f32, vector<4x1x1x8x1x1xi1>) -> vector<4x1x1x8x1x1xf32>
              %101 = "arith.extf"(%99) : (vector<4x1x1x8x1x1xf16>) -> vector<4x1x1x8x1x1xf32>
              %102 = "arith.mulf"(%101, %2) <{fastmath = #arith.fastmath<none>}> : (vector<4x1x1x8x1x1xf32>, vector<4x1x1x8x1x1xf32>) -> vector<4x1x1x8x1x1xf32>
              %103 = "arith.addf"(%100, %102) <{fastmath = #arith.fastmath<none>}> : (vector<4x1x1x8x1x1xf32>, vector<4x1x1x8x1x1xf32>) -> vector<4x1x1x8x1x1xf32>
              %104 = "vector.transfer_write"(%103, %98, %15, %15, %15, %15, %15, %15, %88) <{in_bounds = [true, true, true, true, true, true], operandSegmentSizes = array<i32: 1, 1, 6, 1>, permutation_map = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>}> : (vector<4x1x1x8x1x1xf32>, tensor<4x1x1x?x1x1xf32>, index, index, index, index, index, index, vector<4x1x1x8x1x1xi1>) -> tensor<4x1x1x?x1x1xf32>
              %105 = "vector.transfer_read"(%104, %15, %15, %15, %15, %15, %15, %10, %88) <{in_bounds = [true, true, true, true, true, true], operandSegmentSizes = array<i32: 1, 6, 1, 1>, permutation_map = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>}> : (tensor<4x1x1x?x1x1xf32>, index, index, index, index, index, index, f32, vector<4x1x1x8x1x1xi1>) -> vector<4x1x1x8x1x1xf32>
              %106 = "vector.transfer_read"(%arg13, %15, %15, %15, %10) <{in_bounds = [true, true, true], operandSegmentSizes = array<i32: 1, 3, 1, 0>, permutation_map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>}> : (tensor<4x1x1xf32>, index, index, index, f32) -> vector<4x1x1xf32>
              %107 = "vector.mask"(%88) ({
                %134 = "vector.multi_reduction"(%105, %106) <{kind = #vector.kind<maximumf>, reduction_dims = array<i64: 3, 4, 5>}> : (vector<4x1x1x8x1x1xf32>, vector<4x1x1xf32>) -> vector<4x1x1xf32>
                "vector.yield"(%134) : (vector<4x1x1xf32>) -> ()
              }) : (vector<4x1x1x8x1x1xi1>) -> vector<4x1x1xf32>
              %108 = "vector.transfer_write"(%107, %arg13, %15, %15, %15) <{in_bounds = [true, true, true], operandSegmentSizes = array<i32: 1, 1, 3, 0>, permutation_map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>}> : (vector<4x1x1xf32>, tensor<4x1x1xf32>, index, index, index) -> tensor<4x1x1xf32>
              %109 = "arith.subf"(%106, %107) <{fastmath = #arith.fastmath<none>}> : (vector<4x1x1xf32>, vector<4x1x1xf32>) -> vector<4x1x1xf32>
              %110 = "math.exp2"(%109) <{fastmath = #arith.fastmath<none>}> : (vector<4x1x1xf32>) -> vector<4x1x1xf32>
              %111 = "arith.mulf"(%110, %arg15) <{fastmath = #arith.fastmath<none>}> : (vector<4x1x1xf32>, vector<4x1x1xf32>) -> vector<4x1x1xf32>
              %112 = "vector.broadcast"(%107) : (vector<4x1x1xf32>) -> vector<8x1x1x4x1x1xf32>
              %113 = "vector.transpose"(%112) <{permutation = array<i64: 3, 4, 5, 0, 1, 2>}> : (vector<8x1x1x4x1x1xf32>) -> vector<4x1x1x8x1x1xf32>
              %114 = "arith.subf"(%105, %113) <{fastmath = #arith.fastmath<none>}> : (vector<4x1x1x8x1x1xf32>, vector<4x1x1x8x1x1xf32>) -> vector<4x1x1x8x1x1xf32>
              %115 = "math.exp2"(%114) <{fastmath = #arith.fastmath<none>}> : (vector<4x1x1x8x1x1xf32>) -> vector<4x1x1x8x1x1xf32>
              %116 = "vector.transfer_write"(%115, %104, %15, %15, %15, %15, %15, %15, %88) <{in_bounds = [true, true, true, true, true, true], operandSegmentSizes = array<i32: 1, 1, 6, 1>, permutation_map = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>}> : (vector<4x1x1x8x1x1xf32>, tensor<4x1x1x?x1x1xf32>, index, index, index, index, index, index, vector<4x1x1x8x1x1xi1>) -> tensor<4x1x1x?x1x1xf32>
              %117 = "vector.transfer_read"(%116, %15, %15, %15, %15, %15, %15, %10, %88) <{in_bounds = [true, true, true, true, true, true], operandSegmentSizes = array<i32: 1, 6, 1, 1>, permutation_map = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>}> : (tensor<4x1x1x?x1x1xf32>, index, index, index, index, index, index, f32, vector<4x1x1x8x1x1xi1>) -> vector<4x1x1x8x1x1xf32>
              %118 = "vector.mask"(%88) ({
                %133 = "vector.multi_reduction"(%117, %111) <{kind = #vector.kind<add>, reduction_dims = array<i64: 3, 4, 5>}> : (vector<4x1x1x8x1x1xf32>, vector<4x1x1xf32>) -> vector<4x1x1xf32>
                "vector.yield"(%133) : (vector<4x1x1xf32>) -> ()
              }) : (vector<4x1x1x8x1x1xi1>) -> vector<4x1x1xf32>
              %119 = "tensor.empty"(%83) : (index) -> tensor<4x1x1x?x1x1xf16>
              %120 = "arith.truncf"(%117) : (vector<4x1x1x8x1x1xf32>) -> vector<4x1x1x8x1x1xf16>
              %121 = "vector.transfer_write"(%120, %119, %15, %15, %15, %15, %15, %15, %88) <{in_bounds = [true, true, true, true, true, true], operandSegmentSizes = array<i32: 1, 1, 6, 1>, permutation_map = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>}> : (vector<4x1x1x8x1x1xf16>, tensor<4x1x1x?x1x1xf16>, index, index, index, index, index, index, vector<4x1x1x8x1x1xi1>) -> tensor<4x1x1x?x1x1xf16>
              %122 = "vector.broadcast"(%110) : (vector<4x1x1xf32>) -> vector<8x4x1x1xf32>
              %123 = "vector.transpose"(%122) <{permutation = array<i64: 1, 2, 3, 0>}> : (vector<8x4x1x1xf32>) -> vector<4x1x1x8xf32>
              %124 = "arith.mulf"(%123, %arg14) <{fastmath = #arith.fastmath<none>}> : (vector<4x1x1x8xf32>, vector<4x1x1x8xf32>) -> vector<4x1x1x8xf32>
              %125 = "vector.transfer_read"(%121, %15, %15, %15, %15, %15, %15, %4, %88) <{in_bounds = [true, true, true, true, true, true], operandSegmentSizes = array<i32: 1, 6, 1, 1>, permutation_map = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>}> : (tensor<4x1x1x?x1x1xf16>, index, index, index, index, index, index, f16, vector<4x1x1x8x1x1xi1>) -> vector<4x1x1x8x1x1xf16>
              %126 = "vector.create_mask"(%9, %13, %83, %13, %13, %11) : (index, index, index, index, index, index) -> vector<4x1x8x1x1x8xi1>
              %127 = "vector.transfer_read"(%85, %15, %15, %15, %15, %15, %15, %4, %126) <{in_bounds = [true, true, true, true, true, true], operandSegmentSizes = array<i32: 1, 6, 1, 1>, permutation_map = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>}> : (tensor<4x1x?x1x1x8xf16>, index, index, index, index, index, index, f16, vector<4x1x8x1x1x8xi1>) -> vector<4x1x8x1x1x8xf16>
              %128 = "arith.extf"(%125) : (vector<4x1x1x8x1x1xf16>) -> vector<4x1x1x8x1x1xf32>
              %129 = "arith.extf"(%127) : (vector<4x1x8x1x1x8xf16>) -> vector<4x1x8x1x1x8xf32>
              %130 = "vector.create_mask"(%9, %13, %13, %11, %83, %13, %13) : (index, index, index, index, index, index, index) -> vector<4x1x1x8x8x1x1xi1>
              %131 = "vector.mask"(%130) ({
                %132 = "vector.contract"(%128, %129, %124) <{indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d4, d5, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d4, d5, d6, d3)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)>], iterator_types = [#vector.iterator_type<parallel>, #vector.iterator_type<parallel>, #vector.iterator_type<parallel>, #vector.iterator_type<parallel>, #vector.iterator_type<reduction>, #vector.iterator_type<reduction>, #vector.iterator_type<reduction>], kind = #vector.kind<add>}> : (vector<4x1x1x8x1x1xf32>, vector<4x1x8x1x1x8xf32>, vector<4x1x1x8xf32>) -> vector<4x1x1x8xf32>
                "vector.yield"(%132) : (vector<4x1x1x8xf32>) -> ()
              }) : (vector<4x1x1x8x8x1x1xi1>) -> vector<4x1x1x8xf32>
              "scf.yield"(%108, %131, %118) : (tensor<4x1x1xf32>, vector<4x1x1x8xf32>, vector<4x1x1xf32>) -> ()
            }) : (index, index, index, tensor<4x1x1xf32>, vector<4x1x1x8xf32>, vector<4x1x1xf32>) -> (tensor<4x1x1xf32>, vector<4x1x1x8xf32>, vector<4x1x1xf32>)
            %77 = "vector.broadcast"(%76#2) : (vector<4x1x1xf32>) -> vector<8x4x1x1xf32>
            %78 = "arith.divf"(%0, %77) <{fastmath = #arith.fastmath<none>}> : (vector<8x4x1x1xf32>, vector<8x4x1x1xf32>) -> vector<8x4x1x1xf32>
            %79 = "vector.transpose"(%78) <{permutation = array<i64: 1, 2, 3, 0>}> : (vector<8x4x1x1xf32>) -> vector<4x1x1x8xf32>
            %80 = "arith.mulf"(%79, %76#1) <{fastmath = #arith.fastmath<none>}> : (vector<4x1x1x8xf32>, vector<4x1x1x8xf32>) -> vector<4x1x1x8xf32>
            %81 = "arith.truncf"(%80) : (vector<4x1x1x8xf32>) -> vector<4x1x1x8xf16>
            %82 = "vector.transfer_write"(%81, %arg11, %15, %15, %arg8, %arg10) <{in_bounds = [true, true, true, true], operandSegmentSizes = array<i32: 1, 1, 4, 0>, permutation_map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>}> : (vector<4x1x1x8xf16>, tensor<4x1x?x32xf16>, index, index, index, index) -> tensor<4x1x?x32xf16>
            "scf.yield"(%82) : (tensor<4x1x?x32xf16>) -> ()
          }) : (index, index, index, tensor<4x1x?x32xf16>) -> tensor<4x1x?x32xf16>
          "scf.yield"(%74) : (tensor<4x1x?x32xf16>) -> ()
        }) : (index, index, index, tensor<4x1x?x32xf16>) -> tensor<4x1x?x32xf16>
        %64 = "tensor.extract_slice"(%arg3, %arg1, %arg0, %arg2, %59) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: 4, -9223372036854775808, 1, 32>, static_strides = array<i64: 1, 1, 1, 1>}> : (tensor<4x?x4x128xf16>, index, index, index, index) -> tensor<4x?x1x32xf16>
        %65 = "scf.for"(%15, %59, %13, %64) ({
        ^bb0(%arg4: index, %arg5: tensor<4x?x1x32xf16>):
          %66 = "scf.for"(%15, %12, %11, %arg5) ({
          ^bb0(%arg6: index, %arg7: tensor<4x?x1x32xf16>):
            %67 = "vector.transfer_read"(%63, %15, %15, %arg4, %arg6, %4) <{in_bounds = [true, true, true, true], operandSegmentSizes = array<i32: 1, 4, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>}> : (tensor<4x1x?x32xf16>, index, index, index, index, f16) -> vector<4x1x1x8xf16>
            %68 = "vector.transpose"(%67) <{permutation = array<i64: 0, 2, 1, 3>}> : (vector<4x1x1x8xf16>) -> vector<4x1x1x8xf16>
            %69 = "vector.transfer_write"(%68, %arg7, %15, %arg4, %15, %arg6) <{in_bounds = [true, true, true, true], operandSegmentSizes = array<i32: 1, 1, 4, 0>, permutation_map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>}> : (vector<4x1x1x8xf16>, tensor<4x?x1x32xf16>, index, index, index, index) -> tensor<4x?x1x32xf16>
            "scf.yield"(%69) : (tensor<4x?x1x32xf16>) -> ()
          }) : (index, index, index, tensor<4x?x1x32xf16>) -> tensor<4x?x1x32xf16>
          "scf.yield"(%66) : (tensor<4x?x1x32xf16>) -> ()
        }) : (index, index, index, tensor<4x?x1x32xf16>) -> tensor<4x?x1x32xf16>
        "scf.forall.in_parallel"() ({
          "tensor.parallel_insert_slice"(%65, %arg3, %arg1, %arg0, %arg2, %59) <{operandSegmentSizes = array<i32: 1, 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: 4, -9223372036854775808, 1, 32>, static_strides = array<i64: 1, 1, 1, 1>}> : (tensor<4x?x1x32xf16>, tensor<4x?x4x128xf16>, index, index, index, index) -> ()
        }) : () -> ()
      }) : (index, tensor<4x?x4x128xf16>) -> tensor<4x?x4x128xf16>
      "flow.dispatch.tensor.store"(%58, %52, %47, %47) <{operandSegmentSizes = array<i32: 1, 1, 1, 0, 1, 0>, static_offsets = array<i64: 0, 0, 0, 0>, static_sizes = array<i64: 4, -9223372036854775808, 4, 128>, static_strides = array<i64: 1, 1, 1, 1>}> : (tensor<4x?x4x128xf16>, !flow.dispatch.tensor<writeonly:tensor<4x?x4x128xf16>>, index, index) -> ()
      "func.return"() : () -> ()
    }) {translation_info = #iree_codegen.translation_info<pipeline = CPULinalgExtTileAndVectorize>} : () -> ()
  }) : () -> ()
  "hal.executable.variant_end"() : () -> ()
 }) {sym_name = "embedded_elf_x86_64", target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>} : () -> ()