Skip to content

Instantly share code, notes, and snippets.

@pashu123
Created November 22, 2024 13:50
Show Gist options
  • Save pashu123/7f9e4b6057be8d860855825a26496166 to your computer and use it in GitHub Desktop.
Save pashu123/7f9e4b6057be8d860855825a26496166 to your computer and use it in GitHub Desktop.
failed to translate executables
prefill_8b_tp8.mlir:9903:13: error: One or more operations with large vector sizes (8192 bytes) were found:
%3425 = torch.aten.transpose.int %3417#0, %int1_1244, %int2_1245 : !torch.vtensor<[4,4,?,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[4,?,4,128],f16>
^
prefill_8b_tp8.mlir:9857:15: note: %67 = vector.transfer_read %extracted_slice_8[%c0, %c0, %c0, %c0, %c0, %c0], %cst_3, %66 {in_bounds = [true, true, true, true, true, true]} : tensor<4x1x?x1x1x128xf16>, vector<4x1x8x1x1x128xf16>
%3417:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%3393, %3401, %3409, %float0.000000e00, %true_1213, %none_1214, %none_1215) : (!torch.vtensor<[4,4,?,128],f16>, !torch.vtensor<[4,4,?,128],f16>, !torch.vtensor<[4,4,?,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[4,4,?,128],f16>, !torch.vtensor<[4,4,?],f32>)
^
prefill_8b_tp8.mlir:9857:15: note: %69 = arith.extf %67 : vector<4x1x8x1x1x128xf16> to vector<4x1x8x1x1x128xf32>
prefill_8b_tp8.mlir:9857:15: note: %108 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d4, d5, d6, d3)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d4, d5, d6)>], iterator_types = ["parallel", "parallel", "parallel", "reduction", "parallel", "parallel", "parallel"], kind = #vector.kind<add>} %52, %69, %68 : vector<4x1x1x128xf32>, vector<4x1x8x1x1x128xf32> into vector<4x1x1x8x1x1xf32>
prefill_8b_tp8.mlir:9903:13: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
%3425 = torch.aten.transpose.int %3417#0, %int1_1244, %int2_1245 : !torch.vtensor<[4,4,?,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[4,?,4,128],f16>
^
prefill_8b_tp8.mlir:9903:13: note: see current operation:
"hal.executable.variant"() ({
"hal.executable.export"() ({
^bb0(%arg16: !hal.device, %arg17: index):
%136:3 = "flow.dispatch.workgroup_count_from_slice"(%arg17) : (index) -> (index, index, index)
"hal.return"(%136#0, %136#1, %136#2) : (index, index, index) -> ()
}) {layout = #hal.pipeline.layout<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index, sym_name = "prefill_bs4$async_dispatch_122_transpose_4x4xDx128_f16"} : () -> ()
"builtin.module"() ({
"func.func"() <{function_type = () -> (), sym_name = "prefill_bs4$async_dispatch_122_transpose_4x4xDx128_f16"}> ({
%0 = "arith.constant"() <{value = dense<1.000000e+00> : vector<8x4x1x1xf32>}> : () -> vector<8x4x1x1xf32>
%1 = "arith.constant"() <{value = dense<1.274410e-01> : vector<4x1x1x128xf16>}> : () -> vector<4x1x1x128xf16>
%2 = "arith.constant"() <{value = dense<1.44269502> : vector<4x1x1x8x1x1xf32>}> : () -> vector<4x1x1x8x1x1xf32>
%3 = "arith.constant"() <{value = dense<0.000000e+00> : vector<4x1x1x8x1x1xf32>}> : () -> vector<4x1x1x8x1x1xf32>
%4 = "arith.constant"() <{value = 0.000000e+00 : f16}> : () -> f16
%5 = "arith.constant"() <{value = 128 : index}> : () -> index
%6 = "arith.constant"() <{value = dense<0.000000e+00> : vector<4x1x1xf32>}> : () -> vector<4x1x1xf32>
%7 = "arith.constant"() <{value = dense<-3.40282347E+38> : vector<4x1x1xf32>}> : () -> vector<4x1x1xf32>
%8 = "arith.constant"() <{value = dense<0.000000e+00> : vector<4x1x1x8xf32>}> : () -> vector<4x1x1x8xf32>
%9 = "arith.constant"() <{value = 4 : index}> : () -> index
%10 = "arith.constant"() <{value = 0.000000e+00 : f32}> : () -> f32
%11 = "arith.constant"() <{value = 8 : index}> : () -> index
%12 = "arith.constant"() <{value = 32 : index}> : () -> index
%13 = "arith.constant"() <{value = 1 : index}> : () -> index
%14 = "arith.constant"() <{value = 32 : i64}> : () -> i64
%15 = "arith.constant"() <{value = 0 : index}> : () -> index
%16 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index} : () -> i32
%17 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 1 : index} : () -> i32
%18 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 2 : index} : () -> i32
%19 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 3 : index} : () -> i32
%20 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 4 : index} : () -> i32
%21 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 5 : index} : () -> i32
%22 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 6 : index} : () -> i32
%23 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 7 : index} : () -> i32
%24 = "hal.interface.constant.load"() {layout = #hal.pipeline.layout<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 8 : index} : () -> i32
%25 = "arith.extui"(%16) : (i32) -> i64
%26 = "arith.extui"(%17) : (i32) -> i64
%27 = "arith.shli"(%26, %14) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%28 = "arith.ori"(%25, %27) : (i64, i64) -> i64
%29 = "arith.index_castui"(%28) : (i64) -> index
%30 = "arith.extui"(%18) : (i32) -> i64
%31 = "arith.extui"(%19) : (i32) -> i64
%32 = "arith.shli"(%31, %14) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%33 = "arith.ori"(%30, %32) : (i64, i64) -> i64
%34 = "arith.index_castui"(%33) : (i64) -> index
%35 = "arith.extui"(%20) : (i32) -> i64
%36 = "arith.extui"(%21) : (i32) -> i64
%37 = "arith.shli"(%36, %14) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%38 = "arith.ori"(%35, %37) : (i64, i64) -> i64
%39 = "arith.index_castui"(%38) : (i64) -> index
%40 = "arith.extui"(%22) : (i32) -> i64
%41 = "arith.extui"(%23) : (i32) -> i64
%42 = "arith.shli"(%41, %14) <{overflowFlags = #arith.overflow<none>}> : (i64, i64) -> i64
%43 = "arith.ori"(%40, %42) : (i64, i64) -> i64
%44 = "arith.index_castui"(%43) : (i64) -> index
%45 = "arith.index_castui"(%24) : (i32) -> index
%46 = "util.assume.int"(%45) <{assumptions = [[#util.int.assumption<umin = 16, umax = 131056, udiv = 16>]]}> : (index) -> index
%47 = "flow.dispatch.workload.ordinal"(%46) <{ordinal = 0 : index}> : (index) -> index
%48 = "hal.interface.binding.subspan"(%29, %47) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> !flow.dispatch.tensor<readonly:tensor<4x4x?x128xf16>>
%49 = "hal.interface.binding.subspan"(%34, %47) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> !flow.dispatch.tensor<readonly:tensor<4x4x?x1x1x128xf16>>
%50 = "hal.interface.binding.subspan"(%39, %47) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> !flow.dispatch.tensor<readonly:tensor<4x4x?x1x1x128xf16>>
%51 = "hal.interface.binding.subspan"(%15, %47, %47) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 2>} : (index, index, index) -> !flow.dispatch.tensor<readonly:tensor<4x4x?x?x1x1xf16>>
%52 = "hal.interface.binding.subspan"(%44, %47) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 1>} : (index, index) -> !flow.dispatch.tensor<writeonly:tensor<4x?x4x128xf16>>
%53 = "flow.dispatch.tensor.load"(%48, %47, %47) <{operandSegmentSizes = array<i32: 1, 1, 0, 1, 0>, static_offsets = array<i64: 0, 0, 0, 0>, static_sizes = array<i64: 4, 4, -9223372036854775808, 128>, static_strides = array<i64: 1, 1, 1, 1>}> : (!flow.dispatch.tensor<readonly:tensor<4x4x?x128xf16>>, index, index) -> tensor<4x4x?x128xf16>
%54 = "flow.dispatch.tensor.load"(%49, %47, %47) <{operandSegmentSizes = array<i32: 1, 1, 0, 1, 0>, static_offsets = array<i64: 0, 0, 0, 0, 0, 0>, static_sizes = array<i64: 4, 4, -9223372036854775808, 1, 1, 128>, static_strides = array<i64: 1, 1, 1, 1, 1, 1>}> : (!flow.dispatch.tensor<readonly:tensor<4x4x?x1x1x128xf16>>, index, index) -> tensor<4x4x?x1x1x128xf16>
%55 = "flow.dispatch.tensor.load"(%50, %47, %47) <{operandSegmentSizes = array<i32: 1, 1, 0, 1, 0>, static_offsets = array<i64: 0, 0, 0, 0, 0, 0>, static_sizes = array<i64: 4, 4, -9223372036854775808, 1, 1, 128>, static_strides = array<i64: 1, 1, 1, 1, 1, 1>}> : (!flow.dispatch.tensor<readonly:tensor<4x4x?x1x1x128xf16>>, index, index) -> tensor<4x4x?x1x1x128xf16>
%56 = "flow.dispatch.tensor.load"(%51, %47, %47, %47, %47) <{operandSegmentSizes = array<i32: 1, 2, 0, 2, 0>, static_offsets = array<i64: 0, 0, 0, 0, 0, 0>, static_sizes = array<i64: 4, 4, -9223372036854775808, -9223372036854775808, 1, 1>, static_strides = array<i64: 1, 1, 1, 1, 1, 1>}> : (!flow.dispatch.tensor<readonly:tensor<4x4x?x?x1x1xf16>>, index, index, index, index) -> tensor<4x4x?x?x1x1xf16>
%57 = "tensor.empty"(%47) : (index) -> tensor<4x?x4x128xf16>
%58 = "scf.forall"(%47, %57) <{mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>], operandSegmentSizes = array<i32: 0, 1, 0, 1>, staticLowerBound = array<i64: 0, 0, 0>, staticStep = array<i64: 1, 64, 32>, staticUpperBound = array<i64: 4, -9223372036854775808, 128>}> ({
^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: tensor<4x?x4x128xf16>):
%59 = "affine.min"(%arg1, %47) <{map = affine_map<(d0)[s0] -> (-d0 + s0, 64)>}> : (index, index) -> index
%60 = "tensor.empty"(%59) : (index) -> tensor<4x1x?x32xf16>
%61 = "tensor.empty"() : () -> tensor<4x1x1xf32>
%62 = "vector.transfer_write"(%7, %61, %15, %15, %15) <{in_bounds = [true, true, true], operandSegmentSizes = array<i32: 1, 1, 3, 0>, permutation_map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>}> : (vector<4x1x1xf32>, tensor<4x1x1xf32>, index, index, index) -> tensor<4x1x1xf32>
%63 = "scf.for"(%15, %59, %13, %60) ({
^bb0(%arg8: index, %arg9: tensor<4x1x?x32xf16>):
%70 = "affine.apply"(%arg8, %arg1) <{map = affine_map<()[s0, s1] -> (s0 + s1)>}> : (index, index) -> index
%71 = "vector.transfer_read"(%53, %15, %arg0, %70, %15, %4) <{in_bounds = [true, true, true, true], operandSegmentSizes = array<i32: 1, 4, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>}> : (tensor<4x4x?x128xf16>, index, index, index, index, f16) -> vector<4x1x1x128xf16>
%72 = "arith.mulf"(%71, %1) <{fastmath = #arith.fastmath<none>}> : (vector<4x1x1x128xf16>, vector<4x1x1x128xf16>) -> vector<4x1x1x128xf16>
%73 = "arith.extf"(%72) : (vector<4x1x1x128xf16>) -> vector<4x1x1x128xf32>
%74 = "scf.for"(%15, %12, %11, %arg9) ({
^bb0(%arg10: index, %arg11: tensor<4x1x?x32xf16>):
%75 = "affine.apply"(%arg10, %arg2) <{map = affine_map<()[s0, s1] -> (s0 + s1)>}> : (index, index) -> index
%76:3 = "scf.for"(%15, %47, %11, %62, %8, %6) ({
^bb0(%arg12: index, %arg13: tensor<4x1x1xf32>, %arg14: vector<4x1x1x8xf32>, %arg15: vector<4x1x1xf32>):
%83 = "affine.min"(%arg12, %47) <{map = affine_map<(d0)[s0] -> (-d0 + s0, 8)>}> : (index, index) -> index
%84 = "tensor.extract_slice"(%54, %arg0, %arg12, %83) <{operandSegmentSizes = array<i32: 1, 2, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, 0, 0, 0>, static_sizes = array<i64: 4, 1, -9223372036854775808, 1, 1, 128>, static_strides = array<i64: 1, 1, 1, 1, 1, 1>}> : (tensor<4x4x?x1x1x128xf16>, index, index, index) -> tensor<4x1x?x1x1x128xf16>
%85 = "tensor.extract_slice"(%55, %arg0, %arg12, %75, %83) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, 0, 0, -9223372036854775808>, static_sizes = array<i64: 4, 1, -9223372036854775808, 1, 1, 8>, static_strides = array<i64: 1, 1, 1, 1, 1, 1>}> : (tensor<4x4x?x1x1x128xf16>, index, index, index, index) -> tensor<4x1x?x1x1x8xf16>
%86 = "tensor.extract_slice"(%56, %arg0, %70, %arg12, %83) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808, 0, 0>, static_sizes = array<i64: 4, 1, 1, -9223372036854775808, 1, 1>, static_strides = array<i64: 1, 1, 1, 1, 1, 1>}> : (tensor<4x4x?x?x1x1xf16>, index, index, index, index) -> tensor<4x1x1x?x1x1xf16>
%87 = "tensor.empty"(%83) : (index) -> tensor<4x1x1x?x1x1xf32>
%88 = "vector.create_mask"(%9, %13, %13, %83, %13, %13) : (index, index, index, index, index, index) -> vector<4x1x1x8x1x1xi1>
%89 = "vector.transfer_write"(%3, %87, %15, %15, %15, %15, %15, %15, %88) <{in_bounds = [true, true, true, true, true, true], operandSegmentSizes = array<i32: 1, 1, 6, 1>, permutation_map = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>}> : (vector<4x1x1x8x1x1xf32>, tensor<4x1x1x?x1x1xf32>, index, index, index, index, index, index, vector<4x1x1x8x1x1xi1>) -> tensor<4x1x1x?x1x1xf32>
%90 = "vector.create_mask"(%9, %13, %83, %13, %13, %5) : (index, index, index, index, index, index) -> vector<4x1x8x1x1x128xi1>
%91 = "vector.transfer_read"(%84, %15, %15, %15, %15, %15, %15, %4, %90) <{in_bounds = [true, true, true, true, true, true], operandSegmentSizes = array<i32: 1, 6, 1, 1>, permutation_map = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>}> : (tensor<4x1x?x1x1x128xf16>, index, index, index, index, index, index, f16, vector<4x1x8x1x1x128xi1>) -> vector<4x1x8x1x1x128xf16>
%92 = "vector.transfer_read"(%89, %15, %15, %15, %15, %15, %15, %10, %88) <{in_bounds = [true, true, true, true, true, true], operandSegmentSizes = array<i32: 1, 6, 1, 1>, permutation_map = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>}> : (tensor<4x1x1x?x1x1xf32>, index, index, index, index, index, index, f32, vector<4x1x1x8x1x1xi1>) -> vector<4x1x1x8x1x1xf32>
%93 = "arith.extf"(%91) : (vector<4x1x8x1x1x128xf16>) -> vector<4x1x8x1x1x128xf32>
%94 = "vector.create_mask"(%9, %13, %13, %5, %83, %13, %13) : (index, index, index, index, index, index, index) -> vector<4x1x1x128x8x1x1xi1>
%95 = "vector.mask"(%94) ({
%135 = "vector.contract"(%73, %93, %92) <{indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d4, d5, d6, d3)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d4, d5, d6)>], iterator_types = [#vector.iterator_type<parallel>, #vector.iterator_type<parallel>, #vector.iterator_type<parallel>, #vector.iterator_type<reduction>, #vector.iterator_type<parallel>, #vector.iterator_type<parallel>, #vector.iterator_type<parallel>], kind = #vector.kind<add>}> : (vector<4x1x1x128xf32>, vector<4x1x8x1x1x128xf32>, vector<4x1x1x8x1x1xf32>) -> vector<4x1x1x8x1x1xf32>
"vector.yield"(%135) : (vector<4x1x1x8x1x1xf32>) -> ()
}) : (vector<4x1x1x128x8x1x1xi1>) -> vector<4x1x1x8x1x1xf32>
%96 = "vector.transfer_write"(%95, %89, %15, %15, %15, %15, %15, %15, %88) <{in_bounds = [true, true, true, true, true, true], operandSegmentSizes = array<i32: 1, 1, 6, 1>, permutation_map = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>}> : (vector<4x1x1x8x1x1xf32>, tensor<4x1x1x?x1x1xf32>, index, index, index, index, index, index, vector<4x1x1x8x1x1xi1>) -> tensor<4x1x1x?x1x1xf32>
%97 = "vector.transfer_read"(%96, %15, %15, %15, %15, %15, %15, %10, %88) <{in_bounds = [true, true, true, true, true, true], operandSegmentSizes = array<i32: 1, 6, 1, 1>, permutation_map = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>}> : (tensor<4x1x1x?x1x1xf32>, index, index, index, index, index, index, f32, vector<4x1x1x8x1x1xi1>) -> vector<4x1x1x8x1x1xf32>
%98 = "vector.transfer_write"(%97, %96, %15, %15, %15, %15, %15, %15, %88) <{in_bounds = [true, true, true, true, true, true], operandSegmentSizes = array<i32: 1, 1, 6, 1>, permutation_map = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>}> : (vector<4x1x1x8x1x1xf32>, tensor<4x1x1x?x1x1xf32>, index, index, index, index, index, index, vector<4x1x1x8x1x1xi1>) -> tensor<4x1x1x?x1x1xf32>
%99 = "vector.transfer_read"(%86, %15, %15, %15, %15, %15, %15, %4, %88) <{in_bounds = [true, true, true, true, true, true], operandSegmentSizes = array<i32: 1, 6, 1, 1>, permutation_map = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>}> : (tensor<4x1x1x?x1x1xf16>, index, index, index, index, index, index, f16, vector<4x1x1x8x1x1xi1>) -> vector<4x1x1x8x1x1xf16>
%100 = "vector.transfer_read"(%98, %15, %15, %15, %15, %15, %15, %10, %88) <{in_bounds = [true, true, true, true, true, true], operandSegmentSizes = array<i32: 1, 6, 1, 1>, permutation_map = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>}> : (tensor<4x1x1x?x1x1xf32>, index, index, index, index, index, index, f32, vector<4x1x1x8x1x1xi1>) -> vector<4x1x1x8x1x1xf32>
%101 = "arith.extf"(%99) : (vector<4x1x1x8x1x1xf16>) -> vector<4x1x1x8x1x1xf32>
%102 = "arith.mulf"(%101, %2) <{fastmath = #arith.fastmath<none>}> : (vector<4x1x1x8x1x1xf32>, vector<4x1x1x8x1x1xf32>) -> vector<4x1x1x8x1x1xf32>
%103 = "arith.addf"(%100, %102) <{fastmath = #arith.fastmath<none>}> : (vector<4x1x1x8x1x1xf32>, vector<4x1x1x8x1x1xf32>) -> vector<4x1x1x8x1x1xf32>
%104 = "vector.transfer_write"(%103, %98, %15, %15, %15, %15, %15, %15, %88) <{in_bounds = [true, true, true, true, true, true], operandSegmentSizes = array<i32: 1, 1, 6, 1>, permutation_map = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>}> : (vector<4x1x1x8x1x1xf32>, tensor<4x1x1x?x1x1xf32>, index, index, index, index, index, index, vector<4x1x1x8x1x1xi1>) -> tensor<4x1x1x?x1x1xf32>
%105 = "vector.transfer_read"(%104, %15, %15, %15, %15, %15, %15, %10, %88) <{in_bounds = [true, true, true, true, true, true], operandSegmentSizes = array<i32: 1, 6, 1, 1>, permutation_map = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>}> : (tensor<4x1x1x?x1x1xf32>, index, index, index, index, index, index, f32, vector<4x1x1x8x1x1xi1>) -> vector<4x1x1x8x1x1xf32>
%106 = "vector.transfer_read"(%arg13, %15, %15, %15, %10) <{in_bounds = [true, true, true], operandSegmentSizes = array<i32: 1, 3, 1, 0>, permutation_map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>}> : (tensor<4x1x1xf32>, index, index, index, f32) -> vector<4x1x1xf32>
%107 = "vector.mask"(%88) ({
%134 = "vector.multi_reduction"(%105, %106) <{kind = #vector.kind<maximumf>, reduction_dims = array<i64: 3, 4, 5>}> : (vector<4x1x1x8x1x1xf32>, vector<4x1x1xf32>) -> vector<4x1x1xf32>
"vector.yield"(%134) : (vector<4x1x1xf32>) -> ()
}) : (vector<4x1x1x8x1x1xi1>) -> vector<4x1x1xf32>
%108 = "vector.transfer_write"(%107, %arg13, %15, %15, %15) <{in_bounds = [true, true, true], operandSegmentSizes = array<i32: 1, 1, 3, 0>, permutation_map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>}> : (vector<4x1x1xf32>, tensor<4x1x1xf32>, index, index, index) -> tensor<4x1x1xf32>
%109 = "arith.subf"(%106, %107) <{fastmath = #arith.fastmath<none>}> : (vector<4x1x1xf32>, vector<4x1x1xf32>) -> vector<4x1x1xf32>
%110 = "math.exp2"(%109) <{fastmath = #arith.fastmath<none>}> : (vector<4x1x1xf32>) -> vector<4x1x1xf32>
%111 = "arith.mulf"(%110, %arg15) <{fastmath = #arith.fastmath<none>}> : (vector<4x1x1xf32>, vector<4x1x1xf32>) -> vector<4x1x1xf32>
%112 = "vector.broadcast"(%107) : (vector<4x1x1xf32>) -> vector<8x1x1x4x1x1xf32>
%113 = "vector.transpose"(%112) <{permutation = array<i64: 3, 4, 5, 0, 1, 2>}> : (vector<8x1x1x4x1x1xf32>) -> vector<4x1x1x8x1x1xf32>
%114 = "arith.subf"(%105, %113) <{fastmath = #arith.fastmath<none>}> : (vector<4x1x1x8x1x1xf32>, vector<4x1x1x8x1x1xf32>) -> vector<4x1x1x8x1x1xf32>
%115 = "math.exp2"(%114) <{fastmath = #arith.fastmath<none>}> : (vector<4x1x1x8x1x1xf32>) -> vector<4x1x1x8x1x1xf32>
%116 = "vector.transfer_write"(%115, %104, %15, %15, %15, %15, %15, %15, %88) <{in_bounds = [true, true, true, true, true, true], operandSegmentSizes = array<i32: 1, 1, 6, 1>, permutation_map = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>}> : (vector<4x1x1x8x1x1xf32>, tensor<4x1x1x?x1x1xf32>, index, index, index, index, index, index, vector<4x1x1x8x1x1xi1>) -> tensor<4x1x1x?x1x1xf32>
%117 = "vector.transfer_read"(%116, %15, %15, %15, %15, %15, %15, %10, %88) <{in_bounds = [true, true, true, true, true, true], operandSegmentSizes = array<i32: 1, 6, 1, 1>, permutation_map = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>}> : (tensor<4x1x1x?x1x1xf32>, index, index, index, index, index, index, f32, vector<4x1x1x8x1x1xi1>) -> vector<4x1x1x8x1x1xf32>
%118 = "vector.mask"(%88) ({
%133 = "vector.multi_reduction"(%117, %111) <{kind = #vector.kind<add>, reduction_dims = array<i64: 3, 4, 5>}> : (vector<4x1x1x8x1x1xf32>, vector<4x1x1xf32>) -> vector<4x1x1xf32>
"vector.yield"(%133) : (vector<4x1x1xf32>) -> ()
}) : (vector<4x1x1x8x1x1xi1>) -> vector<4x1x1xf32>
%119 = "tensor.empty"(%83) : (index) -> tensor<4x1x1x?x1x1xf16>
%120 = "arith.truncf"(%117) : (vector<4x1x1x8x1x1xf32>) -> vector<4x1x1x8x1x1xf16>
%121 = "vector.transfer_write"(%120, %119, %15, %15, %15, %15, %15, %15, %88) <{in_bounds = [true, true, true, true, true, true], operandSegmentSizes = array<i32: 1, 1, 6, 1>, permutation_map = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>}> : (vector<4x1x1x8x1x1xf16>, tensor<4x1x1x?x1x1xf16>, index, index, index, index, index, index, vector<4x1x1x8x1x1xi1>) -> tensor<4x1x1x?x1x1xf16>
%122 = "vector.broadcast"(%110) : (vector<4x1x1xf32>) -> vector<8x4x1x1xf32>
%123 = "vector.transpose"(%122) <{permutation = array<i64: 1, 2, 3, 0>}> : (vector<8x4x1x1xf32>) -> vector<4x1x1x8xf32>
%124 = "arith.mulf"(%123, %arg14) <{fastmath = #arith.fastmath<none>}> : (vector<4x1x1x8xf32>, vector<4x1x1x8xf32>) -> vector<4x1x1x8xf32>
%125 = "vector.transfer_read"(%121, %15, %15, %15, %15, %15, %15, %4, %88) <{in_bounds = [true, true, true, true, true, true], operandSegmentSizes = array<i32: 1, 6, 1, 1>, permutation_map = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>}> : (tensor<4x1x1x?x1x1xf16>, index, index, index, index, index, index, f16, vector<4x1x1x8x1x1xi1>) -> vector<4x1x1x8x1x1xf16>
%126 = "vector.create_mask"(%9, %13, %83, %13, %13, %11) : (index, index, index, index, index, index) -> vector<4x1x8x1x1x8xi1>
%127 = "vector.transfer_read"(%85, %15, %15, %15, %15, %15, %15, %4, %126) <{in_bounds = [true, true, true, true, true, true], operandSegmentSizes = array<i32: 1, 6, 1, 1>, permutation_map = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>}> : (tensor<4x1x?x1x1x8xf16>, index, index, index, index, index, index, f16, vector<4x1x8x1x1x8xi1>) -> vector<4x1x8x1x1x8xf16>
%128 = "arith.extf"(%125) : (vector<4x1x1x8x1x1xf16>) -> vector<4x1x1x8x1x1xf32>
%129 = "arith.extf"(%127) : (vector<4x1x8x1x1x8xf16>) -> vector<4x1x8x1x1x8xf32>
%130 = "vector.create_mask"(%9, %13, %13, %11, %83, %13, %13) : (index, index, index, index, index, index, index) -> vector<4x1x1x8x8x1x1xi1>
%131 = "vector.mask"(%130) ({
%132 = "vector.contract"(%128, %129, %124) <{indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d4, d5, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d4, d5, d6, d3)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)>], iterator_types = [#vector.iterator_type<parallel>, #vector.iterator_type<parallel>, #vector.iterator_type<parallel>, #vector.iterator_type<parallel>, #vector.iterator_type<reduction>, #vector.iterator_type<reduction>, #vector.iterator_type<reduction>], kind = #vector.kind<add>}> : (vector<4x1x1x8x1x1xf32>, vector<4x1x8x1x1x8xf32>, vector<4x1x1x8xf32>) -> vector<4x1x1x8xf32>
"vector.yield"(%132) : (vector<4x1x1x8xf32>) -> ()
}) : (vector<4x1x1x8x8x1x1xi1>) -> vector<4x1x1x8xf32>
"scf.yield"(%108, %131, %118) : (tensor<4x1x1xf32>, vector<4x1x1x8xf32>, vector<4x1x1xf32>) -> ()
}) : (index, index, index, tensor<4x1x1xf32>, vector<4x1x1x8xf32>, vector<4x1x1xf32>) -> (tensor<4x1x1xf32>, vector<4x1x1x8xf32>, vector<4x1x1xf32>)
%77 = "vector.broadcast"(%76#2) : (vector<4x1x1xf32>) -> vector<8x4x1x1xf32>
%78 = "arith.divf"(%0, %77) <{fastmath = #arith.fastmath<none>}> : (vector<8x4x1x1xf32>, vector<8x4x1x1xf32>) -> vector<8x4x1x1xf32>
%79 = "vector.transpose"(%78) <{permutation = array<i64: 1, 2, 3, 0>}> : (vector<8x4x1x1xf32>) -> vector<4x1x1x8xf32>
%80 = "arith.mulf"(%79, %76#1) <{fastmath = #arith.fastmath<none>}> : (vector<4x1x1x8xf32>, vector<4x1x1x8xf32>) -> vector<4x1x1x8xf32>
%81 = "arith.truncf"(%80) : (vector<4x1x1x8xf32>) -> vector<4x1x1x8xf16>
%82 = "vector.transfer_write"(%81, %arg11, %15, %15, %arg8, %arg10) <{in_bounds = [true, true, true, true], operandSegmentSizes = array<i32: 1, 1, 4, 0>, permutation_map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>}> : (vector<4x1x1x8xf16>, tensor<4x1x?x32xf16>, index, index, index, index) -> tensor<4x1x?x32xf16>
"scf.yield"(%82) : (tensor<4x1x?x32xf16>) -> ()
}) : (index, index, index, tensor<4x1x?x32xf16>) -> tensor<4x1x?x32xf16>
"scf.yield"(%74) : (tensor<4x1x?x32xf16>) -> ()
}) : (index, index, index, tensor<4x1x?x32xf16>) -> tensor<4x1x?x32xf16>
%64 = "tensor.extract_slice"(%arg3, %arg1, %arg0, %arg2, %59) <{operandSegmentSizes = array<i32: 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: 4, -9223372036854775808, 1, 32>, static_strides = array<i64: 1, 1, 1, 1>}> : (tensor<4x?x4x128xf16>, index, index, index, index) -> tensor<4x?x1x32xf16>
%65 = "scf.for"(%15, %59, %13, %64) ({
^bb0(%arg4: index, %arg5: tensor<4x?x1x32xf16>):
%66 = "scf.for"(%15, %12, %11, %arg5) ({
^bb0(%arg6: index, %arg7: tensor<4x?x1x32xf16>):
%67 = "vector.transfer_read"(%63, %15, %15, %arg4, %arg6, %4) <{in_bounds = [true, true, true, true], operandSegmentSizes = array<i32: 1, 4, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>}> : (tensor<4x1x?x32xf16>, index, index, index, index, f16) -> vector<4x1x1x8xf16>
%68 = "vector.transpose"(%67) <{permutation = array<i64: 0, 2, 1, 3>}> : (vector<4x1x1x8xf16>) -> vector<4x1x1x8xf16>
%69 = "vector.transfer_write"(%68, %arg7, %15, %arg4, %15, %arg6) <{in_bounds = [true, true, true, true], operandSegmentSizes = array<i32: 1, 1, 4, 0>, permutation_map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>}> : (vector<4x1x1x8xf16>, tensor<4x?x1x32xf16>, index, index, index, index) -> tensor<4x?x1x32xf16>
"scf.yield"(%69) : (tensor<4x?x1x32xf16>) -> ()
}) : (index, index, index, tensor<4x?x1x32xf16>) -> tensor<4x?x1x32xf16>
"scf.yield"(%66) : (tensor<4x?x1x32xf16>) -> ()
}) : (index, index, index, tensor<4x?x1x32xf16>) -> tensor<4x?x1x32xf16>
"scf.forall.in_parallel"() ({
"tensor.parallel_insert_slice"(%65, %arg3, %arg1, %arg0, %arg2, %59) <{operandSegmentSizes = array<i32: 1, 1, 3, 1, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: 4, -9223372036854775808, 1, 32>, static_strides = array<i64: 1, 1, 1, 1>}> : (tensor<4x?x1x32xf16>, tensor<4x?x4x128xf16>, index, index, index, index) -> ()
}) : () -> ()
}) : (index, tensor<4x?x4x128xf16>) -> tensor<4x?x4x128xf16>
"flow.dispatch.tensor.store"(%58, %52, %47, %47) <{operandSegmentSizes = array<i32: 1, 1, 1, 0, 1, 0>, static_offsets = array<i64: 0, 0, 0, 0>, static_sizes = array<i64: 4, -9223372036854775808, 4, 128>, static_strides = array<i64: 1, 1, 1, 1>}> : (tensor<4x?x4x128xf16>, !flow.dispatch.tensor<writeonly:tensor<4x?x4x128xf16>>, index, index) -> ()
"func.return"() : () -> ()
}) {translation_info = #iree_codegen.translation_info<pipeline = CPULinalgExtTileAndVectorize>} : () -> ()
}) : () -> ()
"hal.executable.variant_end"() : () -> ()
}) {sym_name = "embedded_elf_x86_64", target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>} : () -> ()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment