@pashu123
Created December 5, 2024 14:40
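// NOTE: This gist is a per-pass IR trace for one IREE dispatch
// (main_graph$async_dispatch_2: a tensor.unpack feeding an elementwise f32
// add over 384 elements) compiled for the llvm-cpu backend targeting znver4.
// A trace of this shape is typically produced with something like the
// command below (input/output file names are placeholders, not taken from
// the gist):
//
//   iree-compile model.mlir -o model.vmfb \
//     --iree-hal-target-backends=llvm-cpu \
//     --iree-llvmcpu-target-cpu=znver4 \
//     --mlir-print-ir-after-all 2> pass_dump.mlir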
// -----// IR Dump After AssignLegacyTargetDevicesPass (iree-hal-assign-legacy-target-devices) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+prfchw,-cldemote,+avx,+aes,+sahf,+pclmul,-xop,+crc32,-amx-fp8,+xsaves,-avx512fp16,-usermsr,-sm4,-egpr,+sse4.1,+avx512ifma,+xsave,+sse4.2,-tsxldtrk,-sm3,-ptwrite,-widekl,-movrs,+invpcid,+64bit,+xsavec,-avx10.1-512,+avx512vpopcntdq,+cmov,-avx512vp2intersect,+avx512cd,+movbe,-avxvnniint8,-ccmp,-amx-int8,-kl,-avx10.1-256,+evex512,-avxvnni,-rtm,+adx,+avx2,-hreset,-movdiri,-serialize,-sha512,+vpclmulqdq,+avx512vl,-uintr,-cf,+clflushopt,-raoint,-cmpccxadd,+bmi,-amx-tile,+sse,-avx10.2-256,+gfni,-avxvnniint16,-amx-fp16,-zu,-ndd,+xsaveopt,+rdrnd,+avx512f,-amx-bf16,+avx512bf16,+avx512vnni,-push2pop2,+cx8,+avx512bw,+sse3,+pku,-nf,-amx-tf32,-amx-avx512,+fsgsbase,+clzero,+mwaitx,-lwp,+lzcnt,+sha,-movdir64b,-ppx,+wbnoinvd,-enqcmd,-amx-transpose,-avx10.2-512,-avxneconvert,-tbm,-pconfig,-amx-complex,+ssse3,+cx16,+bmi2,+fma,+popcnt,-avxifma,+f16c,+avx512bitalg,+rdpru,+clwb,+mmx,+sse2,+rdseed,+avx512vbmi2,-prefetchi,-amx-movrs,+rdpid,-fma4,+avx512vbmi,+shstk,+vaes,-waitpkg,-sgx,+fxsr,+avx512dq,+sse4a", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map = affine_map<(d0) -> (d0)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {hal.device.targets = [#device_target_local]} {
hal.executable public @main_graph$async_dispatch_2 {
hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) {
hal.executable.export public @main_graph$async_dispatch_2_unpack_elementwise_384_f32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @main_graph$async_dispatch_2_unpack_elementwise_384_f32() {
%c0 = arith.constant 0 : index
%c98304 = arith.constant 98304 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c98304) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<384xf32>>
%3 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0, 0], sizes = [1, 4, 24, 16, 16], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>> -> tensor<1x4x24x16x16xf32>
%4 = tensor.empty() : tensor<384xf32>
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 1, 384], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>> -> tensor<384xf32>
%6 = tensor.empty() : tensor<1x50x384xf32>
%unpack = tensor.unpack %3 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 16] into %6 : tensor<1x4x24x16x16xf32> -> tensor<1x50x384xf32>
%extracted_slice = tensor.extract_slice %unpack[0, 0, 0] [1, 1, 384] [1, 1, 1] : tensor<1x50x384xf32> to tensor<384xf32>
%7 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%extracted_slice, %5 : tensor<384xf32>, tensor<384xf32>) outs(%4 : tensor<384xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%8 = arith.addf %in, %in_0 : f32
linalg.yield %8 : f32
} -> tensor<384xf32>
flow.dispatch.tensor.store %7, %2, offsets = [0], sizes = [384], strides = [1] : tensor<384xf32> -> !flow.dispatch.tensor<writeonly:tensor<384xf32>>
return
}
}
}
}
}
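// NOTE: The tensor.unpack in the dispatch above undoes a 16x16 tiling of
// dims 1 and 2 of its result: element (0, i, j) of the tensor<1x50x384xf32>
// is read from the packed operand at
// (0, i floordiv 16, j floordiv 16, i mod 16, j mod 16), with the padded
// rows 50..63 of the 4x16 outer/inner layout discarded. The following
// tensor.extract_slice keeps only row (0, 0, :), so the whole dispatch
// reduces to out[j] = unpack(%3)[0, 0, j] + %5[j] for j in [0, 384).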
// -----// IR Dump After MaterializeTargetDevicesPass (iree-hal-materialize-target-devices) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+prfchw,-cldemote,+avx,+aes,+sahf,+pclmul,-xop,+crc32,-amx-fp8,+xsaves,-avx512fp16,-usermsr,-sm4,-egpr,+sse4.1,+avx512ifma,+xsave,+sse4.2,-tsxldtrk,-sm3,-ptwrite,-widekl,-movrs,+invpcid,+64bit,+xsavec,-avx10.1-512,+avx512vpopcntdq,+cmov,-avx512vp2intersect,+avx512cd,+movbe,-avxvnniint8,-ccmp,-amx-int8,-kl,-avx10.1-256,+evex512,-avxvnni,-rtm,+adx,+avx2,-hreset,-movdiri,-serialize,-sha512,+vpclmulqdq,+avx512vl,-uintr,-cf,+clflushopt,-raoint,-cmpccxadd,+bmi,-amx-tile,+sse,-avx10.2-256,+gfni,-avxvnniint16,-amx-fp16,-zu,-ndd,+xsaveopt,+rdrnd,+avx512f,-amx-bf16,+avx512bf16,+avx512vnni,-push2pop2,+cx8,+avx512bw,+sse3,+pku,-nf,-amx-tf32,-amx-avx512,+fsgsbase,+clzero,+mwaitx,-lwp,+lzcnt,+sha,-movdir64b,-ppx,+wbnoinvd,-enqcmd,-amx-transpose,-avx10.2-512,-avxneconvert,-tbm,-pconfig,-amx-complex,+ssse3,+cx16,+bmi2,+fma,+popcnt,-avxifma,+f16c,+avx512bitalg,+rdpru,+clwb,+mmx,+sse2,+rdseed,+avx512vbmi2,-prefetchi,-amx-movrs,+rdpid,-fma4,+avx512vbmi,+shstk,+vaes,-waitpkg,-sgx,+fxsr,+avx512dq,+sse4a", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map = affine_map<(d0) -> (d0)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
hal.executable public @main_graph$async_dispatch_2 {
hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) {
hal.executable.export public @main_graph$async_dispatch_2_unpack_elementwise_384_f32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @main_graph$async_dispatch_2_unpack_elementwise_384_f32() {
%c0 = arith.constant 0 : index
%c98304 = arith.constant 98304 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c98304) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<384xf32>>
%3 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0, 0], sizes = [1, 4, 24, 16, 16], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>> -> tensor<1x4x24x16x16xf32>
%4 = tensor.empty() : tensor<384xf32>
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 1, 384], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>> -> tensor<384xf32>
%6 = tensor.empty() : tensor<1x50x384xf32>
%unpack = tensor.unpack %3 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 16] into %6 : tensor<1x4x24x16x16xf32> -> tensor<1x50x384xf32>
%extracted_slice = tensor.extract_slice %unpack[0, 0, 0] [1, 1, 384] [1, 1, 1] : tensor<1x50x384xf32> to tensor<384xf32>
%7 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%extracted_slice, %5 : tensor<384xf32>, tensor<384xf32>) outs(%4 : tensor<384xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%8 = arith.addf %in, %in_0 : f32
linalg.yield %8 : f32
} -> tensor<384xf32>
flow.dispatch.tensor.store %7, %2, offsets = [0], sizes = [384], strides = [1] : tensor<384xf32> -> !flow.dispatch.tensor<writeonly:tensor<384xf32>>
return
}
}
}
}
}
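// NOTE: MaterializeTargetDevicesPass makes the first change in the trace:
// the hal.device.targets module attribute from the previous dump is
// rewritten into a util.global @__device_0 plus a stream.affinity.default
// module attribute. The next three dumps (ResolveDevicePromises,
// ResolveDeviceAliases, VerifyDevices) leave this module unchanged.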
// -----// IR Dump After ResolveDevicePromisesPass (iree-hal-resolve-device-promises) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+prfchw,-cldemote,+avx,+aes,+sahf,+pclmul,-xop,+crc32,-amx-fp8,+xsaves,-avx512fp16,-usermsr,-sm4,-egpr,+sse4.1,+avx512ifma,+xsave,+sse4.2,-tsxldtrk,-sm3,-ptwrite,-widekl,-movrs,+invpcid,+64bit,+xsavec,-avx10.1-512,+avx512vpopcntdq,+cmov,-avx512vp2intersect,+avx512cd,+movbe,-avxvnniint8,-ccmp,-amx-int8,-kl,-avx10.1-256,+evex512,-avxvnni,-rtm,+adx,+avx2,-hreset,-movdiri,-serialize,-sha512,+vpclmulqdq,+avx512vl,-uintr,-cf,+clflushopt,-raoint,-cmpccxadd,+bmi,-amx-tile,+sse,-avx10.2-256,+gfni,-avxvnniint16,-amx-fp16,-zu,-ndd,+xsaveopt,+rdrnd,+avx512f,-amx-bf16,+avx512bf16,+avx512vnni,-push2pop2,+cx8,+avx512bw,+sse3,+pku,-nf,-amx-tf32,-amx-avx512,+fsgsbase,+clzero,+mwaitx,-lwp,+lzcnt,+sha,-movdir64b,-ppx,+wbnoinvd,-enqcmd,-amx-transpose,-avx10.2-512,-avxneconvert,-tbm,-pconfig,-amx-complex,+ssse3,+cx16,+bmi2,+fma,+popcnt,-avxifma,+f16c,+avx512bitalg,+rdpru,+clwb,+mmx,+sse2,+rdseed,+avx512vbmi2,-prefetchi,-amx-movrs,+rdpid,-fma4,+avx512vbmi,+shstk,+vaes,-waitpkg,-sgx,+fxsr,+avx512dq,+sse4a", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map = affine_map<(d0) -> (d0)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
hal.executable public @main_graph$async_dispatch_2 {
hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) {
hal.executable.export public @main_graph$async_dispatch_2_unpack_elementwise_384_f32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @main_graph$async_dispatch_2_unpack_elementwise_384_f32() {
%c0 = arith.constant 0 : index
%c98304 = arith.constant 98304 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c98304) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<384xf32>>
%3 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0, 0], sizes = [1, 4, 24, 16, 16], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>> -> tensor<1x4x24x16x16xf32>
%4 = tensor.empty() : tensor<384xf32>
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 1, 384], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>> -> tensor<384xf32>
%6 = tensor.empty() : tensor<1x50x384xf32>
%unpack = tensor.unpack %3 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 16] into %6 : tensor<1x4x24x16x16xf32> -> tensor<1x50x384xf32>
%extracted_slice = tensor.extract_slice %unpack[0, 0, 0] [1, 1, 384] [1, 1, 1] : tensor<1x50x384xf32> to tensor<384xf32>
%7 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%extracted_slice, %5 : tensor<384xf32>, tensor<384xf32>) outs(%4 : tensor<384xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%8 = arith.addf %in, %in_0 : f32
linalg.yield %8 : f32
} -> tensor<384xf32>
flow.dispatch.tensor.store %7, %2, offsets = [0], sizes = [384], strides = [1] : tensor<384xf32> -> !flow.dispatch.tensor<writeonly:tensor<384xf32>>
return
}
}
}
}
}
// -----// IR Dump After ResolveDeviceAliasesPass (iree-hal-resolve-device-aliases) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+prfchw,-cldemote,+avx,+aes,+sahf,+pclmul,-xop,+crc32,-amx-fp8,+xsaves,-avx512fp16,-usermsr,-sm4,-egpr,+sse4.1,+avx512ifma,+xsave,+sse4.2,-tsxldtrk,-sm3,-ptwrite,-widekl,-movrs,+invpcid,+64bit,+xsavec,-avx10.1-512,+avx512vpopcntdq,+cmov,-avx512vp2intersect,+avx512cd,+movbe,-avxvnniint8,-ccmp,-amx-int8,-kl,-avx10.1-256,+evex512,-avxvnni,-rtm,+adx,+avx2,-hreset,-movdiri,-serialize,-sha512,+vpclmulqdq,+avx512vl,-uintr,-cf,+clflushopt,-raoint,-cmpccxadd,+bmi,-amx-tile,+sse,-avx10.2-256,+gfni,-avxvnniint16,-amx-fp16,-zu,-ndd,+xsaveopt,+rdrnd,+avx512f,-amx-bf16,+avx512bf16,+avx512vnni,-push2pop2,+cx8,+avx512bw,+sse3,+pku,-nf,-amx-tf32,-amx-avx512,+fsgsbase,+clzero,+mwaitx,-lwp,+lzcnt,+sha,-movdir64b,-ppx,+wbnoinvd,-enqcmd,-amx-transpose,-avx10.2-512,-avxneconvert,-tbm,-pconfig,-amx-complex,+ssse3,+cx16,+bmi2,+fma,+popcnt,-avxifma,+f16c,+avx512bitalg,+rdpru,+clwb,+mmx,+sse2,+rdseed,+avx512vbmi2,-prefetchi,-amx-movrs,+rdpid,-fma4,+avx512vbmi,+shstk,+vaes,-waitpkg,-sgx,+fxsr,+avx512dq,+sse4a", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map = affine_map<(d0) -> (d0)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
hal.executable public @main_graph$async_dispatch_2 {
hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) {
hal.executable.export public @main_graph$async_dispatch_2_unpack_elementwise_384_f32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @main_graph$async_dispatch_2_unpack_elementwise_384_f32() {
%c0 = arith.constant 0 : index
%c98304 = arith.constant 98304 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c98304) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<384xf32>>
%3 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0, 0], sizes = [1, 4, 24, 16, 16], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>> -> tensor<1x4x24x16x16xf32>
%4 = tensor.empty() : tensor<384xf32>
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 1, 384], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>> -> tensor<384xf32>
%6 = tensor.empty() : tensor<1x50x384xf32>
%unpack = tensor.unpack %3 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 16] into %6 : tensor<1x4x24x16x16xf32> -> tensor<1x50x384xf32>
%extracted_slice = tensor.extract_slice %unpack[0, 0, 0] [1, 1, 384] [1, 1, 1] : tensor<1x50x384xf32> to tensor<384xf32>
%7 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%extracted_slice, %5 : tensor<384xf32>, tensor<384xf32>) outs(%4 : tensor<384xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%8 = arith.addf %in, %in_0 : f32
linalg.yield %8 : f32
} -> tensor<384xf32>
flow.dispatch.tensor.store %7, %2, offsets = [0], sizes = [384], strides = [1] : tensor<384xf32> -> !flow.dispatch.tensor<writeonly:tensor<384xf32>>
return
}
}
}
}
}
// -----// IR Dump After VerifyDevicesPass (iree-hal-verify-devices) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+prfchw,-cldemote,+avx,+aes,+sahf,+pclmul,-xop,+crc32,-amx-fp8,+xsaves,-avx512fp16,-usermsr,-sm4,-egpr,+sse4.1,+avx512ifma,+xsave,+sse4.2,-tsxldtrk,-sm3,-ptwrite,-widekl,-movrs,+invpcid,+64bit,+xsavec,-avx10.1-512,+avx512vpopcntdq,+cmov,-avx512vp2intersect,+avx512cd,+movbe,-avxvnniint8,-ccmp,-amx-int8,-kl,-avx10.1-256,+evex512,-avxvnni,-rtm,+adx,+avx2,-hreset,-movdiri,-serialize,-sha512,+vpclmulqdq,+avx512vl,-uintr,-cf,+clflushopt,-raoint,-cmpccxadd,+bmi,-amx-tile,+sse,-avx10.2-256,+gfni,-avxvnniint16,-amx-fp16,-zu,-ndd,+xsaveopt,+rdrnd,+avx512f,-amx-bf16,+avx512bf16,+avx512vnni,-push2pop2,+cx8,+avx512bw,+sse3,+pku,-nf,-amx-tf32,-amx-avx512,+fsgsbase,+clzero,+mwaitx,-lwp,+lzcnt,+sha,-movdir64b,-ppx,+wbnoinvd,-enqcmd,-amx-transpose,-avx10.2-512,-avxneconvert,-tbm,-pconfig,-amx-complex,+ssse3,+cx16,+bmi2,+fma,+popcnt,-avxifma,+f16c,+avx512bitalg,+rdpru,+clwb,+mmx,+sse2,+rdseed,+avx512vbmi2,-prefetchi,-amx-movrs,+rdpid,-fma4,+avx512vbmi,+shstk,+vaes,-waitpkg,-sgx,+fxsr,+avx512dq,+sse4a", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map = affine_map<(d0) -> (d0)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_local
hal.executable public @main_graph$async_dispatch_2 {
hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) {
hal.executable.export public @main_graph$async_dispatch_2_unpack_elementwise_384_f32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @main_graph$async_dispatch_2_unpack_elementwise_384_f32() {
%c0 = arith.constant 0 : index
%c98304 = arith.constant 98304 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c98304) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<384xf32>>
%3 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0, 0], sizes = [1, 4, 24, 16, 16], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>> -> tensor<1x4x24x16x16xf32>
%4 = tensor.empty() : tensor<384xf32>
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 1, 384], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>> -> tensor<384xf32>
%6 = tensor.empty() : tensor<1x50x384xf32>
%unpack = tensor.unpack %3 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 16] into %6 : tensor<1x4x24x16x16xf32> -> tensor<1x50x384xf32>
%extracted_slice = tensor.extract_slice %unpack[0, 0, 0] [1, 1, 384] [1, 1, 1] : tensor<1x50x384xf32> to tensor<384xf32>
%7 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%extracted_slice, %5 : tensor<384xf32>, tensor<384xf32>) outs(%4 : tensor<384xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%8 = arith.addf %in, %in_0 : f32
linalg.yield %8 : f32
} -> tensor<384xf32>
flow.dispatch.tensor.store %7, %2, offsets = [0], sizes = [384], strides = [1] : tensor<384xf32> -> !flow.dispatch.tensor<writeonly:tensor<384xf32>>
return
}
}
}
}
}
// -----// IR Dump After TypePropagationPass (iree-codegen-type-propagation) //----- //
func.func @main_graph$async_dispatch_2_unpack_elementwise_384_f32() {
%c0 = arith.constant 0 : index
%c98304 = arith.constant 98304 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c98304) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<384xf32>>
%3 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0, 0], sizes = [1, 4, 24, 16, 16], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>> -> tensor<1x4x24x16x16xf32>
%4 = tensor.empty() : tensor<384xf32>
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 1, 384], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>> -> tensor<384xf32>
%6 = tensor.empty() : tensor<1x50x384xf32>
%unpack = tensor.unpack %3 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 16] into %6 : tensor<1x4x24x16x16xf32> -> tensor<1x50x384xf32>
%extracted_slice = tensor.extract_slice %unpack[0, 0, 0] [1, 1, 384] [1, 1, 1] : tensor<1x50x384xf32> to tensor<384xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%extracted_slice, %5 : tensor<384xf32>, tensor<384xf32>) outs(%4 : tensor<384xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%8 = arith.addf %in, %in_0 : f32
linalg.yield %8 : f32
} -> tensor<384xf32>
flow.dispatch.tensor.store %7, %2, offsets = [0], sizes = [384], strides = [1] : tensor<384xf32> -> !flow.dispatch.tensor<writeonly:tensor<384xf32>>
return
}
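// NOTE: From TypePropagationPass onward the trace follows only the dispatch
// function inside the executable: the hal.executable / hal.executable.variant
// wrappers are no longer printed, and the #pipeline_layout alias is inlined
// into each hal.interface.binding.subspan. The function body itself is
// unchanged through the next several codegen preparation passes.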
// -----// IR Dump After BubbleUpOrdinalOpsPass (iree-codegen-bubble-up-ordinal-ops) //----- //
func.func @main_graph$async_dispatch_2_unpack_elementwise_384_f32() {
%c0 = arith.constant 0 : index
%c98304 = arith.constant 98304 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c98304) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<384xf32>>
%3 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0, 0], sizes = [1, 4, 24, 16, 16], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>> -> tensor<1x4x24x16x16xf32>
%4 = tensor.empty() : tensor<384xf32>
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 1, 384], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>> -> tensor<384xf32>
%6 = tensor.empty() : tensor<1x50x384xf32>
%unpack = tensor.unpack %3 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 16] into %6 : tensor<1x4x24x16x16xf32> -> tensor<1x50x384xf32>
%extracted_slice = tensor.extract_slice %unpack[0, 0, 0] [1, 1, 384] [1, 1, 1] : tensor<1x50x384xf32> to tensor<384xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%extracted_slice, %5 : tensor<384xf32>, tensor<384xf32>) outs(%4 : tensor<384xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%8 = arith.addf %in, %in_0 : f32
linalg.yield %8 : f32
} -> tensor<384xf32>
flow.dispatch.tensor.store %7, %2, offsets = [0], sizes = [384], strides = [1] : tensor<384xf32> -> !flow.dispatch.tensor<writeonly:tensor<384xf32>>
return
}
// -----// IR Dump After BufferizeCopyOnlyDispatchesPass (iree-codegen-bufferize-copy-only-dispatches) //----- //
func.func @main_graph$async_dispatch_2_unpack_elementwise_384_f32() {
%c0 = arith.constant 0 : index
%c98304 = arith.constant 98304 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c98304) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<384xf32>>
%3 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0, 0], sizes = [1, 4, 24, 16, 16], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>> -> tensor<1x4x24x16x16xf32>
%4 = tensor.empty() : tensor<384xf32>
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 1, 384], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>> -> tensor<384xf32>
%6 = tensor.empty() : tensor<1x50x384xf32>
%unpack = tensor.unpack %3 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 16] into %6 : tensor<1x4x24x16x16xf32> -> tensor<1x50x384xf32>
%extracted_slice = tensor.extract_slice %unpack[0, 0, 0] [1, 1, 384] [1, 1, 1] : tensor<1x50x384xf32> to tensor<384xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%extracted_slice, %5 : tensor<384xf32>, tensor<384xf32>) outs(%4 : tensor<384xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%8 = arith.addf %in, %in_0 : f32
linalg.yield %8 : f32
} -> tensor<384xf32>
flow.dispatch.tensor.store %7, %2, offsets = [0], sizes = [384], strides = [1] : tensor<384xf32> -> !flow.dispatch.tensor<writeonly:tensor<384xf32>>
return
}
// -----// IR Dump After DecomposeSoftmaxPass (iree-codegen-decompose-softmax) //----- //
func.func @main_graph$async_dispatch_2_unpack_elementwise_384_f32() {
%c0 = arith.constant 0 : index
%c98304 = arith.constant 98304 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c98304) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<384xf32>>
%3 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0, 0], sizes = [1, 4, 24, 16, 16], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>> -> tensor<1x4x24x16x16xf32>
%4 = tensor.empty() : tensor<384xf32>
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 1, 384], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>> -> tensor<384xf32>
%6 = tensor.empty() : tensor<1x50x384xf32>
%unpack = tensor.unpack %3 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 16] into %6 : tensor<1x4x24x16x16xf32> -> tensor<1x50x384xf32>
%extracted_slice = tensor.extract_slice %unpack[0, 0, 0] [1, 1, 384] [1, 1, 1] : tensor<1x50x384xf32> to tensor<384xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%extracted_slice, %5 : tensor<384xf32>, tensor<384xf32>) outs(%4 : tensor<384xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%8 = arith.addf %in, %in_0 : f32
linalg.yield %8 : f32
} -> tensor<384xf32>
flow.dispatch.tensor.store %7, %2, offsets = [0], sizes = [384], strides = [1] : tensor<384xf32> -> !flow.dispatch.tensor<writeonly:tensor<384xf32>>
return
}
// -----// IR Dump After MaterializeUserConfigsPass (iree-codegen-materialize-user-configs) //----- //
module {
func.func @main_graph$async_dispatch_2_unpack_elementwise_384_f32() {
%c0 = arith.constant 0 : index
%c98304 = arith.constant 98304 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c98304) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<384xf32>>
%3 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0, 0], sizes = [1, 4, 24, 16, 16], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>> -> tensor<1x4x24x16x16xf32>
%4 = tensor.empty() : tensor<384xf32>
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 1, 384], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>> -> tensor<384xf32>
%6 = tensor.empty() : tensor<1x50x384xf32>
%unpack = tensor.unpack %3 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 16] into %6 : tensor<1x4x24x16x16xf32> -> tensor<1x50x384xf32>
%extracted_slice = tensor.extract_slice %unpack[0, 0, 0] [1, 1, 384] [1, 1, 1] : tensor<1x50x384xf32> to tensor<384xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%extracted_slice, %5 : tensor<384xf32>, tensor<384xf32>) outs(%4 : tensor<384xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%8 = arith.addf %in, %in_0 : f32
linalg.yield %8 : f32
} -> tensor<384xf32>
flow.dispatch.tensor.store %7, %2, offsets = [0], sizes = [384], strides = [1] : tensor<384xf32> -> !flow.dispatch.tensor<writeonly:tensor<384xf32>>
return
}
}
// -----// IR Dump After RematerializeParallelOpsPass (iree-codegen-rematerialize-parallel-ops) //----- //
func.func @main_graph$async_dispatch_2_unpack_elementwise_384_f32() {
%c0 = arith.constant 0 : index
%c98304 = arith.constant 98304 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c98304) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<384xf32>>
%3 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0, 0], sizes = [1, 4, 24, 16, 16], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>> -> tensor<1x4x24x16x16xf32>
%4 = tensor.empty() : tensor<384xf32>
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 1, 384], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>> -> tensor<384xf32>
%6 = tensor.empty() : tensor<1x50x384xf32>
%unpack = tensor.unpack %3 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 16] into %6 : tensor<1x4x24x16x16xf32> -> tensor<1x50x384xf32>
%extracted_slice = tensor.extract_slice %unpack[0, 0, 0] [1, 1, 384] [1, 1, 1] : tensor<1x50x384xf32> to tensor<384xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%extracted_slice, %5 : tensor<384xf32>, tensor<384xf32>) outs(%4 : tensor<384xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%8 = arith.addf %in, %in_0 : f32
linalg.yield %8 : f32
} -> tensor<384xf32>
flow.dispatch.tensor.store %7, %2, offsets = [0], sizes = [384], strides = [1] : tensor<384xf32> -> !flow.dispatch.tensor<writeonly:tensor<384xf32>>
return
}
// -----// IR Dump After ExpandF16OpToF32Pass (iree-llvmcpu-expand-f16-op-to-f32) //----- //
func.func @main_graph$async_dispatch_2_unpack_elementwise_384_f32() {
%c0 = arith.constant 0 : index
%c98304 = arith.constant 98304 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c98304) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<384xf32>>
%3 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0, 0], sizes = [1, 4, 24, 16, 16], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>> -> tensor<1x4x24x16x16xf32>
%4 = tensor.empty() : tensor<384xf32>
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 1, 384], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>> -> tensor<384xf32>
%6 = tensor.empty() : tensor<1x50x384xf32>
%unpack = tensor.unpack %3 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 16] into %6 : tensor<1x4x24x16x16xf32> -> tensor<1x50x384xf32>
%extracted_slice = tensor.extract_slice %unpack[0, 0, 0] [1, 1, 384] [1, 1, 1] : tensor<1x50x384xf32> to tensor<384xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%extracted_slice, %5 : tensor<384xf32>, tensor<384xf32>) outs(%4 : tensor<384xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%8 = arith.addf %in, %in_0 : f32
linalg.yield %8 : f32
} -> tensor<384xf32>
flow.dispatch.tensor.store %7, %2, offsets = [0], sizes = [384], strides = [1] : tensor<384xf32> -> !flow.dispatch.tensor<writeonly:tensor<384xf32>>
return
}
// -----// IR Dump After CPUMaterializeDeviceEncodingPass (iree-codegen-cpu-materialize-device-encoding) //----- //
func.func @main_graph$async_dispatch_2_unpack_elementwise_384_f32() {
%c0 = arith.constant 0 : index
%c98304 = arith.constant 98304 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c98304) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<384xf32>>
%3 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0, 0], sizes = [1, 4, 24, 16, 16], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>> -> tensor<1x4x24x16x16xf32>
%4 = tensor.empty() : tensor<384xf32>
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 1, 384], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>> -> tensor<384xf32>
%6 = tensor.empty() : tensor<1x50x384xf32>
%unpack = tensor.unpack %3 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 16] into %6 : tensor<1x4x24x16x16xf32> -> tensor<1x50x384xf32>
%extracted_slice = tensor.extract_slice %unpack[0, 0, 0] [1, 1, 384] [1, 1, 1] : tensor<1x50x384xf32> to tensor<384xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%extracted_slice, %5 : tensor<384xf32>, tensor<384xf32>) outs(%4 : tensor<384xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%8 = arith.addf %in, %in_0 : f32
linalg.yield %8 : f32
} -> tensor<384xf32>
flow.dispatch.tensor.store %7, %2, offsets = [0], sizes = [384], strides = [1] : tensor<384xf32> -> !flow.dispatch.tensor<writeonly:tensor<384xf32>>
return
}
// -----// IR Dump After EraseHALDescriptorTypeFromMemRefPass (iree-codegen-erase-hal-descriptor-type-from-memref) //----- //
func.func @main_graph$async_dispatch_2_unpack_elementwise_384_f32() {
%c0 = arith.constant 0 : index
%c98304 = arith.constant 98304 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c98304) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<384xf32>>
%3 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0, 0], sizes = [1, 4, 24, 16, 16], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>> -> tensor<1x4x24x16x16xf32>
%4 = tensor.empty() : tensor<384xf32>
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 1, 384], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>> -> tensor<384xf32>
%6 = tensor.empty() : tensor<1x50x384xf32>
%unpack = tensor.unpack %3 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 16] into %6 : tensor<1x4x24x16x16xf32> -> tensor<1x50x384xf32>
%extracted_slice = tensor.extract_slice %unpack[0, 0, 0] [1, 1, 384] [1, 1, 1] : tensor<1x50x384xf32> to tensor<384xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%extracted_slice, %5 : tensor<384xf32>, tensor<384xf32>) outs(%4 : tensor<384xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%8 = arith.addf %in, %in_0 : f32
linalg.yield %8 : f32
} -> tensor<384xf32>
flow.dispatch.tensor.store %7, %2, offsets = [0], sizes = [384], strides = [1] : tensor<384xf32> -> !flow.dispatch.tensor<writeonly:tensor<384xf32>>
return
}
// -----// IR Dump After LLVMCPUSelectLoweringStrategyPass (iree-llvmcpu-select-lowering-strategy) //----- //
module {
func.func @main_graph$async_dispatch_2_unpack_elementwise_384_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert>} {
%c0 = arith.constant 0 : index
%c98304 = arith.constant 98304 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c98304) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<384xf32>>
%3 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0, 0], sizes = [1, 4, 24, 16, 16], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>> -> tensor<1x4x24x16x16xf32>
%4 = tensor.empty() : tensor<384xf32>
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 1, 384], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>> -> tensor<384xf32>
%6 = tensor.empty() : tensor<1x50x384xf32>
%unpack = tensor.unpack %3 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 16] into %6 {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[384, 0, 0], [16, 0, 0], [0, 0, 0], [0, 0, 0]]>} : tensor<1x4x24x16x16xf32> -> tensor<1x50x384xf32>
%extracted_slice = tensor.extract_slice %unpack[0, 0, 0] [1, 1, 384] [1, 1, 1] : tensor<1x50x384xf32> to tensor<384xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%extracted_slice, %5 : tensor<384xf32>, tensor<384xf32>) outs(%4 : tensor<384xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[384], [16], [0], [0]]>} {
^bb0(%in: f32, %in_0: f32, %out: f32):
%8 = arith.addf %in, %in_0 : f32
linalg.yield %8 : f32
} -> tensor<384xf32>
flow.dispatch.tensor.store %7, %2, offsets = [0], sizes = [384], strides = [1] : tensor<384xf32> -> !flow.dispatch.tensor<writeonly:tensor<384xf32>>
return
}
}
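// NOTE: LLVMCPUSelectLoweringStrategyPass is the first pass that annotates
// the dispatch body: it selects the CPUDoubleTilingExpert pipeline
// (translation_info on the function) and attaches lowering_config tile
// sizes to both ops -- [[384, 0, 0], [16, 0, 0], ...] on the tensor.unpack
// and [[384], [16], ...] on the linalg.generic. By this pipeline's usual
// convention, the first level (384, the full extent) is the workgroup
// distribution tile and the second (16) is the inner/vector-level tile.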
// -----// IR Dump After ConfigureTargetExecutableVariantsPass (iree-hal-configure-target-executable-variants) //----- //
hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+prfchw,-cldemote,+avx,+aes,+sahf,+pclmul,-xop,+crc32,-amx-fp8,+xsaves,-avx512fp16,-usermsr,-sm4,-egpr,+sse4.1,+avx512ifma,+xsave,+sse4.2,-tsxldtrk,-sm3,-ptwrite,-widekl,-movrs,+invpcid,+64bit,+xsavec,-avx10.1-512,+avx512vpopcntdq,+cmov,-avx512vp2intersect,+avx512cd,+movbe,-avxvnniint8,-ccmp,-amx-int8,-kl,-avx10.1-256,+evex512,-avxvnni,-rtm,+adx,+avx2,-hreset,-movdiri,-serialize,-sha512,+vpclmulqdq,+avx512vl,-uintr,-cf,+clflushopt,-raoint,-cmpccxadd,+bmi,-amx-tile,+sse,-avx10.2-256,+gfni,-avxvnniint16,-amx-fp16,-zu,-ndd,+xsaveopt,+rdrnd,+avx512f,-amx-bf16,+avx512bf16,+avx512vnni,-push2pop2,+cx8,+avx512bw,+sse3,+pku,-nf,-amx-tf32,-amx-avx512,+fsgsbase,+clzero,+mwaitx,-lwp,+lzcnt,+sha,-movdir64b,-ppx,+wbnoinvd,-enqcmd,-amx-transpose,-avx10.2-512,-avxneconvert,-tbm,-pconfig,-amx-complex,+ssse3,+cx16,+bmi2,+fma,+popcnt,-avxifma,+f16c,+avx512bitalg,+rdpru,+clwb,+mmx,+sse2,+rdseed,+avx512vbmi2,-prefetchi,-amx-movrs,+rdpid,-fma4,+avx512vbmi,+shstk,+vaes,-waitpkg,-sgx,+fxsr,+avx512dq,+sse4a", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>) {
hal.executable.export public @main_graph$async_dispatch_2_unpack_elementwise_384_f32 ordinal(0) layout(#hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @main_graph$async_dispatch_2_unpack_elementwise_384_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert>} {
%c0 = arith.constant 0 : index
%c98304 = arith.constant 98304 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c98304) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<384xf32>>
%3 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0, 0], sizes = [1, 4, 24, 16, 16], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>> -> tensor<1x4x24x16x16xf32>
%4 = tensor.empty() : tensor<384xf32>
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 1, 384], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>> -> tensor<384xf32>
%6 = tensor.empty() : tensor<1x50x384xf32>
%unpack = tensor.unpack %3 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 16] into %6 {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[384, 0, 0], [16, 0, 0], [0, 0, 0], [0, 0, 0]]>} : tensor<1x4x24x16x16xf32> -> tensor<1x50x384xf32>
%extracted_slice = tensor.extract_slice %unpack[0, 0, 0] [1, 1, 384] [1, 1, 1] : tensor<1x50x384xf32> to tensor<384xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%extracted_slice, %5 : tensor<384xf32>, tensor<384xf32>) outs(%4 : tensor<384xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[384], [16], [0], [0]]>} {
^bb0(%in: f32, %in_0: f32, %out: f32):
%8 = arith.addf %in, %in_0 : f32
linalg.yield %8 : f32
} -> tensor<384xf32>
flow.dispatch.tensor.store %7, %2, offsets = [0], sizes = [384], strides = [1] : tensor<384xf32> -> !flow.dispatch.tensor<writeonly:tensor<384xf32>>
return
}
}
}
// -----// IR Dump After ConfigureExecutablesPass (iree-hal-configure-executables) //----- //
hal.executable public @main_graph$async_dispatch_2 {
hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+prfchw,-cldemote,+avx,+aes,+sahf,+pclmul,-xop,+crc32,-amx-fp8,+xsaves,-avx512fp16,-usermsr,-sm4,-egpr,+sse4.1,+avx512ifma,+xsave,+sse4.2,-tsxldtrk,-sm3,-ptwrite,-widekl,-movrs,+invpcid,+64bit,+xsavec,-avx10.1-512,+avx512vpopcntdq,+cmov,-avx512vp2intersect,+avx512cd,+movbe,-avxvnniint8,-ccmp,-amx-int8,-kl,-avx10.1-256,+evex512,-avxvnni,-rtm,+adx,+avx2,-hreset,-movdiri,-serialize,-sha512,+vpclmulqdq,+avx512vl,-uintr,-cf,+clflushopt,-raoint,-cmpccxadd,+bmi,-amx-tile,+sse,-avx10.2-256,+gfni,-avxvnniint16,-amx-fp16,-zu,-ndd,+xsaveopt,+rdrnd,+avx512f,-amx-bf16,+avx512bf16,+avx512vnni,-push2pop2,+cx8,+avx512bw,+sse3,+pku,-nf,-amx-tf32,-amx-avx512,+fsgsbase,+clzero,+mwaitx,-lwp,+lzcnt,+sha,-movdir64b,-ppx,+wbnoinvd,-enqcmd,-amx-transpose,-avx10.2-512,-avxneconvert,-tbm,-pconfig,-amx-complex,+ssse3,+cx16,+bmi2,+fma,+popcnt,-avxifma,+f16c,+avx512bitalg,+rdpru,+clwb,+mmx,+sse2,+rdseed,+avx512vbmi2,-prefetchi,-amx-movrs,+rdpid,-fma4,+avx512vbmi,+shstk,+vaes,-waitpkg,-sgx,+fxsr,+avx512dq,+sse4a", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>) {
hal.executable.export public @main_graph$async_dispatch_2_unpack_elementwise_384_f32 ordinal(0) layout(#hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @main_graph$async_dispatch_2_unpack_elementwise_384_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert>} {
%c0 = arith.constant 0 : index
%c98304 = arith.constant 98304 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c98304) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<384xf32>>
%3 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0, 0], sizes = [1, 4, 24, 16, 16], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>> -> tensor<1x4x24x16x16xf32>
%4 = tensor.empty() : tensor<384xf32>
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 1, 384], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>> -> tensor<384xf32>
%6 = tensor.empty() : tensor<1x50x384xf32>
%unpack = tensor.unpack %3 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 16] into %6 {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[384, 0, 0], [16, 0, 0], [0, 0, 0], [0, 0, 0]]>} : tensor<1x4x24x16x16xf32> -> tensor<1x50x384xf32>
%extracted_slice = tensor.extract_slice %unpack[0, 0, 0] [1, 1, 384] [1, 1, 1] : tensor<1x50x384xf32> to tensor<384xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%extracted_slice, %5 : tensor<384xf32>, tensor<384xf32>) outs(%4 : tensor<384xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[384], [16], [0], [0]]>} {
^bb0(%in: f32, %in_0: f32, %out: f32):
%8 = arith.addf %in, %in_0 : f32
linalg.yield %8 : f32
} -> tensor<384xf32>
flow.dispatch.tensor.store %7, %2, offsets = [0], sizes = [384], strides = [1] : tensor<384xf32> -> !flow.dispatch.tensor<writeonly:tensor<384xf32>>
return
}
}
}
}
// -----// IR Dump After LowerExecutableUsingTransformDialectPass (iree-codegen-lower-executable-using-transform-dialect) //----- //
module {
func.func @main_graph$async_dispatch_2_unpack_elementwise_384_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert>} {
%c0 = arith.constant 0 : index
%c98304 = arith.constant 98304 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c98304) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<384xf32>>
%3 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0, 0], sizes = [1, 4, 24, 16, 16], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>> -> tensor<1x4x24x16x16xf32>
%4 = tensor.empty() : tensor<384xf32>
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 1, 384], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>> -> tensor<384xf32>
%6 = tensor.empty() : tensor<1x50x384xf32>
%unpack = tensor.unpack %3 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 16] into %6 {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[384, 0, 0], [16, 0, 0], [0, 0, 0], [0, 0, 0]]>} : tensor<1x4x24x16x16xf32> -> tensor<1x50x384xf32>
%extracted_slice = tensor.extract_slice %unpack[0, 0, 0] [1, 1, 384] [1, 1, 1] : tensor<1x50x384xf32> to tensor<384xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%extracted_slice, %5 : tensor<384xf32>, tensor<384xf32>) outs(%4 : tensor<384xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[384], [16], [0], [0]]>} {
^bb0(%in: f32, %in_0: f32, %out: f32):
%8 = arith.addf %in, %in_0 : f32
linalg.yield %8 : f32
} -> tensor<384xf32>
flow.dispatch.tensor.store %7, %2, offsets = [0], sizes = [384], strides = [1] : tensor<384xf32> -> !flow.dispatch.tensor<writeonly:tensor<384xf32>>
return
}
}
// -----// IR Dump After TileAndDistributeToWorkgroupsUsingForallOpPass (iree-codegen-tile-and-distribute-to-workgroups-using-forall-op) //----- //
func.func @main_graph$async_dispatch_2_unpack_elementwise_384_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert>} {
%c0 = arith.constant 0 : index
%c98304 = arith.constant 98304 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c98304) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<384xf32>>
%3 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0, 0], sizes = [1, 4, 24, 16, 16], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>> -> tensor<1x4x24x16x16xf32>
%4 = tensor.empty() : tensor<384xf32>
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 1, 384], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>> -> tensor<384xf32>
%6 = tensor.empty() : tensor<1x50x384xf32>
%unpack = tensor.unpack %3 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 16] into %6 {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[384, 0, 0], [16, 0, 0], [0, 0, 0], [0, 0, 0]]>} : tensor<1x4x24x16x16xf32> -> tensor<1x50x384xf32>
%extracted_slice = tensor.extract_slice %unpack[0, 0, 0] [1, 1, 384] [1, 1, 1] : tensor<1x50x384xf32> to tensor<384xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%extracted_slice, %5 : tensor<384xf32>, tensor<384xf32>) outs(%4 : tensor<384xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[384], [16], [0], [0]]>} {
^bb0(%in: f32, %in_0: f32, %out: f32):
%8 = arith.addf %in, %in_0 : f32
linalg.yield %8 : f32
} -> tensor<384xf32>
flow.dispatch.tensor.store %7, %2, offsets = [0], sizes = [384], strides = [1] : tensor<384xf32> -> !flow.dispatch.tensor<writeonly:tensor<384xf32>>
return
}
// -----// IR Dump After ConfigTrackingCanonicalizerPass (iree-codegen-config-tracking-canonicalize) //----- //
// -----// IR Dump After CSE (cse) //----- //
// -----// IR Dump After FuseTensorPadWithConsumerPass (iree-codegen-fuse-tensor-pad-with-consumer) //----- //
// -----// IR Dump After ConcretizePadResultShapePass (iree-codegen-concretize-pad-result-shape) //----- //
// (These four passes made no changes; their dumps were verbatim copies of the preceding function and are elided.)
// -----// IR Dump After LLVMCPUTileAndFusePass (iree-llvmcpu-tile-and-fuse) //----- //
func.func @main_graph$async_dispatch_2_unpack_elementwise_384_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert>} {
%c16 = arith.constant 16 : index
%c384 = arith.constant 384 : index
%c0 = arith.constant 0 : index
%c98304 = arith.constant 98304 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c98304) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<384xf32>>
%3 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0, 0], sizes = [1, 4, 24, 16, 16], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>> -> tensor<1x4x24x16x16xf32>
%4 = tensor.empty() : tensor<384xf32>
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 1, 384], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>> -> tensor<384xf32>
%6 = tensor.empty() : tensor<1x50x384xf32>
%unpack = tensor.unpack %3 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 16] into %6 {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[384, 0, 0], [16, 0, 0], [0, 0, 0], [0, 0, 0]]>} : tensor<1x4x24x16x16xf32> -> tensor<1x50x384xf32>
%extracted_slice = tensor.extract_slice %unpack[0, 0, 0] [1, 1, 384] [1, 1, 1] : tensor<1x50x384xf32> to tensor<384xf32>
%7 = scf.for %arg0 = %c0 to %c384 step %c16 iter_args(%arg1 = %4) -> (tensor<384xf32>) {
%extracted_slice_0 = tensor.extract_slice %extracted_slice[%arg0] [16] [1] : tensor<384xf32> to tensor<16xf32>
%extracted_slice_1 = tensor.extract_slice %5[%arg0] [16] [1] : tensor<384xf32> to tensor<16xf32>
%extracted_slice_2 = tensor.extract_slice %arg1[%arg0] [16] [1] : tensor<384xf32> to tensor<16xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%extracted_slice_0, %extracted_slice_1 : tensor<16xf32>, tensor<16xf32>) outs(%extracted_slice_2 : tensor<16xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[384], [16], [0], [0]]>} {
^bb0(%in: f32, %in_3: f32, %out: f32):
%9 = arith.addf %in, %in_3 : f32
linalg.yield %9 : f32
} -> tensor<16xf32>
%inserted_slice = tensor.insert_slice %8 into %arg1[%arg0] [16] [1] : tensor<16xf32> into tensor<384xf32>
scf.yield %inserted_slice : tensor<384xf32>
}
flow.dispatch.tensor.store %7, %2, offsets = [0], sizes = [384], strides = [1] : tensor<384xf32> -> !flow.dispatch.tensor<writeonly:tensor<384xf32>>
return
}
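// Note (illustrative sketch, not compiler output): LLVMCPUTileAndFuse applies the
// inner tile size 16 from the lowering_config, rewriting the 384-element elementwise
// op as an scf.for with 24 iterations over 16-element slices. A minimal standalone
// form of that loop pattern, using the named op linalg.add as a stand-in for the
// linalg.generic above (function and value names are hypothetical):
func.func @tiled_add_sketch(%a: tensor<384xf32>, %b: tensor<384xf32>) -> tensor<384xf32> {
  %c0 = arith.constant 0 : index
  %c16 = arith.constant 16 : index
  %c384 = arith.constant 384 : index
  %init = tensor.empty() : tensor<384xf32>
  // 24 iterations: i = 0, 16, 32, ..., 368.
  %res = scf.for %i = %c0 to %c384 step %c16 iter_args(%acc = %init) -> (tensor<384xf32>) {
    %sa = tensor.extract_slice %a[%i] [16] [1] : tensor<384xf32> to tensor<16xf32>
    %sb = tensor.extract_slice %b[%i] [16] [1] : tensor<384xf32> to tensor<16xf32>
    %so = tensor.extract_slice %acc[%i] [16] [1] : tensor<384xf32> to tensor<16xf32>
    %sum = linalg.add ins(%sa, %sb : tensor<16xf32>, tensor<16xf32>) outs(%so : tensor<16xf32>) -> tensor<16xf32>
    %upd = tensor.insert_slice %sum into %acc[%i] [16] [1] : tensor<16xf32> into tensor<384xf32>
    scf.yield %upd : tensor<384xf32>
  }
  return %res : tensor<384xf32>
}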
// -----// IR Dump After FuseTensorPadWithConsumerPass (iree-codegen-fuse-tensor-pad-with-consumer) //----- //
// -----// IR Dump After ConcretizePadResultShapePass (iree-codegen-concretize-pad-result-shape) //----- //
// -----// IR Dump After LLVMCPUSplitReductionPass (iree-llvmcpu-split-reduction) //----- //
// -----// IR Dump After LLVMCPUTileRootAndFuseProducerConsumerPass (iree-llvmcpu-tile-root-and-fuse-producer-consumer) //----- //
// -----// IR Dump After LLVMCPUTileAndFusePass (iree-llvmcpu-tile-and-fuse) //----- //
// -----// IR Dump After FuseTensorPadWithConsumerPass (iree-codegen-fuse-tensor-pad-with-consumer) //----- //
// -----// IR Dump After ConcretizePadResultShapePass (iree-codegen-concretize-pad-result-shape) //----- //
// -----// IR Dump After TensorToVectorVectorizePadPass (iree-codegen-vectorize-tensor-pad) //----- //
// (These eight passes made no changes; their dumps were verbatim copies of the preceding tiled function and are elided.)
// -----// IR Dump After GenericVectorizationPass (iree-codegen-generic-vectorization) //----- //
func.func @main_graph$async_dispatch_2_unpack_elementwise_384_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert>} {
%cst = arith.constant 0.000000e+00 : f32
%c16 = arith.constant 16 : index
%c384 = arith.constant 384 : index
%c0 = arith.constant 0 : index
%c98304 = arith.constant 98304 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c98304) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<384xf32>>
%3 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0, 0], sizes = [1, 4, 24, 16, 16], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>> -> tensor<1x4x24x16x16xf32>
%4 = tensor.empty() : tensor<384xf32>
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 1, 384], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>> -> tensor<384xf32>
%6 = vector.transfer_read %3[%c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true]} : tensor<1x4x24x16x16xf32>, vector<1x4x24x16x16xf32>
%7 = vector.transpose %6, [0, 1, 3, 2, 4] : vector<1x4x24x16x16xf32> to vector<1x4x16x24x16xf32>
%8 = vector.shape_cast %7 : vector<1x4x16x24x16xf32> to vector<1x64x384xf32>
%9 = tensor.empty() : tensor<1x50x384xf32>
%10 = vector.transfer_write %8, %9[%c0, %c0, %c0] {in_bounds = [true, false, true]} : vector<1x64x384xf32>, tensor<1x50x384xf32>
%extracted_slice = tensor.extract_slice %10[0, 0, 0] [1, 1, 384] [1, 1, 1] : tensor<1x50x384xf32> to tensor<384xf32>
%11 = scf.for %arg0 = %c0 to %c384 step %c16 iter_args(%arg1 = %4) -> (tensor<384xf32>) {
%extracted_slice_0 = tensor.extract_slice %extracted_slice[%arg0] [16] [1] : tensor<384xf32> to tensor<16xf32>
%extracted_slice_1 = tensor.extract_slice %5[%arg0] [16] [1] : tensor<384xf32> to tensor<16xf32>
%extracted_slice_2 = tensor.extract_slice %arg1[%arg0] [16] [1] : tensor<384xf32> to tensor<16xf32>
%12 = vector.transfer_read %extracted_slice_0[%c0], %cst {in_bounds = [true]} : tensor<16xf32>, vector<16xf32>
%13 = vector.transfer_read %extracted_slice_1[%c0], %cst {in_bounds = [true]} : tensor<16xf32>, vector<16xf32>
%14 = arith.addf %12, %13 : vector<16xf32>
%15 = vector.transfer_write %14, %extracted_slice_2[%c0] {in_bounds = [true]} : vector<16xf32>, tensor<16xf32>
%inserted_slice = tensor.insert_slice %15 into %arg1[%arg0] [16] [1] : tensor<16xf32> into tensor<384xf32>
scf.yield %inserted_slice : tensor<384xf32>
}
flow.dispatch.tensor.store %11, %2, offsets = [0], sizes = [384], strides = [1] : tensor<384xf32> -> !flow.dispatch.tensor<writeonly:tensor<384xf32>>
return
}
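// Note (illustrative sketch, not compiler output): GenericVectorization turns the
// unpack into a whole-tensor vector.transfer_read, a vector.transpose with
// permutation [0, 1, 3, 2, 4], and a vector.shape_cast to vector<1x64x384xf32>; the
// write back is marked in_bounds = [true, false, true] because only 50 of the 64
// unpacked rows fit in the destination. The tiled elementwise body is now a single
// arith.addf on vector<16xf32>, sketched standalone below (hypothetical names):
func.func @vector_add_sketch(%a: tensor<16xf32>, %b: tensor<16xf32>, %out: tensor<16xf32>) -> tensor<16xf32> {
  %c0 = arith.constant 0 : index
  %pad = arith.constant 0.000000e+00 : f32
  %va = vector.transfer_read %a[%c0], %pad {in_bounds = [true]} : tensor<16xf32>, vector<16xf32>
  %vb = vector.transfer_read %b[%c0], %pad {in_bounds = [true]} : tensor<16xf32>, vector<16xf32>
  %vsum = arith.addf %va, %vb : vector<16xf32>
  %w = vector.transfer_write %vsum, %out[%c0] {in_bounds = [true]} : vector<16xf32>, tensor<16xf32>
  return %w : tensor<16xf32>
}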
// -----// IR Dump After OptimizeTensorInsertExtractSlicesPass (iree-codegen-optimize-tensor-insert-extract-slices) //----- //
func.func @main_graph$async_dispatch_2_unpack_elementwise_384_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert>} {
%cst = arith.constant 0.000000e+00 : f32
%c16 = arith.constant 16 : index
%c384 = arith.constant 384 : index
%c0 = arith.constant 0 : index
%c98304 = arith.constant 98304 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c98304) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<384xf32>>
%3 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0, 0], sizes = [1, 4, 24, 16, 16], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>> -> tensor<1x4x24x16x16xf32>
%4 = tensor.empty() : tensor<384xf32>
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 1, 384], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>> -> tensor<384xf32>
%6 = vector.transfer_read %3[%c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true]} : tensor<1x4x24x16x16xf32>, vector<1x4x24x16x16xf32>
%7 = vector.transpose %6, [0, 1, 3, 2, 4] : vector<1x4x24x16x16xf32> to vector<1x4x16x24x16xf32>
%8 = vector.shape_cast %7 : vector<1x4x16x24x16xf32> to vector<1x64x384xf32>
%9 = tensor.empty() : tensor<1x50x384xf32>
%10 = vector.transfer_write %8, %9[%c0, %c0, %c0] {in_bounds = [true, false, true]} : vector<1x64x384xf32>, tensor<1x50x384xf32>
%11 = scf.for %arg0 = %c0 to %c384 step %c16 iter_args(%arg1 = %4) -> (tensor<384xf32>) {
%12 = vector.transfer_read %10[%c0, %c0, %arg0], %cst {in_bounds = [true]} : tensor<1x50x384xf32>, vector<16xf32>
%13 = vector.transfer_read %5[%arg0], %cst {in_bounds = [true]} : tensor<384xf32>, vector<16xf32>
%14 = arith.addf %12, %13 : vector<16xf32>
%15 = vector.transfer_write %14, %arg1[%arg0] {in_bounds = [true]} : vector<16xf32>, tensor<384xf32>
scf.yield %15 : tensor<384xf32>
}
flow.dispatch.tensor.store %11, %2, offsets = [0], sizes = [384], strides = [1] : tensor<384xf32> -> !flow.dispatch.tensor<writeonly:tensor<384xf32>>
return
}
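// Note: compared with the previous dump, this pass folds the per-tile
// tensor.extract_slice / tensor.insert_slice pairs into the vector transfers: the
// reads now index %10 and %5 directly at [%arg0], and the write updates the
// loop-carried tensor<384xf32> in place, so the 16-element intermediate tensors
// disappear.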
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
// (No changes; the dump was a verbatim copy of the preceding function and is elided.)
// -----// IR Dump After CSE (cse) //----- //
func.func @main_graph$async_dispatch_2_unpack_elementwise_384_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert>} {
%cst = arith.constant 0.000000e+00 : f32
%c16 = arith.constant 16 : index
%c384 = arith.constant 384 : index
%c0 = arith.constant 0 : index
%c98304 = arith.constant 98304 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c98304) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<384xf32>>
%3 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0, 0], sizes = [1, 4, 24, 16, 16], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>> -> tensor<1x4x24x16x16xf32>
%4 = tensor.empty() : tensor<384xf32>
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 1, 384], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>> -> tensor<384xf32>
%6 = vector.transfer_read %3[%c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true]} : tensor<1x4x24x16x16xf32>, vector<1x4x24x16x16xf32>
%7 = vector.transpose %6, [0, 1, 3, 2, 4] : vector<1x4x24x16x16xf32> to vector<1x4x16x24x16xf32>
%8 = vector.shape_cast %7 : vector<1x4x16x24x16xf32> to vector<1x64x384xf32>
%9 = tensor.empty() : tensor<1x50x384xf32>
%10 = vector.transfer_write %8, %9[%c0, %c0, %c0] {in_bounds = [true, false, true]} : vector<1x64x384xf32>, tensor<1x50x384xf32>
%11 = scf.for %arg0 = %c0 to %c384 step %c16 iter_args(%arg1 = %4) -> (tensor<384xf32>) {
%12 = vector.transfer_read %10[%c0, %c0, %arg0], %cst {in_bounds = [true]} : tensor<1x50x384xf32>, vector<16xf32>
%13 = vector.transfer_read %5[%arg0], %cst {in_bounds = [true]} : tensor<384xf32>, vector<16xf32>
%14 = arith.addf %12, %13 : vector<16xf32>
%15 = vector.transfer_write %14, %arg1[%arg0] {in_bounds = [true]} : vector<16xf32>, tensor<384xf32>
scf.yield %15 : tensor<384xf32>
}
flow.dispatch.tensor.store %11, %2, offsets = [0], sizes = [384], strides = [1] : tensor<384xf32> -> !flow.dispatch.tensor<writeonly:tensor<384xf32>>
return
}
module_main_graph$async_dispatch_2.mlir:9:7: error: One or more operations with large vector sizes (32768 bytes) were found:
func.func @main_graph$async_dispatch_2_unpack_elementwise_384_f32() {
^
module_main_graph$async_dispatch_2.mlir:19:19: note: %6 = vector.transfer_read %3[%c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true]} : tensor<1x4x24x16x16xf32>, vector<1x4x24x16x16xf32>
%unpack = tensor.unpack %3 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 16] into %6 : tensor<1x4x24x16x16xf32> -> tensor<1x50x384xf32>
^
module_main_graph$async_dispatch_2.mlir:19:19: note: %7 = vector.transpose %6, [0, 1, 3, 2, 4] : vector<1x4x24x16x16xf32> to vector<1x4x16x24x16xf32>
module_main_graph$async_dispatch_2.mlir:19:19: note: %8 = vector.shape_cast %7 : vector<1x4x16x24x16xf32> to vector<1x64x384xf32>
module_main_graph$async_dispatch_2.mlir:19:19: note: %10 = vector.transfer_write %8, %9[%c0, %c0, %c0] {in_bounds = [true, false, true]} : vector<1x64x384xf32>, tensor<1x50x384xf32>
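// NOTE: the arithmetic behind the diagnostic: vector<1x4x24x16x16xf32> holds
// 1*4*24*16*16 = 24576 f32 elements, i.e. 98304 bytes (vector<1x64x384xf32> is
// the same size). The reported 32768-byte budget matches 512 times the 64-byte
// native_vector_size of this znver4 target, presumably the pass's default limit,
// so each of the flagged ops exceeds the budget by a factor of 3.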
// -----// IR Dump After LLVMCPUVerifyVectorSizeLegalityPass Failed (iree-llvmcpu-verify-vector-size-legality) //----- //
func.func @main_graph$async_dispatch_2_unpack_elementwise_384_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert>} {
%cst = arith.constant 0.000000e+00 : f32
%c16 = arith.constant 16 : index
%c384 = arith.constant 384 : index
%c0 = arith.constant 0 : index
%c98304 = arith.constant 98304 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c98304) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<384xf32>>
%3 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0, 0], sizes = [1, 4, 24, 16, 16], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>> -> tensor<1x4x24x16x16xf32>
%4 = tensor.empty() : tensor<384xf32>
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 1, 384], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>> -> tensor<384xf32>
%6 = vector.transfer_read %3[%c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true]} : tensor<1x4x24x16x16xf32>, vector<1x4x24x16x16xf32>
%7 = vector.transpose %6, [0, 1, 3, 2, 4] : vector<1x4x24x16x16xf32> to vector<1x4x16x24x16xf32>
%8 = vector.shape_cast %7 : vector<1x4x16x24x16xf32> to vector<1x64x384xf32>
%9 = tensor.empty() : tensor<1x50x384xf32>
%10 = vector.transfer_write %8, %9[%c0, %c0, %c0] {in_bounds = [true, false, true]} : vector<1x64x384xf32>, tensor<1x50x384xf32>
%11 = scf.for %arg0 = %c0 to %c384 step %c16 iter_args(%arg1 = %4) -> (tensor<384xf32>) {
%12 = vector.transfer_read %10[%c0, %c0, %arg0], %cst {in_bounds = [true]} : tensor<1x50x384xf32>, vector<16xf32>
%13 = vector.transfer_read %5[%arg0], %cst {in_bounds = [true]} : tensor<384xf32>, vector<16xf32>
%14 = arith.addf %12, %13 : vector<16xf32>
%15 = vector.transfer_write %14, %arg1[%arg0] {in_bounds = [true]} : vector<16xf32>, tensor<384xf32>
scf.yield %15 : tensor<384xf32>
}
flow.dispatch.tensor.store %11, %2, offsets = [0], sizes = [384], strides = [1] : tensor<384xf32> -> !flow.dispatch.tensor<writeonly:tensor<384xf32>>
return
}
// -----// IR Dump After LLVMCPULowerExecutableTargetPass Failed (iree-llvmcpu-lower-executable-target) //----- //
func.func @main_graph$async_dispatch_2_unpack_elementwise_384_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert>} {
%cst = arith.constant 0.000000e+00 : f32
%c16 = arith.constant 16 : index
%c384 = arith.constant 384 : index
%c0 = arith.constant 0 : index
%c98304 = arith.constant 98304 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c98304) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<384xf32>>
%3 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0, 0], sizes = [1, 4, 24, 16, 16], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>> -> tensor<1x4x24x16x16xf32>
%4 = tensor.empty() : tensor<384xf32>
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 1, 384], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>> -> tensor<384xf32>
%6 = vector.transfer_read %3[%c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true]} : tensor<1x4x24x16x16xf32>, vector<1x4x24x16x16xf32>
%7 = vector.transpose %6, [0, 1, 3, 2, 4] : vector<1x4x24x16x16xf32> to vector<1x4x16x24x16xf32>
%8 = vector.shape_cast %7 : vector<1x4x16x24x16xf32> to vector<1x64x384xf32>
%9 = tensor.empty() : tensor<1x50x384xf32>
%10 = vector.transfer_write %8, %9[%c0, %c0, %c0] {in_bounds = [true, false, true]} : vector<1x64x384xf32>, tensor<1x50x384xf32>
%11 = scf.for %arg0 = %c0 to %c384 step %c16 iter_args(%arg1 = %4) -> (tensor<384xf32>) {
%12 = vector.transfer_read %10[%c0, %c0, %arg0], %cst {in_bounds = [true]} : tensor<1x50x384xf32>, vector<16xf32>
%13 = vector.transfer_read %5[%arg0], %cst {in_bounds = [true]} : tensor<384xf32>, vector<16xf32>
%14 = arith.addf %12, %13 : vector<16xf32>
%15 = vector.transfer_write %14, %arg1[%arg0] {in_bounds = [true]} : vector<16xf32>, tensor<384xf32>
scf.yield %15 : tensor<384xf32>
}
flow.dispatch.tensor.store %11, %2, offsets = [0], sizes = [384], strides = [1] : tensor<384xf32> -> !flow.dispatch.tensor<writeonly:tensor<384xf32>>
return
}
module_main_graph$async_dispatch_2.mlir:2:3: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+prfchw,-cldemote,+avx,+aes,+sahf,+pclmul,-xop,+crc32,-amx-fp8,+xsaves,-avx512fp16,-usermsr,-sm4,-egpr,+sse4.1,+avx512ifma,+xsave,+sse4.2,-tsxldtrk,-sm3,-ptwrite,-widekl,-movrs,+invpcid,+64bit,+xsavec,-avx10.1-512,+avx512vpopcntdq,+cmov,-avx512vp2intersect,+avx512cd,+movbe,-avxvnniint8,-ccmp,-amx-int8,-kl,-avx10.1-256,+evex512,-avxvnni,-rtm,+adx,+avx2,-hreset,-movdiri,-serialize,-sha512,+vpclmulqdq,+avx512vl,-uintr,-cf,+clflushopt,-raoint,-cmpccxadd,+bmi,-amx-tile,+sse,-avx10.2-256,+gfni,-avxvnniint16,-amx-fp16,-zu,-ndd,+xsaveopt,+rdrnd,+avx512f,-amx-bf16,+avx512bf16,+avx512vnni,-push2pop2,+cx8,+avx512bw,+sse3,+pku,-nf,-amx-tf32,-amx-avx512,+fsgsbase,+clzero,+mwaitx,-lwp,+lzcnt,+sha,-movdir64b,-ppx,+wbnoinvd,-enqcmd,-amx-transpose,-avx10.2-512,-avxneconvert,-tbm,-pconfig,-amx-complex,+ssse3,+cx16,+bmi2,+fma,+popcnt,-avxifma,+f16c,+avx512bitalg,+rdpru,+clwb,+mmx,+sse2,+rdseed,+avx512vbmi2,-prefetchi,-amx-movrs,+rdpid,-fma4,+avx512vbmi,+shstk,+vaes,-waitpkg,-sgx,+fxsr,+avx512dq,+sse4a", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+prfchw,-cldemote,+avx,+aes,+sahf,+pclmul,-xop,+crc32,-amx-fp8,+xsaves,-avx512fp16,-usermsr,-sm4,-egpr,+sse4.1,+avx512ifma,+xsave,+sse4.2,-tsxldtrk,-sm3,-ptwrite,-widekl,-movrs,+invpcid,+64bit,+xsavec,-avx10.1-512,+avx512vpopcntdq,+cmov,-avx512vp2intersect,+avx512cd,+movbe,-avxvnniint8,-ccmp,-amx-int8,-kl,-avx10.1-256,+evex512,-avxvnni,-rtm,+adx,+avx2,-hreset,-movdiri,-serialize,-sha512,+vpclmulqdq,+avx512vl,-uintr,-cf,+clflushopt,-raoint,-cmpccxadd,+bmi,-amx-tile,+sse,-avx10.2-256,+gfni,-avxvnniint16,-amx-fp16,-zu,-ndd,+xsaveopt,+rdrnd,+avx512f,-amx-bf16,+avx512bf16,+avx512vnni,-push2pop2,+cx8,+avx512bw,+sse3,+pku,-nf,-amx-tf32,-amx-avx512,+fsgsbase,+clzero,+mwaitx,-lwp,+lzcnt,+sha,-movdir64b,-ppx,+wbnoinvd,-enqcmd,-amx-transpose,-avx10.2-512,-avxneconvert,-tbm,-pconfig,-amx-complex,+ssse3,+cx16,+bmi2,+fma,+popcnt,-avxifma,+f16c,+avx512bitalg,+rdpru,+clwb,+mmx,+sse2,+rdseed,+avx512vbmi2,-prefetchi,-amx-movrs,+rdpid,-fma4,+avx512vbmi,+shstk,+vaes,-waitpkg,-sgx,+fxsr,+avx512dq,+sse4a", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>) {
^
module_main_graph$async_dispatch_2.mlir:2:3: note: see current operation:
"hal.executable.variant"() ({
"hal.executable.export"() ({
^bb0(%arg2: !hal.device):
%21:3 = "flow.dispatch.workgroup_count_from_slice"() : () -> (index, index, index)
"hal.return"(%21#0, %21#1, %21#2) : (index, index, index) -> ()
}) {layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index, sym_name = "main_graph$async_dispatch_2_unpack_elementwise_384_f32"} : () -> ()
"builtin.module"() ({
"func.func"() <{function_type = () -> (), sym_name = "main_graph$async_dispatch_2_unpack_elementwise_384_f32"}> ({
%0 = "arith.constant"() <{value = 0.000000e+00 : f32}> : () -> f32
%1 = "arith.constant"() <{value = 16 : index}> : () -> index
%2 = "arith.constant"() <{value = 384 : index}> : () -> index
%3 = "arith.constant"() <{value = 0 : index}> : () -> index
%4 = "arith.constant"() <{value = 98304 : index}> : () -> index
%5 = "hal.interface.binding.subspan"(%3) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 0>} : (index) -> !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>>
%6 = "hal.interface.binding.subspan"(%4) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 0>} : (index) -> !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>>
%7 = "hal.interface.binding.subspan"(%3) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 0>} : (index) -> !flow.dispatch.tensor<writeonly:tensor<384xf32>>
%8 = "flow.dispatch.tensor.load"(%6) <{operandSegmentSizes = array<i32: 1, 0, 0, 0, 0>, static_offsets = array<i64: 0, 0, 0, 0, 0>, static_sizes = array<i64: 1, 4, 24, 16, 16>, static_strides = array<i64: 1, 1, 1, 1, 1>}> : (!flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>>) -> tensor<1x4x24x16x16xf32>
%9 = "tensor.empty"() : () -> tensor<384xf32>
%10 = "flow.dispatch.tensor.load"(%5) <{operandSegmentSizes = array<i32: 1, 0, 0, 0, 0>, static_offsets = array<i64: 0, 0, 0>, static_sizes = array<i64: 1, 1, 384>, static_strides = array<i64: 1, 1, 1>}> : (!flow.dispatch.tensor<readonly:tensor<1x50x384xf32>>) -> tensor<384xf32>
%11 = "vector.transfer_read"(%8, %3, %3, %3, %3, %3, %0) <{in_bounds = [true, true, true, true, true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)>}> : (tensor<1x4x24x16x16xf32>, index, index, index, index, index, f32) -> vector<1x4x24x16x16xf32>
%12 = "vector.transpose"(%11) <{permutation = array<i64: 0, 1, 3, 2, 4>}> : (vector<1x4x24x16x16xf32>) -> vector<1x4x16x24x16xf32>
%13 = "vector.shape_cast"(%12) : (vector<1x4x16x24x16xf32>) -> vector<1x64x384xf32>
%14 = "tensor.empty"() : () -> tensor<1x50x384xf32>
%15 = "vector.transfer_write"(%13, %14, %3, %3, %3) <{in_bounds = [true, false, true], operandSegmentSizes = array<i32: 1, 1, 3, 0>, permutation_map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>}> : (vector<1x64x384xf32>, tensor<1x50x384xf32>, index, index, index) -> tensor<1x50x384xf32>
%16 = "scf.for"(%3, %2, %1, %9) ({
^bb0(%arg0: index, %arg1: tensor<384xf32>):
%17 = "vector.transfer_read"(%15, %3, %3, %arg0, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 3, 1, 0>, permutation_map = affine_map<(d0, d1, d2) -> (d2)>}> : (tensor<1x50x384xf32>, index, index, index, f32) -> vector<16xf32>
%18 = "vector.transfer_read"(%10, %arg0, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 1, 0>, permutation_map = affine_map<(d0) -> (d0)>}> : (tensor<384xf32>, index, f32) -> vector<16xf32>
%19 = "arith.addf"(%17, %18) <{denormal = #arith.denormal<ieee>, fastmath = #arith.fastmath<none>}> : (vector<16xf32>, vector<16xf32>) -> vector<16xf32>
%20 = "vector.transfer_write"(%19, %arg1, %arg0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 1, 0>, permutation_map = affine_map<(d0) -> (d0)>}> : (vector<16xf32>, tensor<384xf32>, index) -> tensor<384xf32>
"scf.yield"(%20) : (tensor<384xf32>) -> ()
}) : (index, index, index, tensor<384xf32>) -> tensor<384xf32>
"flow.dispatch.tensor.store"(%16, %7) <{operandSegmentSizes = array<i32: 1, 1, 0, 0, 0, 0>, static_offsets = array<i64: 0>, static_sizes = array<i64: 384>, static_strides = array<i64: 1>}> : (tensor<384xf32>, !flow.dispatch.tensor<writeonly:tensor<384xf32>>) -> ()
"func.return"() : () -> ()
}) {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert>} : () -> ()
}) : () -> ()
"hal.executable.variant_end"() : () -> ()
}) {sym_name = "embedded_elf_x86_64", sym_visibility = "public", target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+prfchw,-cldemote,+avx,+aes,+sahf,+pclmul,-xop,+crc32,-amx-fp8,+xsaves,-avx512fp16,-usermsr,-sm4,-egpr,+sse4.1,+avx512ifma,+xsave,+sse4.2,-tsxldtrk,-sm3,-ptwrite,-widekl,-movrs,+invpcid,+64bit,+xsavec,-avx10.1-512,+avx512vpopcntdq,+cmov,-avx512vp2intersect,+avx512cd,+movbe,-avxvnniint8,-ccmp,-amx-int8,-kl,-avx10.1-256,+evex512,-avxvnni,-rtm,+adx,+avx2,-hreset,-movdiri,-serialize,-sha512,+vpclmulqdq,+avx512vl,-uintr,-cf,+clflushopt,-raoint,-cmpccxadd,+bmi,-amx-tile,+sse,-avx10.2-256,+gfni,-avxvnniint16,-amx-fp16,-zu,-ndd,+xsaveopt,+rdrnd,+avx512f,-amx-bf16,+avx512bf16,+avx512vnni,-push2pop2,+cx8,+avx512bw,+sse3,+pku,-nf,-amx-tf32,-amx-avx512,+fsgsbase,+clzero,+mwaitx,-lwp,+lzcnt,+sha,-movdir64b,-ppx,+wbnoinvd,-enqcmd,-amx-transpose,-avx10.2-512,-avxneconvert,-tbm,-pconfig,-amx-complex,+ssse3,+cx16,+bmi2,+fma,+popcnt,-avxifma,+f16c,+avx512bitalg,+rdpru,+clwb,+mmx,+sse2,+rdseed,+avx512vbmi2,-prefetchi,-amx-movrs,+rdpid,-fma4,+avx512vbmi,+shstk,+vaes,-waitpkg,-sgx,+fxsr,+avx512dq,+sse4a", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>} : () -> ()
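// NOTE: the "see current operation" dump above prints the same variant in MLIR's
// generic operation form (quoted op names and explicit attribute dictionaries),
// which is why the IR looks different from the pretty-printed dumps; the content
// is unchanged.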
// -----// IR Dump After TranslateTargetExecutableVariantsPass Failed (iree-hal-translate-target-executable-variants) //----- //
hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+prfchw,-cldemote,+avx,+aes,+sahf,+pclmul,-xop,+crc32,-amx-fp8,+xsaves,-avx512fp16,-usermsr,-sm4,-egpr,+sse4.1,+avx512ifma,+xsave,+sse4.2,-tsxldtrk,-sm3,-ptwrite,-widekl,-movrs,+invpcid,+64bit,+xsavec,-avx10.1-512,+avx512vpopcntdq,+cmov,-avx512vp2intersect,+avx512cd,+movbe,-avxvnniint8,-ccmp,-amx-int8,-kl,-avx10.1-256,+evex512,-avxvnni,-rtm,+adx,+avx2,-hreset,-movdiri,-serialize,-sha512,+vpclmulqdq,+avx512vl,-uintr,-cf,+clflushopt,-raoint,-cmpccxadd,+bmi,-amx-tile,+sse,-avx10.2-256,+gfni,-avxvnniint16,-amx-fp16,-zu,-ndd,+xsaveopt,+rdrnd,+avx512f,-amx-bf16,+avx512bf16,+avx512vnni,-push2pop2,+cx8,+avx512bw,+sse3,+pku,-nf,-amx-tf32,-amx-avx512,+fsgsbase,+clzero,+mwaitx,-lwp,+lzcnt,+sha,-movdir64b,-ppx,+wbnoinvd,-enqcmd,-amx-transpose,-avx10.2-512,-avxneconvert,-tbm,-pconfig,-amx-complex,+ssse3,+cx16,+bmi2,+fma,+popcnt,-avxifma,+f16c,+avx512bitalg,+rdpru,+clwb,+mmx,+sse2,+rdseed,+avx512vbmi2,-prefetchi,-amx-movrs,+rdpid,-fma4,+avx512vbmi,+shstk,+vaes,-waitpkg,-sgx,+fxsr,+avx512dq,+sse4a", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>) {
hal.executable.export public @main_graph$async_dispatch_2_unpack_elementwise_384_f32 ordinal(0) layout(#hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @main_graph$async_dispatch_2_unpack_elementwise_384_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert>} {
%cst = arith.constant 0.000000e+00 : f32
%c16 = arith.constant 16 : index
%c384 = arith.constant 384 : index
%c0 = arith.constant 0 : index
%c98304 = arith.constant 98304 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c98304) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<384xf32>>
%3 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0, 0], sizes = [1, 4, 24, 16, 16], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>> -> tensor<1x4x24x16x16xf32>
%4 = tensor.empty() : tensor<384xf32>
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 1, 384], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>> -> tensor<384xf32>
%6 = vector.transfer_read %3[%c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true]} : tensor<1x4x24x16x16xf32>, vector<1x4x24x16x16xf32>
%7 = vector.transpose %6, [0, 1, 3, 2, 4] : vector<1x4x24x16x16xf32> to vector<1x4x16x24x16xf32>
%8 = vector.shape_cast %7 : vector<1x4x16x24x16xf32> to vector<1x64x384xf32>
%9 = tensor.empty() : tensor<1x50x384xf32>
%10 = vector.transfer_write %8, %9[%c0, %c0, %c0] {in_bounds = [true, false, true]} : vector<1x64x384xf32>, tensor<1x50x384xf32>
%11 = scf.for %arg0 = %c0 to %c384 step %c16 iter_args(%arg1 = %4) -> (tensor<384xf32>) {
%12 = vector.transfer_read %10[%c0, %c0, %arg0], %cst {in_bounds = [true]} : tensor<1x50x384xf32>, vector<16xf32>
%13 = vector.transfer_read %5[%arg0], %cst {in_bounds = [true]} : tensor<384xf32>, vector<16xf32>
%14 = arith.addf %12, %13 : vector<16xf32>
%15 = vector.transfer_write %14, %arg1[%arg0] {in_bounds = [true]} : vector<16xf32>, tensor<384xf32>
scf.yield %15 : tensor<384xf32>
}
flow.dispatch.tensor.store %11, %2, offsets = [0], sizes = [384], strides = [1] : tensor<384xf32> -> !flow.dispatch.tensor<writeonly:tensor<384xf32>>
return
}
}
}
failed to translate executables
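// NOTE: no further transformation happens after the verifier failure; each
// enclosing pass (iree-llvmcpu-lower-executable-target,
// iree-hal-translate-target-executable-variants,
// iree-hal-translate-all-executables) re-reports the failure and re-dumps the
// same IR.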
// -----// IR Dump After TranslateAllExecutablesPass Failed (iree-hal-translate-all-executables) //----- //
hal.executable public @main_graph$async_dispatch_2 {
hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+prfchw,-cldemote,+avx,+aes,+sahf,+pclmul,-xop,+crc32,-amx-fp8,+xsaves,-avx512fp16,-usermsr,-sm4,-egpr,+sse4.1,+avx512ifma,+xsave,+sse4.2,-tsxldtrk,-sm3,-ptwrite,-widekl,-movrs,+invpcid,+64bit,+xsavec,-avx10.1-512,+avx512vpopcntdq,+cmov,-avx512vp2intersect,+avx512cd,+movbe,-avxvnniint8,-ccmp,-amx-int8,-kl,-avx10.1-256,+evex512,-avxvnni,-rtm,+adx,+avx2,-hreset,-movdiri,-serialize,-sha512,+vpclmulqdq,+avx512vl,-uintr,-cf,+clflushopt,-raoint,-cmpccxadd,+bmi,-amx-tile,+sse,-avx10.2-256,+gfni,-avxvnniint16,-amx-fp16,-zu,-ndd,+xsaveopt,+rdrnd,+avx512f,-amx-bf16,+avx512bf16,+avx512vnni,-push2pop2,+cx8,+avx512bw,+sse3,+pku,-nf,-amx-tf32,-amx-avx512,+fsgsbase,+clzero,+mwaitx,-lwp,+lzcnt,+sha,-movdir64b,-ppx,+wbnoinvd,-enqcmd,-amx-transpose,-avx10.2-512,-avxneconvert,-tbm,-pconfig,-amx-complex,+ssse3,+cx16,+bmi2,+fma,+popcnt,-avxifma,+f16c,+avx512bitalg,+rdpru,+clwb,+mmx,+sse2,+rdseed,+avx512vbmi2,-prefetchi,-amx-movrs,+rdpid,-fma4,+avx512vbmi,+shstk,+vaes,-waitpkg,-sgx,+fxsr,+avx512dq,+sse4a", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>) {
hal.executable.export public @main_graph$async_dispatch_2_unpack_elementwise_384_f32 ordinal(0) layout(#hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @main_graph$async_dispatch_2_unpack_elementwise_384_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert>} {
%cst = arith.constant 0.000000e+00 : f32
%c16 = arith.constant 16 : index
%c384 = arith.constant 384 : index
%c0 = arith.constant 0 : index
%c98304 = arith.constant 98304 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c98304) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<384xf32>>
%3 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0, 0], sizes = [1, 4, 24, 16, 16], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>> -> tensor<1x4x24x16x16xf32>
%4 = tensor.empty() : tensor<384xf32>
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 1, 384], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>> -> tensor<384xf32>
%6 = vector.transfer_read %3[%c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true]} : tensor<1x4x24x16x16xf32>, vector<1x4x24x16x16xf32>
%7 = vector.transpose %6, [0, 1, 3, 2, 4] : vector<1x4x24x16x16xf32> to vector<1x4x16x24x16xf32>
%8 = vector.shape_cast %7 : vector<1x4x16x24x16xf32> to vector<1x64x384xf32>
%9 = tensor.empty() : tensor<1x50x384xf32>
%10 = vector.transfer_write %8, %9[%c0, %c0, %c0] {in_bounds = [true, false, true]} : vector<1x64x384xf32>, tensor<1x50x384xf32>
%11 = scf.for %arg0 = %c0 to %c384 step %c16 iter_args(%arg1 = %4) -> (tensor<384xf32>) {
%12 = vector.transfer_read %10[%c0, %c0, %arg0], %cst {in_bounds = [true]} : tensor<1x50x384xf32>, vector<16xf32>
%13 = vector.transfer_read %5[%arg0], %cst {in_bounds = [true]} : tensor<384xf32>, vector<16xf32>
%14 = arith.addf %12, %13 : vector<16xf32>
%15 = vector.transfer_write %14, %arg1[%arg0] {in_bounds = [true]} : vector<16xf32>, tensor<384xf32>
scf.yield %15 : tensor<384xf32>
}
flow.dispatch.tensor.store %11, %2, offsets = [0], sizes = [384], strides = [1] : tensor<384xf32> -> !flow.dispatch.tensor<writeonly:tensor<384xf32>>
return
}
}
}
}
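// NOTE: root cause: the CPUDoubleTilingExpert pipeline vectorized the
// tensor.unpack at full tensor size instead of tiling it down toward the 64-byte
// native vector width. A plausible fix (not verified here) is to tile the unpack
// dispatch, or adjust its lowering configuration, so that no single vector value
// exceeds the 32768-byte budget.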