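// IR dumps for the IREE dispatch @main_graph$async_dispatch_2 (a tensor.unpack feeding
// a 384xf32 elementwise add) while compiling for llvm-cpu / znver4. A trace in this form
// is typically produced by passing --mlir-print-ir-after-all to iree-compile; the exact
// command line is not part of this gist, so treat that as an assumption.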
// -----// IR Dump After AssignLegacyTargetDevicesPass (iree-hal-assign-legacy-target-devices) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+prfchw,-cldemote,+avx,+aes,+sahf,+pclmul,-xop,+crc32,-amx-fp8,+xsaves,-avx512fp16,-usermsr,-sm4,-egpr,+sse4.1,+avx512ifma,+xsave,+sse4.2,-tsxldtrk,-sm3,-ptwrite,-widekl,-movrs,+invpcid,+64bit,+xsavec,-avx10.1-512,+avx512vpopcntdq,+cmov,-avx512vp2intersect,+avx512cd,+movbe,-avxvnniint8,-ccmp,-amx-int8,-kl,-avx10.1-256,+evex512,-avxvnni,-rtm,+adx,+avx2,-hreset,-movdiri,-serialize,-sha512,+vpclmulqdq,+avx512vl,-uintr,-cf,+clflushopt,-raoint,-cmpccxadd,+bmi,-amx-tile,+sse,-avx10.2-256,+gfni,-avxvnniint16,-amx-fp16,-zu,-ndd,+xsaveopt,+rdrnd,+avx512f,-amx-bf16,+avx512bf16,+avx512vnni,-push2pop2,+cx8,+avx512bw,+sse3,+pku,-nf,-amx-tf32,-amx-avx512,+fsgsbase,+clzero,+mwaitx,-lwp,+lzcnt,+sha,-movdir64b,-ppx,+wbnoinvd,-enqcmd,-amx-transpose,-avx10.2-512,-avxneconvert,-tbm,-pconfig,-amx-complex,+ssse3,+cx16,+bmi2,+fma,+popcnt,-avxifma,+f16c,+avx512bitalg,+rdpru,+clwb,+mmx,+sse2,+rdseed,+avx512vbmi2,-prefetchi,-amx-movrs,+rdpid,-fma4,+avx512vbmi,+shstk,+vaes,-waitpkg,-sgx,+fxsr,+avx512dq,+sse4a", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map = affine_map<(d0) -> (d0)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {hal.device.targets = [#device_target_local]} {
  hal.executable public @main_graph$async_dispatch_2 {
    hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) {
      hal.executable.export public @main_graph$async_dispatch_2_unpack_elementwise_384_f32 ordinal(0) layout(#pipeline_layout) {
      ^bb0(%arg0: !hal.device):
        %x, %y, %z = flow.dispatch.workgroup_count_from_slice
        hal.return %x, %y, %z : index, index, index
      }
      builtin.module {
        func.func @main_graph$async_dispatch_2_unpack_elementwise_384_f32() {
          %c0 = arith.constant 0 : index
          %c98304 = arith.constant 98304 : index
          %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>>
          %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c98304) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>>
          %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<384xf32>>
          %3 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0, 0], sizes = [1, 4, 24, 16, 16], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>> -> tensor<1x4x24x16x16xf32>
          %4 = tensor.empty() : tensor<384xf32>
          %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 1, 384], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>> -> tensor<384xf32>
          %6 = tensor.empty() : tensor<1x50x384xf32>
          %unpack = tensor.unpack %3 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 16] into %6 : tensor<1x4x24x16x16xf32> -> tensor<1x50x384xf32>
          %extracted_slice = tensor.extract_slice %unpack[0, 0, 0] [1, 1, 384] [1, 1, 1] : tensor<1x50x384xf32> to tensor<384xf32>
          %7 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%extracted_slice, %5 : tensor<384xf32>, tensor<384xf32>) outs(%4 : tensor<384xf32>) {
          ^bb0(%in: f32, %in_0: f32, %out: f32):
            %8 = arith.addf %in, %in_0 : f32
            linalg.yield %8 : f32
          } -> tensor<384xf32>
          flow.dispatch.tensor.store %7, %2, offsets = [0], sizes = [384], strides = [1] : tensor<384xf32> -> !flow.dispatch.tensor<writeonly:tensor<384xf32>>
          return
        }
      }
    }
  }
}
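// What this dispatch computes, as a minimal standalone sketch (an editorial illustration,
// not part of the original dump; the op mix mirrors the function above): unpack the tiled
// 1x4x24x16x16 tensor back to 1x50x384 (the 16x16 inner tiles of dims 1 and 2 cover
// 50 padded up to 64, and 24 x 16 = 384), take the first 384-element row, and add it
// elementwise to a row loaded from the other binding.
func.func @unpack_add_sketch(%packed: tensor<1x4x24x16x16xf32>, %row: tensor<384xf32>) -> tensor<384xf32> {
  %init = tensor.empty() : tensor<384xf32>
  %dest = tensor.empty() : tensor<1x50x384xf32>
  // Undo the 16x16 inner tiling of dims 1 and 2.
  %unpacked = tensor.unpack %packed outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 16] into %dest : tensor<1x4x24x16x16xf32> -> tensor<1x50x384xf32>
  // Keep only row (0, 0): a rank-reducing slice down to tensor<384xf32>.
  %slice = tensor.extract_slice %unpacked[0, 0, 0] [1, 1, 384] [1, 1, 1] : tensor<1x50x384xf32> to tensor<384xf32>
  // Elementwise add of the unpacked row and the second input.
  %sum = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%slice, %row : tensor<384xf32>, tensor<384xf32>) outs(%init : tensor<384xf32>) {
  ^bb0(%in: f32, %in_0: f32, %out: f32):
    %add = arith.addf %in, %in_0 : f32
    linalg.yield %add : f32
  } -> tensor<384xf32>
  return %sum : tensor<384xf32>
}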
// -----// IR Dump After MaterializeTargetDevicesPass (iree-hal-materialize-target-devices) //----- //
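// Relative to the previous dump, this pass replaced the hal.device.targets module
// attribute with a util.global @__device_0 (initialized to #device_target_local) plus a
// stream.affinity.default attribute referencing it.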
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+prfchw,-cldemote,+avx,+aes,+sahf,+pclmul,-xop,+crc32,-amx-fp8,+xsaves,-avx512fp16,-usermsr,-sm4,-egpr,+sse4.1,+avx512ifma,+xsave,+sse4.2,-tsxldtrk,-sm3,-ptwrite,-widekl,-movrs,+invpcid,+64bit,+xsavec,-avx10.1-512,+avx512vpopcntdq,+cmov,-avx512vp2intersect,+avx512cd,+movbe,-avxvnniint8,-ccmp,-amx-int8,-kl,-avx10.1-256,+evex512,-avxvnni,-rtm,+adx,+avx2,-hreset,-movdiri,-serialize,-sha512,+vpclmulqdq,+avx512vl,-uintr,-cf,+clflushopt,-raoint,-cmpccxadd,+bmi,-amx-tile,+sse,-avx10.2-256,+gfni,-avxvnniint16,-amx-fp16,-zu,-ndd,+xsaveopt,+rdrnd,+avx512f,-amx-bf16,+avx512bf16,+avx512vnni,-push2pop2,+cx8,+avx512bw,+sse3,+pku,-nf,-amx-tf32,-amx-avx512,+fsgsbase,+clzero,+mwaitx,-lwp,+lzcnt,+sha,-movdir64b,-ppx,+wbnoinvd,-enqcmd,-amx-transpose,-avx10.2-512,-avxneconvert,-tbm,-pconfig,-amx-complex,+ssse3,+cx16,+bmi2,+fma,+popcnt,-avxifma,+f16c,+avx512bitalg,+rdpru,+clwb,+mmx,+sse2,+rdseed,+avx512vbmi2,-prefetchi,-amx-movrs,+rdpid,-fma4,+avx512vbmi,+shstk,+vaes,-waitpkg,-sgx,+fxsr,+avx512dq,+sse4a", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map = affine_map<(d0) -> (d0)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_local
  hal.executable public @main_graph$async_dispatch_2 {
    hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) {
      hal.executable.export public @main_graph$async_dispatch_2_unpack_elementwise_384_f32 ordinal(0) layout(#pipeline_layout) {
      ^bb0(%arg0: !hal.device):
        %x, %y, %z = flow.dispatch.workgroup_count_from_slice
        hal.return %x, %y, %z : index, index, index
      }
      builtin.module {
        func.func @main_graph$async_dispatch_2_unpack_elementwise_384_f32() {
          %c0 = arith.constant 0 : index
          %c98304 = arith.constant 98304 : index
          %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>>
          %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c98304) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>>
          %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<384xf32>>
          %3 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0, 0], sizes = [1, 4, 24, 16, 16], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>> -> tensor<1x4x24x16x16xf32>
          %4 = tensor.empty() : tensor<384xf32>
          %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 1, 384], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>> -> tensor<384xf32>
          %6 = tensor.empty() : tensor<1x50x384xf32>
          %unpack = tensor.unpack %3 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 16] into %6 : tensor<1x4x24x16x16xf32> -> tensor<1x50x384xf32>
          %extracted_slice = tensor.extract_slice %unpack[0, 0, 0] [1, 1, 384] [1, 1, 1] : tensor<1x50x384xf32> to tensor<384xf32>
          %7 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%extracted_slice, %5 : tensor<384xf32>, tensor<384xf32>) outs(%4 : tensor<384xf32>) {
          ^bb0(%in: f32, %in_0: f32, %out: f32):
            %8 = arith.addf %in, %in_0 : f32
            linalg.yield %8 : f32
          } -> tensor<384xf32>
          flow.dispatch.tensor.store %7, %2, offsets = [0], sizes = [384], strides = [1] : tensor<384xf32> -> !flow.dispatch.tensor<writeonly:tensor<384xf32>>
          return
        }
      }
    }
  }
}
// -----// IR Dump After ResolveDevicePromisesPass (iree-hal-resolve-device-promises) //----- //
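// This dump and the next two (after ResolveDeviceAliases and VerifyDevices) are
// identical to the one above: those passes found nothing to rewrite in this module.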
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+prfchw,-cldemote,+avx,+aes,+sahf,+pclmul,-xop,+crc32,-amx-fp8,+xsaves,-avx512fp16,-usermsr,-sm4,-egpr,+sse4.1,+avx512ifma,+xsave,+sse4.2,-tsxldtrk,-sm3,-ptwrite,-widekl,-movrs,+invpcid,+64bit,+xsavec,-avx10.1-512,+avx512vpopcntdq,+cmov,-avx512vp2intersect,+avx512cd,+movbe,-avxvnniint8,-ccmp,-amx-int8,-kl,-avx10.1-256,+evex512,-avxvnni,-rtm,+adx,+avx2,-hreset,-movdiri,-serialize,-sha512,+vpclmulqdq,+avx512vl,-uintr,-cf,+clflushopt,-raoint,-cmpccxadd,+bmi,-amx-tile,+sse,-avx10.2-256,+gfni,-avxvnniint16,-amx-fp16,-zu,-ndd,+xsaveopt,+rdrnd,+avx512f,-amx-bf16,+avx512bf16,+avx512vnni,-push2pop2,+cx8,+avx512bw,+sse3,+pku,-nf,-amx-tf32,-amx-avx512,+fsgsbase,+clzero,+mwaitx,-lwp,+lzcnt,+sha,-movdir64b,-ppx,+wbnoinvd,-enqcmd,-amx-transpose,-avx10.2-512,-avxneconvert,-tbm,-pconfig,-amx-complex,+ssse3,+cx16,+bmi2,+fma,+popcnt,-avxifma,+f16c,+avx512bitalg,+rdpru,+clwb,+mmx,+sse2,+rdseed,+avx512vbmi2,-prefetchi,-amx-movrs,+rdpid,-fma4,+avx512vbmi,+shstk,+vaes,-waitpkg,-sgx,+fxsr,+avx512dq,+sse4a", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map = affine_map<(d0) -> (d0)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_local
  hal.executable public @main_graph$async_dispatch_2 {
    hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) {
      hal.executable.export public @main_graph$async_dispatch_2_unpack_elementwise_384_f32 ordinal(0) layout(#pipeline_layout) {
      ^bb0(%arg0: !hal.device):
        %x, %y, %z = flow.dispatch.workgroup_count_from_slice
        hal.return %x, %y, %z : index, index, index
      }
      builtin.module {
        func.func @main_graph$async_dispatch_2_unpack_elementwise_384_f32() {
          %c0 = arith.constant 0 : index
          %c98304 = arith.constant 98304 : index
          %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>>
          %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c98304) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>>
          %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<384xf32>>
          %3 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0, 0], sizes = [1, 4, 24, 16, 16], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>> -> tensor<1x4x24x16x16xf32>
          %4 = tensor.empty() : tensor<384xf32>
          %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 1, 384], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>> -> tensor<384xf32>
          %6 = tensor.empty() : tensor<1x50x384xf32>
          %unpack = tensor.unpack %3 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 16] into %6 : tensor<1x4x24x16x16xf32> -> tensor<1x50x384xf32>
          %extracted_slice = tensor.extract_slice %unpack[0, 0, 0] [1, 1, 384] [1, 1, 1] : tensor<1x50x384xf32> to tensor<384xf32>
          %7 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%extracted_slice, %5 : tensor<384xf32>, tensor<384xf32>) outs(%4 : tensor<384xf32>) {
          ^bb0(%in: f32, %in_0: f32, %out: f32):
            %8 = arith.addf %in, %in_0 : f32
            linalg.yield %8 : f32
          } -> tensor<384xf32>
          flow.dispatch.tensor.store %7, %2, offsets = [0], sizes = [384], strides = [1] : tensor<384xf32> -> !flow.dispatch.tensor<writeonly:tensor<384xf32>>
          return
        }
      }
    }
  }
}
// -----// IR Dump After ResolveDeviceAliasesPass (iree-hal-resolve-device-aliases) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+prfchw,-cldemote,+avx,+aes,+sahf,+pclmul,-xop,+crc32,-amx-fp8,+xsaves,-avx512fp16,-usermsr,-sm4,-egpr,+sse4.1,+avx512ifma,+xsave,+sse4.2,-tsxldtrk,-sm3,-ptwrite,-widekl,-movrs,+invpcid,+64bit,+xsavec,-avx10.1-512,+avx512vpopcntdq,+cmov,-avx512vp2intersect,+avx512cd,+movbe,-avxvnniint8,-ccmp,-amx-int8,-kl,-avx10.1-256,+evex512,-avxvnni,-rtm,+adx,+avx2,-hreset,-movdiri,-serialize,-sha512,+vpclmulqdq,+avx512vl,-uintr,-cf,+clflushopt,-raoint,-cmpccxadd,+bmi,-amx-tile,+sse,-avx10.2-256,+gfni,-avxvnniint16,-amx-fp16,-zu,-ndd,+xsaveopt,+rdrnd,+avx512f,-amx-bf16,+avx512bf16,+avx512vnni,-push2pop2,+cx8,+avx512bw,+sse3,+pku,-nf,-amx-tf32,-amx-avx512,+fsgsbase,+clzero,+mwaitx,-lwp,+lzcnt,+sha,-movdir64b,-ppx,+wbnoinvd,-enqcmd,-amx-transpose,-avx10.2-512,-avxneconvert,-tbm,-pconfig,-amx-complex,+ssse3,+cx16,+bmi2,+fma,+popcnt,-avxifma,+f16c,+avx512bitalg,+rdpru,+clwb,+mmx,+sse2,+rdseed,+avx512vbmi2,-prefetchi,-amx-movrs,+rdpid,-fma4,+avx512vbmi,+shstk,+vaes,-waitpkg,-sgx,+fxsr,+avx512dq,+sse4a", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map = affine_map<(d0) -> (d0)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_local
  hal.executable public @main_graph$async_dispatch_2 {
    hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) {
      hal.executable.export public @main_graph$async_dispatch_2_unpack_elementwise_384_f32 ordinal(0) layout(#pipeline_layout) {
      ^bb0(%arg0: !hal.device):
        %x, %y, %z = flow.dispatch.workgroup_count_from_slice
        hal.return %x, %y, %z : index, index, index
      }
      builtin.module {
        func.func @main_graph$async_dispatch_2_unpack_elementwise_384_f32() {
          %c0 = arith.constant 0 : index
          %c98304 = arith.constant 98304 : index
          %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>>
          %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c98304) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>>
          %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<384xf32>>
          %3 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0, 0], sizes = [1, 4, 24, 16, 16], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>> -> tensor<1x4x24x16x16xf32>
          %4 = tensor.empty() : tensor<384xf32>
          %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 1, 384], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>> -> tensor<384xf32>
          %6 = tensor.empty() : tensor<1x50x384xf32>
          %unpack = tensor.unpack %3 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 16] into %6 : tensor<1x4x24x16x16xf32> -> tensor<1x50x384xf32>
          %extracted_slice = tensor.extract_slice %unpack[0, 0, 0] [1, 1, 384] [1, 1, 1] : tensor<1x50x384xf32> to tensor<384xf32>
          %7 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%extracted_slice, %5 : tensor<384xf32>, tensor<384xf32>) outs(%4 : tensor<384xf32>) {
          ^bb0(%in: f32, %in_0: f32, %out: f32):
            %8 = arith.addf %in, %in_0 : f32
            linalg.yield %8 : f32
          } -> tensor<384xf32>
          flow.dispatch.tensor.store %7, %2, offsets = [0], sizes = [384], strides = [1] : tensor<384xf32> -> !flow.dispatch.tensor<writeonly:tensor<384xf32>>
          return
        }
      }
    }
  }
}
// -----// IR Dump After VerifyDevicesPass (iree-hal-verify-devices) //----- //
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+prfchw,-cldemote,+avx,+aes,+sahf,+pclmul,-xop,+crc32,-amx-fp8,+xsaves,-avx512fp16,-usermsr,-sm4,-egpr,+sse4.1,+avx512ifma,+xsave,+sse4.2,-tsxldtrk,-sm3,-ptwrite,-widekl,-movrs,+invpcid,+64bit,+xsavec,-avx10.1-512,+avx512vpopcntdq,+cmov,-avx512vp2intersect,+avx512cd,+movbe,-avxvnniint8,-ccmp,-amx-int8,-kl,-avx10.1-256,+evex512,-avxvnni,-rtm,+adx,+avx2,-hreset,-movdiri,-serialize,-sha512,+vpclmulqdq,+avx512vl,-uintr,-cf,+clflushopt,-raoint,-cmpccxadd,+bmi,-amx-tile,+sse,-avx10.2-256,+gfni,-avxvnniint16,-amx-fp16,-zu,-ndd,+xsaveopt,+rdrnd,+avx512f,-amx-bf16,+avx512bf16,+avx512vnni,-push2pop2,+cx8,+avx512bw,+sse3,+pku,-nf,-amx-tf32,-amx-avx512,+fsgsbase,+clzero,+mwaitx,-lwp,+lzcnt,+sha,-movdir64b,-ppx,+wbnoinvd,-enqcmd,-amx-transpose,-avx10.2-512,-avxneconvert,-tbm,-pconfig,-amx-complex,+ssse3,+cx16,+bmi2,+fma,+popcnt,-avxifma,+f16c,+avx512bitalg,+rdpru,+clwb,+mmx,+sse2,+rdseed,+avx512vbmi2,-prefetchi,-amx-movrs,+rdpid,-fma4,+avx512vbmi,+shstk,+vaes,-waitpkg,-sgx,+fxsr,+avx512dq,+sse4a", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#map = affine_map<(d0) -> (d0)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_local
  hal.executable public @main_graph$async_dispatch_2 {
    hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) {
      hal.executable.export public @main_graph$async_dispatch_2_unpack_elementwise_384_f32 ordinal(0) layout(#pipeline_layout) {
      ^bb0(%arg0: !hal.device):
        %x, %y, %z = flow.dispatch.workgroup_count_from_slice
        hal.return %x, %y, %z : index, index, index
      }
      builtin.module {
        func.func @main_graph$async_dispatch_2_unpack_elementwise_384_f32() {
          %c0 = arith.constant 0 : index
          %c98304 = arith.constant 98304 : index
          %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>>
          %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c98304) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>>
          %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<384xf32>>
          %3 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0, 0], sizes = [1, 4, 24, 16, 16], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>> -> tensor<1x4x24x16x16xf32>
          %4 = tensor.empty() : tensor<384xf32>
          %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 1, 384], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>> -> tensor<384xf32>
          %6 = tensor.empty() : tensor<1x50x384xf32>
          %unpack = tensor.unpack %3 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 16] into %6 : tensor<1x4x24x16x16xf32> -> tensor<1x50x384xf32>
          %extracted_slice = tensor.extract_slice %unpack[0, 0, 0] [1, 1, 384] [1, 1, 1] : tensor<1x50x384xf32> to tensor<384xf32>
          %7 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%extracted_slice, %5 : tensor<384xf32>, tensor<384xf32>) outs(%4 : tensor<384xf32>) {
          ^bb0(%in: f32, %in_0: f32, %out: f32):
            %8 = arith.addf %in, %in_0 : f32
            linalg.yield %8 : f32
          } -> tensor<384xf32>
          flow.dispatch.tensor.store %7, %2, offsets = [0], sizes = [384], strides = [1] : tensor<384xf32> -> !flow.dispatch.tensor<writeonly:tensor<384xf32>>
          return
        }
      }
    }
  }
}
// -----// IR Dump After TypePropagationPass (iree-codegen-type-propagation) //----- //
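// From here on the dumps come from the codegen pipeline, which prints only the dispatch
// function rather than the whole hal.executable: the attribute aliases are gone, so each
// hal.interface.binding.subspan spells out its pipeline layout inline.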
func.func @main_graph$async_dispatch_2_unpack_elementwise_384_f32() {
  %c0 = arith.constant 0 : index
  %c98304 = arith.constant 98304 : index
  %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>>
  %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c98304) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>>
  %2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<384xf32>>
  %3 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0, 0], sizes = [1, 4, 24, 16, 16], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>> -> tensor<1x4x24x16x16xf32>
  %4 = tensor.empty() : tensor<384xf32>
  %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 1, 384], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>> -> tensor<384xf32>
  %6 = tensor.empty() : tensor<1x50x384xf32>
  %unpack = tensor.unpack %3 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 16] into %6 : tensor<1x4x24x16x16xf32> -> tensor<1x50x384xf32>
  %extracted_slice = tensor.extract_slice %unpack[0, 0, 0] [1, 1, 384] [1, 1, 1] : tensor<1x50x384xf32> to tensor<384xf32>
  %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%extracted_slice, %5 : tensor<384xf32>, tensor<384xf32>) outs(%4 : tensor<384xf32>) {
  ^bb0(%in: f32, %in_0: f32, %out: f32):
    %8 = arith.addf %in, %in_0 : f32
    linalg.yield %8 : f32
  } -> tensor<384xf32>
  flow.dispatch.tensor.store %7, %2, offsets = [0], sizes = [384], strides = [1] : tensor<384xf32> -> !flow.dispatch.tensor<writeonly:tensor<384xf32>>
  return
}
// -----// IR Dump After BubbleUpOrdinalOpsPass (iree-codegen-bubble-up-ordinal-ops) //----- //
func.func @main_graph$async_dispatch_2_unpack_elementwise_384_f32() {
  %c0 = arith.constant 0 : index
  %c98304 = arith.constant 98304 : index
  %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>>
  %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c98304) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>>
  %2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<384xf32>>
  %3 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0, 0], sizes = [1, 4, 24, 16, 16], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>> -> tensor<1x4x24x16x16xf32>
  %4 = tensor.empty() : tensor<384xf32>
  %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 1, 384], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>> -> tensor<384xf32>
  %6 = tensor.empty() : tensor<1x50x384xf32>
  %unpack = tensor.unpack %3 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 16] into %6 : tensor<1x4x24x16x16xf32> -> tensor<1x50x384xf32>
  %extracted_slice = tensor.extract_slice %unpack[0, 0, 0] [1, 1, 384] [1, 1, 1] : tensor<1x50x384xf32> to tensor<384xf32>
  %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%extracted_slice, %5 : tensor<384xf32>, tensor<384xf32>) outs(%4 : tensor<384xf32>) {
  ^bb0(%in: f32, %in_0: f32, %out: f32):
    %8 = arith.addf %in, %in_0 : f32
    linalg.yield %8 : f32
  } -> tensor<384xf32>
  flow.dispatch.tensor.store %7, %2, offsets = [0], sizes = [384], strides = [1] : tensor<384xf32> -> !flow.dispatch.tensor<writeonly:tensor<384xf32>>
  return
}
// -----// IR Dump After BufferizeCopyOnlyDispatchesPass (iree-codegen-bufferize-copy-only-dispatches) //----- //
func.func @main_graph$async_dispatch_2_unpack_elementwise_384_f32() {
  %c0 = arith.constant 0 : index
  %c98304 = arith.constant 98304 : index
  %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>>
  %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c98304) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>>
  %2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<384xf32>>
  %3 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0, 0], sizes = [1, 4, 24, 16, 16], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>> -> tensor<1x4x24x16x16xf32>
  %4 = tensor.empty() : tensor<384xf32>
  %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 1, 384], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>> -> tensor<384xf32>
  %6 = tensor.empty() : tensor<1x50x384xf32>
  %unpack = tensor.unpack %3 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 16] into %6 : tensor<1x4x24x16x16xf32> -> tensor<1x50x384xf32>
  %extracted_slice = tensor.extract_slice %unpack[0, 0, 0] [1, 1, 384] [1, 1, 1] : tensor<1x50x384xf32> to tensor<384xf32>
  %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%extracted_slice, %5 : tensor<384xf32>, tensor<384xf32>) outs(%4 : tensor<384xf32>) {
  ^bb0(%in: f32, %in_0: f32, %out: f32):
    %8 = arith.addf %in, %in_0 : f32
    linalg.yield %8 : f32
  } -> tensor<384xf32>
  flow.dispatch.tensor.store %7, %2, offsets = [0], sizes = [384], strides = [1] : tensor<384xf32> -> !flow.dispatch.tensor<writeonly:tensor<384xf32>>
  return
}
// -----// IR Dump After DecomposeSoftmaxPass (iree-codegen-decompose-softmax) //----- //
func.func @main_graph$async_dispatch_2_unpack_elementwise_384_f32() {
  %c0 = arith.constant 0 : index
  %c98304 = arith.constant 98304 : index
  %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>>
  %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c98304) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>>
  %2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<384xf32>>
  %3 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0, 0], sizes = [1, 4, 24, 16, 16], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>> -> tensor<1x4x24x16x16xf32>
  %4 = tensor.empty() : tensor<384xf32>
  %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 1, 384], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>> -> tensor<384xf32>
  %6 = tensor.empty() : tensor<1x50x384xf32>
  %unpack = tensor.unpack %3 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 16] into %6 : tensor<1x4x24x16x16xf32> -> tensor<1x50x384xf32>
  %extracted_slice = tensor.extract_slice %unpack[0, 0, 0] [1, 1, 384] [1, 1, 1] : tensor<1x50x384xf32> to tensor<384xf32>
  %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%extracted_slice, %5 : tensor<384xf32>, tensor<384xf32>) outs(%4 : tensor<384xf32>) {
  ^bb0(%in: f32, %in_0: f32, %out: f32):
    %8 = arith.addf %in, %in_0 : f32
    linalg.yield %8 : f32
  } -> tensor<384xf32>
  flow.dispatch.tensor.store %7, %2, offsets = [0], sizes = [384], strides = [1] : tensor<384xf32> -> !flow.dispatch.tensor<writeonly:tensor<384xf32>>
  return
}
// -----// IR Dump After MaterializeUserConfigsPass (iree-codegen-materialize-user-configs) //----- //
module {
  func.func @main_graph$async_dispatch_2_unpack_elementwise_384_f32() {
    %c0 = arith.constant 0 : index
    %c98304 = arith.constant 98304 : index
    %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>>
    %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c98304) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>>
    %2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<384xf32>>
    %3 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0, 0], sizes = [1, 4, 24, 16, 16], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>> -> tensor<1x4x24x16x16xf32>
    %4 = tensor.empty() : tensor<384xf32>
    %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 1, 384], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>> -> tensor<384xf32>
    %6 = tensor.empty() : tensor<1x50x384xf32>
    %unpack = tensor.unpack %3 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 16] into %6 : tensor<1x4x24x16x16xf32> -> tensor<1x50x384xf32>
    %extracted_slice = tensor.extract_slice %unpack[0, 0, 0] [1, 1, 384] [1, 1, 1] : tensor<1x50x384xf32> to tensor<384xf32>
    %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%extracted_slice, %5 : tensor<384xf32>, tensor<384xf32>) outs(%4 : tensor<384xf32>) {
    ^bb0(%in: f32, %in_0: f32, %out: f32):
      %8 = arith.addf %in, %in_0 : f32
      linalg.yield %8 : f32
    } -> tensor<384xf32>
    flow.dispatch.tensor.store %7, %2, offsets = [0], sizes = [384], strides = [1] : tensor<384xf32> -> !flow.dispatch.tensor<writeonly:tensor<384xf32>>
    return
  }
}
// -----// IR Dump After RematerializeParallelOpsPass (iree-codegen-rematerialize-parallel-ops) //----- //
func.func @main_graph$async_dispatch_2_unpack_elementwise_384_f32() {
  %c0 = arith.constant 0 : index
  %c98304 = arith.constant 98304 : index
  %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>>
  %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c98304) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>>
  %2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<384xf32>>
  %3 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0, 0], sizes = [1, 4, 24, 16, 16], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>> -> tensor<1x4x24x16x16xf32>
  %4 = tensor.empty() : tensor<384xf32>
  %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 1, 384], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>> -> tensor<384xf32>
  %6 = tensor.empty() : tensor<1x50x384xf32>
  %unpack = tensor.unpack %3 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 16] into %6 : tensor<1x4x24x16x16xf32> -> tensor<1x50x384xf32>
  %extracted_slice = tensor.extract_slice %unpack[0, 0, 0] [1, 1, 384] [1, 1, 1] : tensor<1x50x384xf32> to tensor<384xf32>
  %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%extracted_slice, %5 : tensor<384xf32>, tensor<384xf32>) outs(%4 : tensor<384xf32>) {
  ^bb0(%in: f32, %in_0: f32, %out: f32):
    %8 = arith.addf %in, %in_0 : f32
    linalg.yield %8 : f32
  } -> tensor<384xf32>
  flow.dispatch.tensor.store %7, %2, offsets = [0], sizes = [384], strides = [1] : tensor<384xf32> -> !flow.dispatch.tensor<writeonly:tensor<384xf32>>
  return
}
// -----// IR Dump After ExpandF16OpToF32Pass (iree-llvmcpu-expand-f16-op-to-f32) //----- //
func.func @main_graph$async_dispatch_2_unpack_elementwise_384_f32() {
  %c0 = arith.constant 0 : index
  %c98304 = arith.constant 98304 : index
  %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>>
  %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c98304) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>>
  %2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<384xf32>>
  %3 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0, 0], sizes = [1, 4, 24, 16, 16], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>> -> tensor<1x4x24x16x16xf32>
  %4 = tensor.empty() : tensor<384xf32>
  %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 1, 384], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>> -> tensor<384xf32>
  %6 = tensor.empty() : tensor<1x50x384xf32>
  %unpack = tensor.unpack %3 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 16] into %6 : tensor<1x4x24x16x16xf32> -> tensor<1x50x384xf32>
  %extracted_slice = tensor.extract_slice %unpack[0, 0, 0] [1, 1, 384] [1, 1, 1] : tensor<1x50x384xf32> to tensor<384xf32>
  %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%extracted_slice, %5 : tensor<384xf32>, tensor<384xf32>) outs(%4 : tensor<384xf32>) {
  ^bb0(%in: f32, %in_0: f32, %out: f32):
    %8 = arith.addf %in, %in_0 : f32
    linalg.yield %8 : f32
  } -> tensor<384xf32>
  flow.dispatch.tensor.store %7, %2, offsets = [0], sizes = [384], strides = [1] : tensor<384xf32> -> !flow.dispatch.tensor<writeonly:tensor<384xf32>>
  return
}
// -----// IR Dump After CPUMaterializeDeviceEncodingPass (iree-codegen-cpu-materialize-device-encoding) //----- //
func.func @main_graph$async_dispatch_2_unpack_elementwise_384_f32() {
  %c0 = arith.constant 0 : index
  %c98304 = arith.constant 98304 : index
  %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>>
  %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c98304) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>>
  %2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<384xf32>>
  %3 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0, 0], sizes = [1, 4, 24, 16, 16], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>> -> tensor<1x4x24x16x16xf32>
  %4 = tensor.empty() : tensor<384xf32>
  %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 1, 384], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>> -> tensor<384xf32>
  %6 = tensor.empty() : tensor<1x50x384xf32>
  %unpack = tensor.unpack %3 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 16] into %6 : tensor<1x4x24x16x16xf32> -> tensor<1x50x384xf32>
  %extracted_slice = tensor.extract_slice %unpack[0, 0, 0] [1, 1, 384] [1, 1, 1] : tensor<1x50x384xf32> to tensor<384xf32>
  %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%extracted_slice, %5 : tensor<384xf32>, tensor<384xf32>) outs(%4 : tensor<384xf32>) {
  ^bb0(%in: f32, %in_0: f32, %out: f32):
    %8 = arith.addf %in, %in_0 : f32
    linalg.yield %8 : f32
  } -> tensor<384xf32>
  flow.dispatch.tensor.store %7, %2, offsets = [0], sizes = [384], strides = [1] : tensor<384xf32> -> !flow.dispatch.tensor<writeonly:tensor<384xf32>>
  return
}
// -----// IR Dump After EraseHALDescriptorTypeFromMemRefPass (iree-codegen-erase-hal-descriptor-type-from-memref) //----- //
func.func @main_graph$async_dispatch_2_unpack_elementwise_384_f32() {
  %c0 = arith.constant 0 : index
  %c98304 = arith.constant 98304 : index
  %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>>
  %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c98304) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>>
  %2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<384xf32>>
  %3 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0, 0], sizes = [1, 4, 24, 16, 16], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>> -> tensor<1x4x24x16x16xf32>
  %4 = tensor.empty() : tensor<384xf32>
  %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 1, 384], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>> -> tensor<384xf32>
  %6 = tensor.empty() : tensor<1x50x384xf32>
  %unpack = tensor.unpack %3 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 16] into %6 : tensor<1x4x24x16x16xf32> -> tensor<1x50x384xf32>
  %extracted_slice = tensor.extract_slice %unpack[0, 0, 0] [1, 1, 384] [1, 1, 1] : tensor<1x50x384xf32> to tensor<384xf32>
  %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%extracted_slice, %5 : tensor<384xf32>, tensor<384xf32>) outs(%4 : tensor<384xf32>) {
  ^bb0(%in: f32, %in_0: f32, %out: f32):
    %8 = arith.addf %in, %in_0 : f32
    linalg.yield %8 : f32
  } -> tensor<384xf32>
  flow.dispatch.tensor.store %7, %2, offsets = [0], sizes = [384], strides = [1] : tensor<384xf32> -> !flow.dispatch.tensor<writeonly:tensor<384xf32>>
  return
}
// -----// IR Dump After LLVMCPUSelectLoweringStrategyPass (iree-llvmcpu-select-lowering-strategy) //----- //
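// This is the first codegen pass here that changes the function: it selects the
// CPUDoubleTilingExpert pipeline (translation_info on the func) and attaches
// lowering_config tile sizes, [[384, 0, 0], [16, 0, 0], ...] for the unpack and
// [[384], [16], [0], [0]] for the elementwise op, i.e. one 384-wide distribution tile
// with 16-wide inner tiles.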
module {
  func.func @main_graph$async_dispatch_2_unpack_elementwise_384_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert>} {
    %c0 = arith.constant 0 : index
    %c98304 = arith.constant 98304 : index
    %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>>
    %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c98304) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>>
    %2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<384xf32>>
    %3 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0, 0], sizes = [1, 4, 24, 16, 16], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>> -> tensor<1x4x24x16x16xf32>
    %4 = tensor.empty() : tensor<384xf32>
    %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 1, 384], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>> -> tensor<384xf32>
    %6 = tensor.empty() : tensor<1x50x384xf32>
    %unpack = tensor.unpack %3 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 16] into %6 {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[384, 0, 0], [16, 0, 0], [0, 0, 0], [0, 0, 0]]>} : tensor<1x4x24x16x16xf32> -> tensor<1x50x384xf32>
    %extracted_slice = tensor.extract_slice %unpack[0, 0, 0] [1, 1, 384] [1, 1, 1] : tensor<1x50x384xf32> to tensor<384xf32>
    %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%extracted_slice, %5 : tensor<384xf32>, tensor<384xf32>) outs(%4 : tensor<384xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[384], [16], [0], [0]]>} {
    ^bb0(%in: f32, %in_0: f32, %out: f32):
      %8 = arith.addf %in, %in_0 : f32
      linalg.yield %8 : f32
    } -> tensor<384xf32>
    flow.dispatch.tensor.store %7, %2, offsets = [0], sizes = [384], strides = [1] : tensor<384xf32> -> !flow.dispatch.tensor<writeonly:tensor<384xf32>>
    return
  }
}
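// A minimal sketch of what the [[384], [16]] tile sizes mean for the 384-element
// elementwise op (an editorial illustration under assumed names, not IR produced by the
// pass): the whole 384 extent is one distribution tile, and an inner loop then steps
// through it 16 elements at a time, matching this target's 512-bit vectors
// (native_vector_size = 64 bytes = 16 x f32).
func.func @tile_sizes_sketch(%a: tensor<384xf32>, %b: tensor<384xf32>) -> tensor<384xf32> {
  %c0 = arith.constant 0 : index
  %c16 = arith.constant 16 : index
  %c384 = arith.constant 384 : index
  %init = tensor.empty() : tensor<384xf32>
  // Inner tiling loop: 384 / 16 = 24 iterations over 16-element slices.
  %result = scf.for %i = %c0 to %c384 step %c16 iter_args(%acc = %init) -> (tensor<384xf32>) {
    %sa = tensor.extract_slice %a[%i] [16] [1] : tensor<384xf32> to tensor<16xf32>
    %sb = tensor.extract_slice %b[%i] [16] [1] : tensor<384xf32> to tensor<16xf32>
    %sinit = tensor.extract_slice %acc[%i] [16] [1] : tensor<384xf32> to tensor<16xf32>
    %sum = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%sa, %sb : tensor<16xf32>, tensor<16xf32>) outs(%sinit : tensor<16xf32>) {
    ^bb0(%in: f32, %in_0: f32, %out: f32):
      %add = arith.addf %in, %in_0 : f32
      linalg.yield %add : f32
    } -> tensor<16xf32>
    %updated = tensor.insert_slice %sum into %acc[%i] [16] [1] : tensor<16xf32> into tensor<384xf32>
    scf.yield %updated : tensor<384xf32>
  }
  return %result : tensor<384xf32>
}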
// -----// IR Dump After ConfigureTargetExecutableVariantsPass (iree-hal-configure-target-executable-variants) //----- // | |
hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+prfchw,-cldemote,+avx,+aes,+sahf,+pclmul,-xop,+crc32,-amx-fp8,+xsaves,-avx512fp16,-usermsr,-sm4,-egpr,+sse4.1,+avx512ifma,+xsave,+sse4.2,-tsxldtrk,-sm3,-ptwrite,-widekl,-movrs,+invpcid,+64bit,+xsavec,-avx10.1-512,+avx512vpopcntdq,+cmov,-avx512vp2intersect,+avx512cd,+movbe,-avxvnniint8,-ccmp,-amx-int8,-kl,-avx10.1-256,+evex512,-avxvnni,-rtm,+adx,+avx2,-hreset,-movdiri,-serialize,-sha512,+vpclmulqdq,+avx512vl,-uintr,-cf,+clflushopt,-raoint,-cmpccxadd,+bmi,-amx-tile,+sse,-avx10.2-256,+gfni,-avxvnniint16,-amx-fp16,-zu,-ndd,+xsaveopt,+rdrnd,+avx512f,-amx-bf16,+avx512bf16,+avx512vnni,-push2pop2,+cx8,+avx512bw,+sse3,+pku,-nf,-amx-tf32,-amx-avx512,+fsgsbase,+clzero,+mwaitx,-lwp,+lzcnt,+sha,-movdir64b,-ppx,+wbnoinvd,-enqcmd,-amx-transpose,-avx10.2-512,-avxneconvert,-tbm,-pconfig,-amx-complex,+ssse3,+cx16,+bmi2,+fma,+popcnt,-avxifma,+f16c,+avx512bitalg,+rdpru,+clwb,+mmx,+sse2,+rdseed,+avx512vbmi2,-prefetchi,-amx-movrs,+rdpid,-fma4,+avx512vbmi,+shstk,+vaes,-waitpkg,-sgx,+fxsr,+avx512dq,+sse4a", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>) { | |
hal.executable.export public @main_graph$async_dispatch_2_unpack_elementwise_384_f32 ordinal(0) layout(#hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) { | |
^bb0(%arg0: !hal.device): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @main_graph$async_dispatch_2_unpack_elementwise_384_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert>} { | |
%c0 = arith.constant 0 : index | |
%c98304 = arith.constant 98304 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c98304) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<384xf32>> | |
%3 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0, 0], sizes = [1, 4, 24, 16, 16], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>> -> tensor<1x4x24x16x16xf32> | |
%4 = tensor.empty() : tensor<384xf32> | |
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 1, 384], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>> -> tensor<384xf32> | |
%6 = tensor.empty() : tensor<1x50x384xf32> | |
%unpack = tensor.unpack %3 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 16] into %6 {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[384, 0, 0], [16, 0, 0], [0, 0, 0], [0, 0, 0]]>} : tensor<1x4x24x16x16xf32> -> tensor<1x50x384xf32> | |
%extracted_slice = tensor.extract_slice %unpack[0, 0, 0] [1, 1, 384] [1, 1, 1] : tensor<1x50x384xf32> to tensor<384xf32> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%extracted_slice, %5 : tensor<384xf32>, tensor<384xf32>) outs(%4 : tensor<384xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[384], [16], [0], [0]]>} { | |
^bb0(%in: f32, %in_0: f32, %out: f32): | |
%8 = arith.addf %in, %in_0 : f32 | |
linalg.yield %8 : f32 | |
} -> tensor<384xf32> | |
flow.dispatch.tensor.store %7, %2, offsets = [0], sizes = [384], strides = [1] : tensor<384xf32> -> !flow.dispatch.tensor<writeonly:tensor<384xf32>> | |
return | |
} | |
} | |
} | |
// -----// IR Dump After ConfigureExecutablesPass (iree-hal-configure-executables) //----- // | |
hal.executable public @main_graph$async_dispatch_2 {
  hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+prfchw,-cldemote,+avx,+aes,+sahf,+pclmul,-xop,+crc32,-amx-fp8,+xsaves,-avx512fp16,-usermsr,-sm4,-egpr,+sse4.1,+avx512ifma,+xsave,+sse4.2,-tsxldtrk,-sm3,-ptwrite,-widekl,-movrs,+invpcid,+64bit,+xsavec,-avx10.1-512,+avx512vpopcntdq,+cmov,-avx512vp2intersect,+avx512cd,+movbe,-avxvnniint8,-ccmp,-amx-int8,-kl,-avx10.1-256,+evex512,-avxvnni,-rtm,+adx,+avx2,-hreset,-movdiri,-serialize,-sha512,+vpclmulqdq,+avx512vl,-uintr,-cf,+clflushopt,-raoint,-cmpccxadd,+bmi,-amx-tile,+sse,-avx10.2-256,+gfni,-avxvnniint16,-amx-fp16,-zu,-ndd,+xsaveopt,+rdrnd,+avx512f,-amx-bf16,+avx512bf16,+avx512vnni,-push2pop2,+cx8,+avx512bw,+sse3,+pku,-nf,-amx-tf32,-amx-avx512,+fsgsbase,+clzero,+mwaitx,-lwp,+lzcnt,+sha,-movdir64b,-ppx,+wbnoinvd,-enqcmd,-amx-transpose,-avx10.2-512,-avxneconvert,-tbm,-pconfig,-amx-complex,+ssse3,+cx16,+bmi2,+fma,+popcnt,-avxifma,+f16c,+avx512bitalg,+rdpru,+clwb,+mmx,+sse2,+rdseed,+avx512vbmi2,-prefetchi,-amx-movrs,+rdpid,-fma4,+avx512vbmi,+shstk,+vaes,-waitpkg,-sgx,+fxsr,+avx512dq,+sse4a", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>) {
    hal.executable.export public @main_graph$async_dispatch_2_unpack_elementwise_384_f32 ordinal(0) layout(#hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) {
    ^bb0(%arg0: !hal.device):
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice
      hal.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @main_graph$async_dispatch_2_unpack_elementwise_384_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert>} {
        %c0 = arith.constant 0 : index
        %c98304 = arith.constant 98304 : index
        %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>>
        %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c98304) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>>
        %2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<384xf32>>
        %3 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0, 0], sizes = [1, 4, 24, 16, 16], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>> -> tensor<1x4x24x16x16xf32>
        %4 = tensor.empty() : tensor<384xf32>
        %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 1, 384], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>> -> tensor<384xf32>
        %6 = tensor.empty() : tensor<1x50x384xf32>
        %unpack = tensor.unpack %3 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 16] into %6 {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[384, 0, 0], [16, 0, 0], [0, 0, 0], [0, 0, 0]]>} : tensor<1x4x24x16x16xf32> -> tensor<1x50x384xf32>
        %extracted_slice = tensor.extract_slice %unpack[0, 0, 0] [1, 1, 384] [1, 1, 1] : tensor<1x50x384xf32> to tensor<384xf32>
        %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%extracted_slice, %5 : tensor<384xf32>, tensor<384xf32>) outs(%4 : tensor<384xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[384], [16], [0], [0]]>} {
        ^bb0(%in: f32, %in_0: f32, %out: f32):
          %8 = arith.addf %in, %in_0 : f32
          linalg.yield %8 : f32
        } -> tensor<384xf32>
        flow.dispatch.tensor.store %7, %2, offsets = [0], sizes = [384], strides = [1] : tensor<384xf32> -> !flow.dispatch.tensor<writeonly:tensor<384xf32>>
        return
      }
    }
  }
}
// -----// IR Dump After LowerExecutableUsingTransformDialectPass (iree-codegen-lower-executable-using-transform-dialect) //----- //
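// Note: no transform-dialect strategy library is attached here, so this lowering path appears to
// be a no-op for this dispatch; from this point the dumps print only the inner module/function.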
module {
  func.func @main_graph$async_dispatch_2_unpack_elementwise_384_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert>} {
    %c0 = arith.constant 0 : index
    %c98304 = arith.constant 98304 : index
    %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>>
    %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c98304) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>>
    %2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<384xf32>>
    %3 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0, 0], sizes = [1, 4, 24, 16, 16], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>> -> tensor<1x4x24x16x16xf32>
    %4 = tensor.empty() : tensor<384xf32>
    %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 1, 384], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>> -> tensor<384xf32>
    %6 = tensor.empty() : tensor<1x50x384xf32>
    %unpack = tensor.unpack %3 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 16] into %6 {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[384, 0, 0], [16, 0, 0], [0, 0, 0], [0, 0, 0]]>} : tensor<1x4x24x16x16xf32> -> tensor<1x50x384xf32>
    %extracted_slice = tensor.extract_slice %unpack[0, 0, 0] [1, 1, 384] [1, 1, 1] : tensor<1x50x384xf32> to tensor<384xf32>
    %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%extracted_slice, %5 : tensor<384xf32>, tensor<384xf32>) outs(%4 : tensor<384xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[384], [16], [0], [0]]>} {
    ^bb0(%in: f32, %in_0: f32, %out: f32):
      %8 = arith.addf %in, %in_0 : f32
      linalg.yield %8 : f32
    } -> tensor<384xf32>
    flow.dispatch.tensor.store %7, %2, offsets = [0], sizes = [384], strides = [1] : tensor<384xf32> -> !flow.dispatch.tensor<writeonly:tensor<384xf32>>
    return
  }
}
// -----// IR Dump After TileAndDistributeToWorkgroupsUsingForallOpPass (iree-codegen-tile-and-distribute-to-workgroups-using-forall-op) //----- //
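// Note: the workgroup-level tile size (384) already covers the whole output, so presumably a
// single workgroup suffices; no scf.forall distribution loop is materialized, and the body is
// unchanged apart from dropping the module wrapper.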
func.func @main_graph$async_dispatch_2_unpack_elementwise_384_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert>} {
  %c0 = arith.constant 0 : index
  %c98304 = arith.constant 98304 : index
  %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>>
  %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c98304) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>>
  %2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<384xf32>>
  %3 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0, 0], sizes = [1, 4, 24, 16, 16], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>> -> tensor<1x4x24x16x16xf32>
  %4 = tensor.empty() : tensor<384xf32>
  %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 1, 384], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>> -> tensor<384xf32>
  %6 = tensor.empty() : tensor<1x50x384xf32>
  %unpack = tensor.unpack %3 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 16] into %6 {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[384, 0, 0], [16, 0, 0], [0, 0, 0], [0, 0, 0]]>} : tensor<1x4x24x16x16xf32> -> tensor<1x50x384xf32>
  %extracted_slice = tensor.extract_slice %unpack[0, 0, 0] [1, 1, 384] [1, 1, 1] : tensor<1x50x384xf32> to tensor<384xf32>
  %7 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%extracted_slice, %5 : tensor<384xf32>, tensor<384xf32>) outs(%4 : tensor<384xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[384], [16], [0], [0]]>} {
  ^bb0(%in: f32, %in_0: f32, %out: f32):
    %8 = arith.addf %in, %in_0 : f32
    linalg.yield %8 : f32
  } -> tensor<384xf32>
  flow.dispatch.tensor.store %7, %2, offsets = [0], sizes = [384], strides = [1] : tensor<384xf32> -> !flow.dispatch.tensor<writeonly:tensor<384xf32>>
  return
}
// -----// IR Dump After ConfigTrackingCanonicalizerPass (iree-codegen-config-tracking-canonicalize) //----- //
// (function IR unchanged from the previous dump; duplicate elided)
// -----// IR Dump After CSE (cse) //----- //
// (function IR unchanged from the previous dump; duplicate elided)
// -----// IR Dump After FuseTensorPadWithConsumerPass (iree-codegen-fuse-tensor-pad-with-consumer) //----- //
// (function IR unchanged from the previous dump; duplicate elided)
// -----// IR Dump After ConcretizePadResultShapePass (iree-codegen-concretize-pad-result-shape) //----- //
// (function IR unchanged from the previous dump; duplicate elided)
// -----// IR Dump After LLVMCPUTileAndFusePass (iree-llvmcpu-tile-and-fuse) //----- //
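// Note: this is the first pass since configuration that changes the IR. The elementwise
// linalg.generic is tiled to its inner tile size of 16, giving an scf.for with 24 iterations
// (384 / 16) that moves 16-element tiles through extract_slice/insert_slice; the tensor.unpack
// itself still runs in full ahead of the loop.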
func.func @main_graph$async_dispatch_2_unpack_elementwise_384_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert>} {
  %c16 = arith.constant 16 : index
  %c384 = arith.constant 384 : index
  %c0 = arith.constant 0 : index
  %c98304 = arith.constant 98304 : index
  %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>>
  %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c98304) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>>
  %2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<384xf32>>
  %3 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0, 0], sizes = [1, 4, 24, 16, 16], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>> -> tensor<1x4x24x16x16xf32>
  %4 = tensor.empty() : tensor<384xf32>
  %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 1, 384], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>> -> tensor<384xf32>
  %6 = tensor.empty() : tensor<1x50x384xf32>
  %unpack = tensor.unpack %3 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 16] into %6 {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[384, 0, 0], [16, 0, 0], [0, 0, 0], [0, 0, 0]]>} : tensor<1x4x24x16x16xf32> -> tensor<1x50x384xf32>
  %extracted_slice = tensor.extract_slice %unpack[0, 0, 0] [1, 1, 384] [1, 1, 1] : tensor<1x50x384xf32> to tensor<384xf32>
  %7 = scf.for %arg0 = %c0 to %c384 step %c16 iter_args(%arg1 = %4) -> (tensor<384xf32>) {
    %extracted_slice_0 = tensor.extract_slice %extracted_slice[%arg0] [16] [1] : tensor<384xf32> to tensor<16xf32>
    %extracted_slice_1 = tensor.extract_slice %5[%arg0] [16] [1] : tensor<384xf32> to tensor<16xf32>
    %extracted_slice_2 = tensor.extract_slice %arg1[%arg0] [16] [1] : tensor<384xf32> to tensor<16xf32>
    %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%extracted_slice_0, %extracted_slice_1 : tensor<16xf32>, tensor<16xf32>) outs(%extracted_slice_2 : tensor<16xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[384], [16], [0], [0]]>} {
    ^bb0(%in: f32, %in_3: f32, %out: f32):
      %9 = arith.addf %in, %in_3 : f32
      linalg.yield %9 : f32
    } -> tensor<16xf32>
    %inserted_slice = tensor.insert_slice %8 into %arg1[%arg0] [16] [1] : tensor<16xf32> into tensor<384xf32>
    scf.yield %inserted_slice : tensor<384xf32>
  }
  flow.dispatch.tensor.store %7, %2, offsets = [0], sizes = [384], strides = [1] : tensor<384xf32> -> !flow.dispatch.tensor<writeonly:tensor<384xf32>>
  return
}
// -----// IR Dump After FuseTensorPadWithConsumerPass (iree-codegen-fuse-tensor-pad-with-consumer) //----- //
// (function IR unchanged from the previous dump; duplicate elided)
// -----// IR Dump After ConcretizePadResultShapePass (iree-codegen-concretize-pad-result-shape) //----- //
// (function IR unchanged from the previous dump; duplicate elided)
// -----// IR Dump After LLVMCPUSplitReductionPass (iree-llvmcpu-split-reduction) //----- //
// (function IR unchanged from the previous dump; duplicate elided)
// -----// IR Dump After LLVMCPUTileRootAndFuseProducerConsumerPass (iree-llvmcpu-tile-root-and-fuse-producer-consumer) //----- //
// (function IR unchanged from the previous dump; duplicate elided)
// -----// IR Dump After LLVMCPUTileAndFusePass (iree-llvmcpu-tile-and-fuse) //----- //
// (function IR unchanged from the previous dump; duplicate elided)
// -----// IR Dump After FuseTensorPadWithConsumerPass (iree-codegen-fuse-tensor-pad-with-consumer) //----- //
// (function IR unchanged from the previous dump; duplicate elided)
// -----// IR Dump After ConcretizePadResultShapePass (iree-codegen-concretize-pad-result-shape) //----- //
// (function IR unchanged from the previous dump; duplicate elided)
// -----// IR Dump After TensorToVectorVectorizePadPass (iree-codegen-vectorize-tensor-pad) //----- //
// (function IR unchanged from the previous dump; duplicate elided)
// -----// IR Dump After GenericVectorizationPass (iree-codegen-generic-vectorization) //----- //
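// Note: vectorization rewrites the tensor.unpack as vector.transfer_read + vector.transpose +
// vector.shape_cast + vector.transfer_write. The write moves 1x64x384 values into the 1x50x384
// destination, so its middle dimension is marked out-of-bounds (in_bounds = [true, false, true]).
// Inside the loop, the tiled elementwise add becomes arith.addf on vector<16xf32>.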
func.func @main_graph$async_dispatch_2_unpack_elementwise_384_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert>} {
  %cst = arith.constant 0.000000e+00 : f32
  %c16 = arith.constant 16 : index
  %c384 = arith.constant 384 : index
  %c0 = arith.constant 0 : index
  %c98304 = arith.constant 98304 : index
  %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>>
  %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c98304) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>>
  %2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<384xf32>>
  %3 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0, 0], sizes = [1, 4, 24, 16, 16], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>> -> tensor<1x4x24x16x16xf32>
  %4 = tensor.empty() : tensor<384xf32>
  %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 1, 384], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>> -> tensor<384xf32>
  %6 = vector.transfer_read %3[%c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true]} : tensor<1x4x24x16x16xf32>, vector<1x4x24x16x16xf32>
  %7 = vector.transpose %6, [0, 1, 3, 2, 4] : vector<1x4x24x16x16xf32> to vector<1x4x16x24x16xf32>
  %8 = vector.shape_cast %7 : vector<1x4x16x24x16xf32> to vector<1x64x384xf32>
  %9 = tensor.empty() : tensor<1x50x384xf32>
  %10 = vector.transfer_write %8, %9[%c0, %c0, %c0] {in_bounds = [true, false, true]} : vector<1x64x384xf32>, tensor<1x50x384xf32>
  %extracted_slice = tensor.extract_slice %10[0, 0, 0] [1, 1, 384] [1, 1, 1] : tensor<1x50x384xf32> to tensor<384xf32>
  %11 = scf.for %arg0 = %c0 to %c384 step %c16 iter_args(%arg1 = %4) -> (tensor<384xf32>) {
    %extracted_slice_0 = tensor.extract_slice %extracted_slice[%arg0] [16] [1] : tensor<384xf32> to tensor<16xf32>
    %extracted_slice_1 = tensor.extract_slice %5[%arg0] [16] [1] : tensor<384xf32> to tensor<16xf32>
    %extracted_slice_2 = tensor.extract_slice %arg1[%arg0] [16] [1] : tensor<384xf32> to tensor<16xf32>
    %12 = vector.transfer_read %extracted_slice_0[%c0], %cst {in_bounds = [true]} : tensor<16xf32>, vector<16xf32>
    %13 = vector.transfer_read %extracted_slice_1[%c0], %cst {in_bounds = [true]} : tensor<16xf32>, vector<16xf32>
    %14 = arith.addf %12, %13 : vector<16xf32>
    %15 = vector.transfer_write %14, %extracted_slice_2[%c0] {in_bounds = [true]} : vector<16xf32>, tensor<16xf32>
    %inserted_slice = tensor.insert_slice %15 into %arg1[%arg0] [16] [1] : tensor<16xf32> into tensor<384xf32>
    scf.yield %inserted_slice : tensor<384xf32>
  }
  flow.dispatch.tensor.store %11, %2, offsets = [0], sizes = [384], strides = [1] : tensor<384xf32> -> !flow.dispatch.tensor<writeonly:tensor<384xf32>>
  return
}
// -----// IR Dump After OptimizeTensorInsertExtractSlicesPass (iree-codegen-optimize-tensor-insert-extract-slices) //----- //
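// Note: the per-iteration extract_slice/insert_slice pairs are folded into the vector transfers,
// which now index the unpacked 1x50x384 tensor and the 384-element accumulator directly at %arg0.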
func.func @main_graph$async_dispatch_2_unpack_elementwise_384_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert>} { | |
%cst = arith.constant 0.000000e+00 : f32 | |
%c16 = arith.constant 16 : index | |
%c384 = arith.constant 384 : index | |
%c0 = arith.constant 0 : index | |
%c98304 = arith.constant 98304 : index | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c98304) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<384xf32>> | |
%3 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0, 0], sizes = [1, 4, 24, 16, 16], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>> -> tensor<1x4x24x16x16xf32> | |
%4 = tensor.empty() : tensor<384xf32> | |
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 1, 384], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>> -> tensor<384xf32> | |
%6 = vector.transfer_read %3[%c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true]} : tensor<1x4x24x16x16xf32>, vector<1x4x24x16x16xf32> | |
%7 = vector.transpose %6, [0, 1, 3, 2, 4] : vector<1x4x24x16x16xf32> to vector<1x4x16x24x16xf32> | |
%8 = vector.shape_cast %7 : vector<1x4x16x24x16xf32> to vector<1x64x384xf32> | |
%9 = tensor.empty() : tensor<1x50x384xf32> | |
%10 = vector.transfer_write %8, %9[%c0, %c0, %c0] {in_bounds = [true, false, true]} : vector<1x64x384xf32>, tensor<1x50x384xf32> | |
%11 = scf.for %arg0 = %c0 to %c384 step %c16 iter_args(%arg1 = %4) -> (tensor<384xf32>) { | |
%12 = vector.transfer_read %10[%c0, %c0, %arg0], %cst {in_bounds = [true]} : tensor<1x50x384xf32>, vector<16xf32> | |
%13 = vector.transfer_read %5[%arg0], %cst {in_bounds = [true]} : tensor<384xf32>, vector<16xf32> | |
%14 = arith.addf %12, %13 : vector<16xf32> | |
%15 = vector.transfer_write %14, %arg1[%arg0] {in_bounds = [true]} : vector<16xf32>, tensor<384xf32> | |
scf.yield %15 : tensor<384xf32> | |
} | |
flow.dispatch.tensor.store %11, %2, offsets = [0], sizes = [384], strides = [1] : tensor<384xf32> -> !flow.dispatch.tensor<writeonly:tensor<384xf32>> | |
return | |
} | |
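// NOTE (editor annotation, not compiler output): comparing this dump with the
// one above shows what iree-codegen-optimize-tensor-insert-extract-slices did:
// the per-iteration tensor.extract_slice/tensor.insert_slice pairs around the
// 16-lane add were folded into the vector.transfer_read/transfer_write ops, so
// the loop now indexes %10 and %5 directly and accumulates into the iter_arg
// %arg1. The oversized whole-tensor vector ops (%6 through %10) are untouched.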
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @main_graph$async_dispatch_2_unpack_elementwise_384_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert>} {
%cst = arith.constant 0.000000e+00 : f32
%c16 = arith.constant 16 : index
%c384 = arith.constant 384 : index
%c0 = arith.constant 0 : index
%c98304 = arith.constant 98304 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c98304) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<384xf32>>
%3 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0, 0], sizes = [1, 4, 24, 16, 16], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>> -> tensor<1x4x24x16x16xf32>
%4 = tensor.empty() : tensor<384xf32>
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 1, 384], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>> -> tensor<384xf32>
%6 = vector.transfer_read %3[%c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true]} : tensor<1x4x24x16x16xf32>, vector<1x4x24x16x16xf32>
%7 = vector.transpose %6, [0, 1, 3, 2, 4] : vector<1x4x24x16x16xf32> to vector<1x4x16x24x16xf32>
%8 = vector.shape_cast %7 : vector<1x4x16x24x16xf32> to vector<1x64x384xf32>
%9 = tensor.empty() : tensor<1x50x384xf32>
%10 = vector.transfer_write %8, %9[%c0, %c0, %c0] {in_bounds = [true, false, true]} : vector<1x64x384xf32>, tensor<1x50x384xf32>
%11 = scf.for %arg0 = %c0 to %c384 step %c16 iter_args(%arg1 = %4) -> (tensor<384xf32>) {
%12 = vector.transfer_read %10[%c0, %c0, %arg0], %cst {in_bounds = [true]} : tensor<1x50x384xf32>, vector<16xf32>
%13 = vector.transfer_read %5[%arg0], %cst {in_bounds = [true]} : tensor<384xf32>, vector<16xf32>
%14 = arith.addf %12, %13 : vector<16xf32>
%15 = vector.transfer_write %14, %arg1[%arg0] {in_bounds = [true]} : vector<16xf32>, tensor<384xf32>
scf.yield %15 : tensor<384xf32>
}
flow.dispatch.tensor.store %11, %2, offsets = [0], sizes = [384], strides = [1] : tensor<384xf32> -> !flow.dispatch.tensor<writeonly:tensor<384xf32>>
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @main_graph$async_dispatch_2_unpack_elementwise_384_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert>} {
%cst = arith.constant 0.000000e+00 : f32
%c16 = arith.constant 16 : index
%c384 = arith.constant 384 : index
%c0 = arith.constant 0 : index
%c98304 = arith.constant 98304 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c98304) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<384xf32>>
%3 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0, 0], sizes = [1, 4, 24, 16, 16], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>> -> tensor<1x4x24x16x16xf32>
%4 = tensor.empty() : tensor<384xf32>
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 1, 384], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>> -> tensor<384xf32>
%6 = vector.transfer_read %3[%c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true]} : tensor<1x4x24x16x16xf32>, vector<1x4x24x16x16xf32>
%7 = vector.transpose %6, [0, 1, 3, 2, 4] : vector<1x4x24x16x16xf32> to vector<1x4x16x24x16xf32>
%8 = vector.shape_cast %7 : vector<1x4x16x24x16xf32> to vector<1x64x384xf32>
%9 = tensor.empty() : tensor<1x50x384xf32>
%10 = vector.transfer_write %8, %9[%c0, %c0, %c0] {in_bounds = [true, false, true]} : vector<1x64x384xf32>, tensor<1x50x384xf32>
%11 = scf.for %arg0 = %c0 to %c384 step %c16 iter_args(%arg1 = %4) -> (tensor<384xf32>) {
%12 = vector.transfer_read %10[%c0, %c0, %arg0], %cst {in_bounds = [true]} : tensor<1x50x384xf32>, vector<16xf32>
%13 = vector.transfer_read %5[%arg0], %cst {in_bounds = [true]} : tensor<384xf32>, vector<16xf32>
%14 = arith.addf %12, %13 : vector<16xf32>
%15 = vector.transfer_write %14, %arg1[%arg0] {in_bounds = [true]} : vector<16xf32>, tensor<384xf32>
scf.yield %15 : tensor<384xf32>
}
flow.dispatch.tensor.store %11, %2, offsets = [0], sizes = [384], strides = [1] : tensor<384xf32> -> !flow.dispatch.tensor<writeonly:tensor<384xf32>>
return
}
module_main_graph$async_dispatch_2.mlir:9:7: error: One or more operations with large vector sizes (32768 bytes) were found:
func.func @main_graph$async_dispatch_2_unpack_elementwise_384_f32() {
^
module_main_graph$async_dispatch_2.mlir:19:19: note: %6 = vector.transfer_read %3[%c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true]} : tensor<1x4x24x16x16xf32>, vector<1x4x24x16x16xf32>
%unpack = tensor.unpack %3 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 16] into %6 : tensor<1x4x24x16x16xf32> -> tensor<1x50x384xf32>
^
module_main_graph$async_dispatch_2.mlir:19:19: note: %7 = vector.transpose %6, [0, 1, 3, 2, 4] : vector<1x4x24x16x16xf32> to vector<1x4x16x24x16xf32>
module_main_graph$async_dispatch_2.mlir:19:19: note: %8 = vector.shape_cast %7 : vector<1x4x16x24x16xf32> to vector<1x64x384xf32>
module_main_graph$async_dispatch_2.mlir:19:19: note: %10 = vector.transfer_write %8, %9[%c0, %c0, %c0] {in_bounds = [true, false, true]} : vector<1x64x384xf32>, tensor<1x50x384xf32>
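// NOTE (editor annotation, not compiler output): the arithmetic behind this
// diagnostic: vector<1x4x24x16x16xf32> and vector<1x64x384xf32> each hold
// 1*4*24*16*16 = 1*64*384 = 24576 f32 lanes, i.e. 24576 * 4 = 98304 bytes per
// value. The 32768 bytes in the message is presumably the verifier's budget
// (it matches native_vector_size = 64 bytes times an allowance of 512 naive
// vectors), so these ops are 3x over the limit. The root cause, per the note
// above, is the tensor.unpack being vectorized at its full 1x4x24x16x16 shape
// instead of being tiled to a smaller shape first.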
// -----// IR Dump After LLVMCPUVerifyVectorSizeLegalityPass Failed (iree-llvmcpu-verify-vector-size-legality) //----- //
func.func @main_graph$async_dispatch_2_unpack_elementwise_384_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert>} {
%cst = arith.constant 0.000000e+00 : f32
%c16 = arith.constant 16 : index
%c384 = arith.constant 384 : index
%c0 = arith.constant 0 : index
%c98304 = arith.constant 98304 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c98304) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<384xf32>>
%3 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0, 0], sizes = [1, 4, 24, 16, 16], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>> -> tensor<1x4x24x16x16xf32>
%4 = tensor.empty() : tensor<384xf32>
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 1, 384], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>> -> tensor<384xf32>
%6 = vector.transfer_read %3[%c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true]} : tensor<1x4x24x16x16xf32>, vector<1x4x24x16x16xf32>
%7 = vector.transpose %6, [0, 1, 3, 2, 4] : vector<1x4x24x16x16xf32> to vector<1x4x16x24x16xf32>
%8 = vector.shape_cast %7 : vector<1x4x16x24x16xf32> to vector<1x64x384xf32>
%9 = tensor.empty() : tensor<1x50x384xf32>
%10 = vector.transfer_write %8, %9[%c0, %c0, %c0] {in_bounds = [true, false, true]} : vector<1x64x384xf32>, tensor<1x50x384xf32>
%11 = scf.for %arg0 = %c0 to %c384 step %c16 iter_args(%arg1 = %4) -> (tensor<384xf32>) {
%12 = vector.transfer_read %10[%c0, %c0, %arg0], %cst {in_bounds = [true]} : tensor<1x50x384xf32>, vector<16xf32>
%13 = vector.transfer_read %5[%arg0], %cst {in_bounds = [true]} : tensor<384xf32>, vector<16xf32>
%14 = arith.addf %12, %13 : vector<16xf32>
%15 = vector.transfer_write %14, %arg1[%arg0] {in_bounds = [true]} : vector<16xf32>, tensor<384xf32>
scf.yield %15 : tensor<384xf32>
}
flow.dispatch.tensor.store %11, %2, offsets = [0], sizes = [384], strides = [1] : tensor<384xf32> -> !flow.dispatch.tensor<writeonly:tensor<384xf32>>
return
}
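// NOTE (editor annotation): a minimal, hypothetical reproducer sketch for the
// verifier failure; the function name below is made up, but the vector type is
// the one the diagnostic points at, and a single transfer_read of it is enough
// to trip the size check:
func.func @oversized_read(%src: tensor<1x4x24x16x16xf32>) -> vector<1x4x24x16x16xf32> {
  %c0 = arith.constant 0 : index
  %pad = arith.constant 0.0 : f32
  // One virtual vector value of 24576 f32 lanes = 98304 bytes.
  %v = vector.transfer_read %src[%c0, %c0, %c0, %c0, %c0], %pad {in_bounds = [true, true, true, true, true]} : tensor<1x4x24x16x16xf32>, vector<1x4x24x16x16xf32>
  return %v : vector<1x4x24x16x16xf32>
}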
// -----// IR Dump After LLVMCPULowerExecutableTargetPass Failed (iree-llvmcpu-lower-executable-target) //----- //
func.func @main_graph$async_dispatch_2_unpack_elementwise_384_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert>} {
%cst = arith.constant 0.000000e+00 : f32
%c16 = arith.constant 16 : index
%c384 = arith.constant 384 : index
%c0 = arith.constant 0 : index
%c98304 = arith.constant 98304 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c98304) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<384xf32>>
%3 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0, 0], sizes = [1, 4, 24, 16, 16], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>> -> tensor<1x4x24x16x16xf32>
%4 = tensor.empty() : tensor<384xf32>
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 1, 384], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>> -> tensor<384xf32>
%6 = vector.transfer_read %3[%c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true]} : tensor<1x4x24x16x16xf32>, vector<1x4x24x16x16xf32>
%7 = vector.transpose %6, [0, 1, 3, 2, 4] : vector<1x4x24x16x16xf32> to vector<1x4x16x24x16xf32>
%8 = vector.shape_cast %7 : vector<1x4x16x24x16xf32> to vector<1x64x384xf32>
%9 = tensor.empty() : tensor<1x50x384xf32>
%10 = vector.transfer_write %8, %9[%c0, %c0, %c0] {in_bounds = [true, false, true]} : vector<1x64x384xf32>, tensor<1x50x384xf32>
%11 = scf.for %arg0 = %c0 to %c384 step %c16 iter_args(%arg1 = %4) -> (tensor<384xf32>) {
%12 = vector.transfer_read %10[%c0, %c0, %arg0], %cst {in_bounds = [true]} : tensor<1x50x384xf32>, vector<16xf32>
%13 = vector.transfer_read %5[%arg0], %cst {in_bounds = [true]} : tensor<384xf32>, vector<16xf32>
%14 = arith.addf %12, %13 : vector<16xf32>
%15 = vector.transfer_write %14, %arg1[%arg0] {in_bounds = [true]} : vector<16xf32>, tensor<384xf32>
scf.yield %15 : tensor<384xf32>
}
flow.dispatch.tensor.store %11, %2, offsets = [0], sizes = [384], strides = [1] : tensor<384xf32> -> !flow.dispatch.tensor<writeonly:tensor<384xf32>>
return
}
module_main_graph$async_dispatch_2.mlir:2:3: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+prfchw,-cldemote,+avx,+aes,+sahf,+pclmul,-xop,+crc32,-amx-fp8,+xsaves,-avx512fp16,-usermsr,-sm4,-egpr,+sse4.1,+avx512ifma,+xsave,+sse4.2,-tsxldtrk,-sm3,-ptwrite,-widekl,-movrs,+invpcid,+64bit,+xsavec,-avx10.1-512,+avx512vpopcntdq,+cmov,-avx512vp2intersect,+avx512cd,+movbe,-avxvnniint8,-ccmp,-amx-int8,-kl,-avx10.1-256,+evex512,-avxvnni,-rtm,+adx,+avx2,-hreset,-movdiri,-serialize,-sha512,+vpclmulqdq,+avx512vl,-uintr,-cf,+clflushopt,-raoint,-cmpccxadd,+bmi,-amx-tile,+sse,-avx10.2-256,+gfni,-avxvnniint16,-amx-fp16,-zu,-ndd,+xsaveopt,+rdrnd,+avx512f,-amx-bf16,+avx512bf16,+avx512vnni,-push2pop2,+cx8,+avx512bw,+sse3,+pku,-nf,-amx-tf32,-amx-avx512,+fsgsbase,+clzero,+mwaitx,-lwp,+lzcnt,+sha,-movdir64b,-ppx,+wbnoinvd,-enqcmd,-amx-transpose,-avx10.2-512,-avxneconvert,-tbm,-pconfig,-amx-complex,+ssse3,+cx16,+bmi2,+fma,+popcnt,-avxifma,+f16c,+avx512bitalg,+rdpru,+clwb,+mmx,+sse2,+rdseed,+avx512vbmi2,-prefetchi,-amx-movrs,+rdpid,-fma4,+avx512vbmi,+shstk,+vaes,-waitpkg,-sgx,+fxsr,+avx512dq,+sse4a", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+prfchw,-cldemote,+avx,+aes,+sahf,+pclmul,-xop,+crc32,-amx-fp8,+xsaves,-avx512fp16,-usermsr,-sm4,-egpr,+sse4.1,+avx512ifma,+xsave,+sse4.2,-tsxldtrk,-sm3,-ptwrite,-widekl,-movrs,+invpcid,+64bit,+xsavec,-avx10.1-512,+avx512vpopcntdq,+cmov,-avx512vp2intersect,+avx512cd,+movbe,-avxvnniint8,-ccmp,-amx-int8,-kl,-avx10.1-256,+evex512,-avxvnni,-rtm,+adx,+avx2,-hreset,-movdiri,-serialize,-sha512,+vpclmulqdq,+avx512vl,-uintr,-cf,+clflushopt,-raoint,-cmpccxadd,+bmi,-amx-tile,+sse,-avx10.2-256,+gfni,-avxvnniint16,-amx-fp16,-zu,-ndd,+xsaveopt,+rdrnd,+avx512f,-amx-bf16,+avx512bf16,+avx512vnni,-push2pop2,+cx8,+avx512bw,+sse3,+pku,-nf,-amx-tf32,-amx-avx512,+fsgsbase,+clzero,+mwaitx,-lwp,+lzcnt,+sha,-movdir64b,-ppx,+wbnoinvd,-enqcmd,-amx-transpose,-avx10.2-512,-avxneconvert,-tbm,-pconfig,-amx-complex,+ssse3,+cx16,+bmi2,+fma,+popcnt,-avxifma,+f16c,+avx512bitalg,+rdpru,+clwb,+mmx,+sse2,+rdseed,+avx512vbmi2,-prefetchi,-amx-movrs,+rdpid,-fma4,+avx512vbmi,+shstk,+vaes,-waitpkg,-sgx,+fxsr,+avx512dq,+sse4a", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>) {
^
module_main_graph$async_dispatch_2.mlir:2:3: note: see current operation:
"hal.executable.variant"() ({
"hal.executable.export"() ({
^bb0(%arg2: !hal.device):
%21:3 = "flow.dispatch.workgroup_count_from_slice"() : () -> (index, index, index)
"hal.return"(%21#0, %21#1, %21#2) : (index, index, index) -> ()
}) {layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, ordinal = 0 : index, sym_name = "main_graph$async_dispatch_2_unpack_elementwise_384_f32"} : () -> ()
"builtin.module"() ({
"func.func"() <{function_type = () -> (), sym_name = "main_graph$async_dispatch_2_unpack_elementwise_384_f32"}> ({
%0 = "arith.constant"() <{value = 0.000000e+00 : f32}> : () -> f32
%1 = "arith.constant"() <{value = 16 : index}> : () -> index
%2 = "arith.constant"() <{value = 384 : index}> : () -> index
%3 = "arith.constant"() <{value = 0 : index}> : () -> index
%4 = "arith.constant"() <{value = 98304 : index}> : () -> index
%5 = "hal.interface.binding.subspan"(%3) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 0>} : (index) -> !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>>
%6 = "hal.interface.binding.subspan"(%4) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 0>} : (index) -> !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>>
%7 = "hal.interface.binding.subspan"(%3) {alignment = 64 : index, binding = 2 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>, operandSegmentSizes = array<i32: 1, 0>} : (index) -> !flow.dispatch.tensor<writeonly:tensor<384xf32>>
%8 = "flow.dispatch.tensor.load"(%6) <{operandSegmentSizes = array<i32: 1, 0, 0, 0, 0>, static_offsets = array<i64: 0, 0, 0, 0, 0>, static_sizes = array<i64: 1, 4, 24, 16, 16>, static_strides = array<i64: 1, 1, 1, 1, 1>}> : (!flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>>) -> tensor<1x4x24x16x16xf32>
%9 = "tensor.empty"() : () -> tensor<384xf32>
%10 = "flow.dispatch.tensor.load"(%5) <{operandSegmentSizes = array<i32: 1, 0, 0, 0, 0>, static_offsets = array<i64: 0, 0, 0>, static_sizes = array<i64: 1, 1, 384>, static_strides = array<i64: 1, 1, 1>}> : (!flow.dispatch.tensor<readonly:tensor<1x50x384xf32>>) -> tensor<384xf32>
%11 = "vector.transfer_read"(%8, %3, %3, %3, %3, %3, %0) <{in_bounds = [true, true, true, true, true], operandSegmentSizes = array<i32: 1, 5, 1, 0>, permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)>}> : (tensor<1x4x24x16x16xf32>, index, index, index, index, index, f32) -> vector<1x4x24x16x16xf32>
%12 = "vector.transpose"(%11) <{permutation = array<i64: 0, 1, 3, 2, 4>}> : (vector<1x4x24x16x16xf32>) -> vector<1x4x16x24x16xf32>
%13 = "vector.shape_cast"(%12) : (vector<1x4x16x24x16xf32>) -> vector<1x64x384xf32>
%14 = "tensor.empty"() : () -> tensor<1x50x384xf32>
%15 = "vector.transfer_write"(%13, %14, %3, %3, %3) <{in_bounds = [true, false, true], operandSegmentSizes = array<i32: 1, 1, 3, 0>, permutation_map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>}> : (vector<1x64x384xf32>, tensor<1x50x384xf32>, index, index, index) -> tensor<1x50x384xf32>
%16 = "scf.for"(%3, %2, %1, %9) ({
^bb0(%arg0: index, %arg1: tensor<384xf32>):
%17 = "vector.transfer_read"(%15, %3, %3, %arg0, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 3, 1, 0>, permutation_map = affine_map<(d0, d1, d2) -> (d2)>}> : (tensor<1x50x384xf32>, index, index, index, f32) -> vector<16xf32>
%18 = "vector.transfer_read"(%10, %arg0, %0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 1, 0>, permutation_map = affine_map<(d0) -> (d0)>}> : (tensor<384xf32>, index, f32) -> vector<16xf32>
%19 = "arith.addf"(%17, %18) <{denormal = #arith.denormal<ieee>, fastmath = #arith.fastmath<none>}> : (vector<16xf32>, vector<16xf32>) -> vector<16xf32>
%20 = "vector.transfer_write"(%19, %arg1, %arg0) <{in_bounds = [true], operandSegmentSizes = array<i32: 1, 1, 1, 0>, permutation_map = affine_map<(d0) -> (d0)>}> : (vector<16xf32>, tensor<384xf32>, index) -> tensor<384xf32>
"scf.yield"(%20) : (tensor<384xf32>) -> ()
}) : (index, index, index, tensor<384xf32>) -> tensor<384xf32>
"flow.dispatch.tensor.store"(%16, %7) <{operandSegmentSizes = array<i32: 1, 1, 0, 0, 0, 0>, static_offsets = array<i64: 0>, static_sizes = array<i64: 384>, static_strides = array<i64: 1>}> : (tensor<384xf32>, !flow.dispatch.tensor<writeonly:tensor<384xf32>>) -> ()
"func.return"() : () -> ()
}) {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert>} : () -> ()
}) : () -> ()
"hal.executable.variant_end"() : () -> ()
}) {sym_name = "embedded_elf_x86_64", sym_visibility = "public", target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+prfchw,-cldemote,+avx,+aes,+sahf,+pclmul,-xop,+crc32,-amx-fp8,+xsaves,-avx512fp16,-usermsr,-sm4,-egpr,+sse4.1,+avx512ifma,+xsave,+sse4.2,-tsxldtrk,-sm3,-ptwrite,-widekl,-movrs,+invpcid,+64bit,+xsavec,-avx10.1-512,+avx512vpopcntdq,+cmov,-avx512vp2intersect,+avx512cd,+movbe,-avxvnniint8,-ccmp,-amx-int8,-kl,-avx10.1-256,+evex512,-avxvnni,-rtm,+adx,+avx2,-hreset,-movdiri,-serialize,-sha512,+vpclmulqdq,+avx512vl,-uintr,-cf,+clflushopt,-raoint,-cmpccxadd,+bmi,-amx-tile,+sse,-avx10.2-256,+gfni,-avxvnniint16,-amx-fp16,-zu,-ndd,+xsaveopt,+rdrnd,+avx512f,-amx-bf16,+avx512bf16,+avx512vnni,-push2pop2,+cx8,+avx512bw,+sse3,+pku,-nf,-amx-tf32,-amx-avx512,+fsgsbase,+clzero,+mwaitx,-lwp,+lzcnt,+sha,-movdir64b,-ppx,+wbnoinvd,-enqcmd,-amx-transpose,-avx10.2-512,-avxneconvert,-tbm,-pconfig,-amx-complex,+ssse3,+cx16,+bmi2,+fma,+popcnt,-avxifma,+f16c,+avx512bitalg,+rdpru,+clwb,+mmx,+sse2,+rdseed,+avx512vbmi2,-prefetchi,-amx-movrs,+rdpid,-fma4,+avx512vbmi,+shstk,+vaes,-waitpkg,-sgx,+fxsr,+avx512dq,+sse4a", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>} : () -> ()
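// NOTE (editor annotation, not compiler output): the "see current operation"
// dump above is the same executable variant printed in MLIR's generic op form
// (quoted op names and explicit attribute dictionaries), which is how IR is
// printed once a pass has signaled failure and the IR may no longer verify; it
// carries no new information beyond the pretty-printed dumps before it.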
// -----// IR Dump After TranslateTargetExecutableVariantsPass Failed (iree-hal-translate-target-executable-variants) //----- //
hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+prfchw,-cldemote,+avx,+aes,+sahf,+pclmul,-xop,+crc32,-amx-fp8,+xsaves,-avx512fp16,-usermsr,-sm4,-egpr,+sse4.1,+avx512ifma,+xsave,+sse4.2,-tsxldtrk,-sm3,-ptwrite,-widekl,-movrs,+invpcid,+64bit,+xsavec,-avx10.1-512,+avx512vpopcntdq,+cmov,-avx512vp2intersect,+avx512cd,+movbe,-avxvnniint8,-ccmp,-amx-int8,-kl,-avx10.1-256,+evex512,-avxvnni,-rtm,+adx,+avx2,-hreset,-movdiri,-serialize,-sha512,+vpclmulqdq,+avx512vl,-uintr,-cf,+clflushopt,-raoint,-cmpccxadd,+bmi,-amx-tile,+sse,-avx10.2-256,+gfni,-avxvnniint16,-amx-fp16,-zu,-ndd,+xsaveopt,+rdrnd,+avx512f,-amx-bf16,+avx512bf16,+avx512vnni,-push2pop2,+cx8,+avx512bw,+sse3,+pku,-nf,-amx-tf32,-amx-avx512,+fsgsbase,+clzero,+mwaitx,-lwp,+lzcnt,+sha,-movdir64b,-ppx,+wbnoinvd,-enqcmd,-amx-transpose,-avx10.2-512,-avxneconvert,-tbm,-pconfig,-amx-complex,+ssse3,+cx16,+bmi2,+fma,+popcnt,-avxifma,+f16c,+avx512bitalg,+rdpru,+clwb,+mmx,+sse2,+rdseed,+avx512vbmi2,-prefetchi,-amx-movrs,+rdpid,-fma4,+avx512vbmi,+shstk,+vaes,-waitpkg,-sgx,+fxsr,+avx512dq,+sse4a", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>) {
hal.executable.export public @main_graph$async_dispatch_2_unpack_elementwise_384_f32 ordinal(0) layout(#hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @main_graph$async_dispatch_2_unpack_elementwise_384_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert>} {
%cst = arith.constant 0.000000e+00 : f32
%c16 = arith.constant 16 : index
%c384 = arith.constant 384 : index
%c0 = arith.constant 0 : index
%c98304 = arith.constant 98304 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c98304) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<384xf32>>
%3 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0, 0], sizes = [1, 4, 24, 16, 16], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>> -> tensor<1x4x24x16x16xf32>
%4 = tensor.empty() : tensor<384xf32>
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 1, 384], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>> -> tensor<384xf32>
%6 = vector.transfer_read %3[%c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true]} : tensor<1x4x24x16x16xf32>, vector<1x4x24x16x16xf32>
%7 = vector.transpose %6, [0, 1, 3, 2, 4] : vector<1x4x24x16x16xf32> to vector<1x4x16x24x16xf32>
%8 = vector.shape_cast %7 : vector<1x4x16x24x16xf32> to vector<1x64x384xf32>
%9 = tensor.empty() : tensor<1x50x384xf32>
%10 = vector.transfer_write %8, %9[%c0, %c0, %c0] {in_bounds = [true, false, true]} : vector<1x64x384xf32>, tensor<1x50x384xf32>
%11 = scf.for %arg0 = %c0 to %c384 step %c16 iter_args(%arg1 = %4) -> (tensor<384xf32>) {
%12 = vector.transfer_read %10[%c0, %c0, %arg0], %cst {in_bounds = [true]} : tensor<1x50x384xf32>, vector<16xf32>
%13 = vector.transfer_read %5[%arg0], %cst {in_bounds = [true]} : tensor<384xf32>, vector<16xf32>
%14 = arith.addf %12, %13 : vector<16xf32>
%15 = vector.transfer_write %14, %arg1[%arg0] {in_bounds = [true]} : vector<16xf32>, tensor<384xf32>
scf.yield %15 : tensor<384xf32>
}
flow.dispatch.tensor.store %11, %2, offsets = [0], sizes = [384], strides = [1] : tensor<384xf32> -> !flow.dispatch.tensor<writeonly:tensor<384xf32>>
return
}
}
}
failed to translate executables
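// NOTE (editor annotation, not compiler output): from here on the log is the
// same failure propagating outward: the vector-size verifier aborted
// iree-llvmcpu-lower-executable-target, which failed
// iree-hal-translate-target-executable-variants above, and the final dump
// below shows iree-hal-translate-all-executables failing for the same reason
// with the IR unchanged.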
// -----// IR Dump After TranslateAllExecutablesPass Failed (iree-hal-translate-all-executables) //----- //
hal.executable public @main_graph$async_dispatch_2 {
hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+prfchw,-cldemote,+avx,+aes,+sahf,+pclmul,-xop,+crc32,-amx-fp8,+xsaves,-avx512fp16,-usermsr,-sm4,-egpr,+sse4.1,+avx512ifma,+xsave,+sse4.2,-tsxldtrk,-sm3,-ptwrite,-widekl,-movrs,+invpcid,+64bit,+xsavec,-avx10.1-512,+avx512vpopcntdq,+cmov,-avx512vp2intersect,+avx512cd,+movbe,-avxvnniint8,-ccmp,-amx-int8,-kl,-avx10.1-256,+evex512,-avxvnni,-rtm,+adx,+avx2,-hreset,-movdiri,-serialize,-sha512,+vpclmulqdq,+avx512vl,-uintr,-cf,+clflushopt,-raoint,-cmpccxadd,+bmi,-amx-tile,+sse,-avx10.2-256,+gfni,-avxvnniint16,-amx-fp16,-zu,-ndd,+xsaveopt,+rdrnd,+avx512f,-amx-bf16,+avx512bf16,+avx512vnni,-push2pop2,+cx8,+avx512bw,+sse3,+pku,-nf,-amx-tf32,-amx-avx512,+fsgsbase,+clzero,+mwaitx,-lwp,+lzcnt,+sha,-movdir64b,-ppx,+wbnoinvd,-enqcmd,-amx-transpose,-avx10.2-512,-avxneconvert,-tbm,-pconfig,-amx-complex,+ssse3,+cx16,+bmi2,+fma,+popcnt,-avxifma,+f16c,+avx512bitalg,+rdpru,+clwb,+mmx,+sse2,+rdseed,+avx512vbmi2,-prefetchi,-amx-movrs,+rdpid,-fma4,+avx512vbmi,+shstk,+vaes,-waitpkg,-sgx,+fxsr,+avx512dq,+sse4a", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>) {
hal.executable.export public @main_graph$async_dispatch_2_unpack_elementwise_384_f32 ordinal(0) layout(#hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @main_graph$async_dispatch_2_unpack_elementwise_384_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert>} {
%cst = arith.constant 0.000000e+00 : f32
%c16 = arith.constant 16 : index
%c384 = arith.constant 384 : index
%c0 = arith.constant 0 : index
%c98304 = arith.constant 98304 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c98304) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<384xf32>>
%3 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0, 0], sizes = [1, 4, 24, 16, 16], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4x24x16x16xf32>> -> tensor<1x4x24x16x16xf32>
%4 = tensor.empty() : tensor<384xf32>
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 1, 384], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x50x384xf32>> -> tensor<384xf32>
%6 = vector.transfer_read %3[%c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true]} : tensor<1x4x24x16x16xf32>, vector<1x4x24x16x16xf32>
%7 = vector.transpose %6, [0, 1, 3, 2, 4] : vector<1x4x24x16x16xf32> to vector<1x4x16x24x16xf32>
%8 = vector.shape_cast %7 : vector<1x4x16x24x16xf32> to vector<1x64x384xf32>
%9 = tensor.empty() : tensor<1x50x384xf32>
%10 = vector.transfer_write %8, %9[%c0, %c0, %c0] {in_bounds = [true, false, true]} : vector<1x64x384xf32>, tensor<1x50x384xf32>
%11 = scf.for %arg0 = %c0 to %c384 step %c16 iter_args(%arg1 = %4) -> (tensor<384xf32>) {
%12 = vector.transfer_read %10[%c0, %c0, %arg0], %cst {in_bounds = [true]} : tensor<1x50x384xf32>, vector<16xf32>
%13 = vector.transfer_read %5[%arg0], %cst {in_bounds = [true]} : tensor<384xf32>, vector<16xf32>
%14 = arith.addf %12, %13 : vector<16xf32>
%15 = vector.transfer_write %14, %arg1[%arg0] {in_bounds = [true]} : vector<16xf32>, tensor<384xf32>
scf.yield %15 : tensor<384xf32>
}
flow.dispatch.tensor.store %11, %2, offsets = [0], sizes = [384], strides = [1] : tensor<384xf32> -> !flow.dispatch.tensor<writeonly:tensor<384xf32>>
return
}
}
}
}