Skip to content

Instantly share code, notes, and snippets.

@bjacob
Last active October 7, 2024 15:16
Show Gist options
  • Save bjacob/a493b64ba99f8820448cdfcd13f188f6 to your computer and use it in GitHub Desktop.
Bad codegen for `vector<8xi8>` operands to MFMA intrinsics
tools/iree-compile /tmp/x/module_foo_dispatch_6.mlir --iree-hal-target-backends=rocm --iree-hip-target=gfx942 --iree-opt-data-tiling --iree-global-opt-experimental-rocm-data-tiling --iree-global-opt-enable-early-materialization=true -o /tmp/a.vmfb --compile-from=executable-sources -mlir-disable-threading -mlir-print-ir-after-all 2>/tmp/log
This file has been truncated, but you can view the full file.
// -----// IR Dump After AssignLegacyTargetDevicesPass (iree-hal-assign-legacy-target-devices) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>, ukernels = "none"}>
#map = affine_map<(d0, d1, d2) -> (d0, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d1, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
#pipeline_layout = #hal.pipeline.layout<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", {legacy_sync}, [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {hal.device.targets = [#device_target_hip]} {
hal.executable public @foo_dispatch_6 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @foo_dispatch_6 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1, %arg2, %arg3, %arg4, %arg5, %arg6
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @foo_dispatch_6() {
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32
%1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32
%2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32
%3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : i32
%4 = hal.interface.constant.load layout(#pipeline_layout) ordinal(4) : i32
%5 = hal.interface.constant.load layout(#pipeline_layout) ordinal(5) : i32
%6 = hal.interface.constant.load layout(#pipeline_layout) ordinal(6) : i32
%7 = hal.interface.constant.load layout(#pipeline_layout) ordinal(7) : i32
%8 = hal.interface.constant.load layout(#pipeline_layout) ordinal(8) : i32
%9 = hal.interface.constant.load layout(#pipeline_layout) ordinal(9) : i32
%10 = hal.interface.constant.load layout(#pipeline_layout) ordinal(10) : i32
%11 = hal.interface.constant.load layout(#pipeline_layout) ordinal(11) : i32
%12 = hal.interface.constant.load layout(#pipeline_layout) ordinal(12) : i32
%13 = hal.interface.constant.load layout(#pipeline_layout) ordinal(13) : i32
%14 = hal.interface.constant.load layout(#pipeline_layout) ordinal(14) : i32
%15 = hal.interface.constant.load layout(#pipeline_layout) ordinal(15) : i32
%16 = hal.interface.constant.load layout(#pipeline_layout) ordinal(16) : i32
%17 = hal.interface.constant.load layout(#pipeline_layout) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = flow.dispatch.workload.ordinal %37, 0 : index
%64 = flow.dispatch.workload.ordinal %42, 1 : index
%65 = flow.dispatch.workload.ordinal %47, 2 : index
%66 = flow.dispatch.workload.ordinal %52, 3 : index
%67 = flow.dispatch.workload.ordinal %57, 4 : index
%68 = flow.dispatch.workload.ordinal %62, 5 : index
%69 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64}
%70 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66}
%71 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68}
%72 = flow.dispatch.tensor.load %69, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%63, %64, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64} -> tensor<?x?x8x4x16x2x8xi8>
%73 = flow.dispatch.tensor.load %70, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%65, %66, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66} -> tensor<?x?x4x2x4x16x2x8xi8>
%74 = flow.dispatch.tensor.load %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} -> tensor<?x?x8x4x2x4x16x4xi32>
%75 = iree_gpu.multi_mma %72, %73, %74 {indexing_maps = [#map, #map1, #map2], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32>
flow.dispatch.tensor.store %75, %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68}
return
}
}
}
}
}
// -----// IR Dump After MaterializeTargetDevicesPass (iree-hal-materialize-target-devices) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>, ukernels = "none"}>
#map = affine_map<(d0, d1, d2) -> (d0, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d1, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
#pipeline_layout = #hal.pipeline.layout<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", {legacy_sync}, [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @foo_dispatch_6 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @foo_dispatch_6 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1, %arg2, %arg3, %arg4, %arg5, %arg6
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @foo_dispatch_6() {
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32
%1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32
%2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32
%3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : i32
%4 = hal.interface.constant.load layout(#pipeline_layout) ordinal(4) : i32
%5 = hal.interface.constant.load layout(#pipeline_layout) ordinal(5) : i32
%6 = hal.interface.constant.load layout(#pipeline_layout) ordinal(6) : i32
%7 = hal.interface.constant.load layout(#pipeline_layout) ordinal(7) : i32
%8 = hal.interface.constant.load layout(#pipeline_layout) ordinal(8) : i32
%9 = hal.interface.constant.load layout(#pipeline_layout) ordinal(9) : i32
%10 = hal.interface.constant.load layout(#pipeline_layout) ordinal(10) : i32
%11 = hal.interface.constant.load layout(#pipeline_layout) ordinal(11) : i32
%12 = hal.interface.constant.load layout(#pipeline_layout) ordinal(12) : i32
%13 = hal.interface.constant.load layout(#pipeline_layout) ordinal(13) : i32
%14 = hal.interface.constant.load layout(#pipeline_layout) ordinal(14) : i32
%15 = hal.interface.constant.load layout(#pipeline_layout) ordinal(15) : i32
%16 = hal.interface.constant.load layout(#pipeline_layout) ordinal(16) : i32
%17 = hal.interface.constant.load layout(#pipeline_layout) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = flow.dispatch.workload.ordinal %37, 0 : index
%64 = flow.dispatch.workload.ordinal %42, 1 : index
%65 = flow.dispatch.workload.ordinal %47, 2 : index
%66 = flow.dispatch.workload.ordinal %52, 3 : index
%67 = flow.dispatch.workload.ordinal %57, 4 : index
%68 = flow.dispatch.workload.ordinal %62, 5 : index
%69 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64}
%70 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66}
%71 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68}
%72 = flow.dispatch.tensor.load %69, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%63, %64, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64} -> tensor<?x?x8x4x16x2x8xi8>
%73 = flow.dispatch.tensor.load %70, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%65, %66, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66} -> tensor<?x?x4x2x4x16x2x8xi8>
%74 = flow.dispatch.tensor.load %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} -> tensor<?x?x8x4x2x4x16x4xi32>
%75 = iree_gpu.multi_mma %72, %73, %74 {indexing_maps = [#map, #map1, #map2], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32>
flow.dispatch.tensor.store %75, %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68}
return
}
}
}
}
}
// -----// IR Dump After ResolveDevicePromisesPass (iree-hal-resolve-device-promises) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>, ukernels = "none"}>
#map = affine_map<(d0, d1, d2) -> (d0, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d1, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
#pipeline_layout = #hal.pipeline.layout<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", {legacy_sync}, [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @foo_dispatch_6 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @foo_dispatch_6 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1, %arg2, %arg3, %arg4, %arg5, %arg6
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @foo_dispatch_6() {
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32
%1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32
%2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32
%3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : i32
%4 = hal.interface.constant.load layout(#pipeline_layout) ordinal(4) : i32
%5 = hal.interface.constant.load layout(#pipeline_layout) ordinal(5) : i32
%6 = hal.interface.constant.load layout(#pipeline_layout) ordinal(6) : i32
%7 = hal.interface.constant.load layout(#pipeline_layout) ordinal(7) : i32
%8 = hal.interface.constant.load layout(#pipeline_layout) ordinal(8) : i32
%9 = hal.interface.constant.load layout(#pipeline_layout) ordinal(9) : i32
%10 = hal.interface.constant.load layout(#pipeline_layout) ordinal(10) : i32
%11 = hal.interface.constant.load layout(#pipeline_layout) ordinal(11) : i32
%12 = hal.interface.constant.load layout(#pipeline_layout) ordinal(12) : i32
%13 = hal.interface.constant.load layout(#pipeline_layout) ordinal(13) : i32
%14 = hal.interface.constant.load layout(#pipeline_layout) ordinal(14) : i32
%15 = hal.interface.constant.load layout(#pipeline_layout) ordinal(15) : i32
%16 = hal.interface.constant.load layout(#pipeline_layout) ordinal(16) : i32
%17 = hal.interface.constant.load layout(#pipeline_layout) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = flow.dispatch.workload.ordinal %37, 0 : index
%64 = flow.dispatch.workload.ordinal %42, 1 : index
%65 = flow.dispatch.workload.ordinal %47, 2 : index
%66 = flow.dispatch.workload.ordinal %52, 3 : index
%67 = flow.dispatch.workload.ordinal %57, 4 : index
%68 = flow.dispatch.workload.ordinal %62, 5 : index
%69 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64}
%70 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66}
%71 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68}
%72 = flow.dispatch.tensor.load %69, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%63, %64, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64} -> tensor<?x?x8x4x16x2x8xi8>
%73 = flow.dispatch.tensor.load %70, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%65, %66, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66} -> tensor<?x?x4x2x4x16x2x8xi8>
%74 = flow.dispatch.tensor.load %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} -> tensor<?x?x8x4x2x4x16x4xi32>
%75 = iree_gpu.multi_mma %72, %73, %74 {indexing_maps = [#map, #map1, #map2], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32>
flow.dispatch.tensor.store %75, %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68}
return
}
}
}
}
}
// -----// IR Dump After ResolveDeviceAliasesPass (iree-hal-resolve-device-aliases) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>, ukernels = "none"}>
#map = affine_map<(d0, d1, d2) -> (d0, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d1, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
#pipeline_layout = #hal.pipeline.layout<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", {legacy_sync}, [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @foo_dispatch_6 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @foo_dispatch_6 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1, %arg2, %arg3, %arg4, %arg5, %arg6
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @foo_dispatch_6() {
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32
%1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32
%2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32
%3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : i32
%4 = hal.interface.constant.load layout(#pipeline_layout) ordinal(4) : i32
%5 = hal.interface.constant.load layout(#pipeline_layout) ordinal(5) : i32
%6 = hal.interface.constant.load layout(#pipeline_layout) ordinal(6) : i32
%7 = hal.interface.constant.load layout(#pipeline_layout) ordinal(7) : i32
%8 = hal.interface.constant.load layout(#pipeline_layout) ordinal(8) : i32
%9 = hal.interface.constant.load layout(#pipeline_layout) ordinal(9) : i32
%10 = hal.interface.constant.load layout(#pipeline_layout) ordinal(10) : i32
%11 = hal.interface.constant.load layout(#pipeline_layout) ordinal(11) : i32
%12 = hal.interface.constant.load layout(#pipeline_layout) ordinal(12) : i32
%13 = hal.interface.constant.load layout(#pipeline_layout) ordinal(13) : i32
%14 = hal.interface.constant.load layout(#pipeline_layout) ordinal(14) : i32
%15 = hal.interface.constant.load layout(#pipeline_layout) ordinal(15) : i32
%16 = hal.interface.constant.load layout(#pipeline_layout) ordinal(16) : i32
%17 = hal.interface.constant.load layout(#pipeline_layout) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = flow.dispatch.workload.ordinal %37, 0 : index
%64 = flow.dispatch.workload.ordinal %42, 1 : index
%65 = flow.dispatch.workload.ordinal %47, 2 : index
%66 = flow.dispatch.workload.ordinal %52, 3 : index
%67 = flow.dispatch.workload.ordinal %57, 4 : index
%68 = flow.dispatch.workload.ordinal %62, 5 : index
%69 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64}
%70 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66}
%71 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68}
%72 = flow.dispatch.tensor.load %69, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%63, %64, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64} -> tensor<?x?x8x4x16x2x8xi8>
%73 = flow.dispatch.tensor.load %70, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%65, %66, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66} -> tensor<?x?x4x2x4x16x2x8xi8>
%74 = flow.dispatch.tensor.load %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} -> tensor<?x?x8x4x2x4x16x4xi32>
%75 = iree_gpu.multi_mma %72, %73, %74 {indexing_maps = [#map, #map1, #map2], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32>
flow.dispatch.tensor.store %75, %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68}
return
}
}
}
}
}
// -----// IR Dump After VerifyDevicesPass (iree-hal-verify-devices) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>, ukernels = "none"}>
#map = affine_map<(d0, d1, d2) -> (d0, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d1, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
#pipeline_layout = #hal.pipeline.layout<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", {legacy_sync}, [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @foo_dispatch_6 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @foo_dispatch_6 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1, %arg2, %arg3, %arg4, %arg5, %arg6
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @foo_dispatch_6() {
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32
%1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32
%2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32
%3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : i32
%4 = hal.interface.constant.load layout(#pipeline_layout) ordinal(4) : i32
%5 = hal.interface.constant.load layout(#pipeline_layout) ordinal(5) : i32
%6 = hal.interface.constant.load layout(#pipeline_layout) ordinal(6) : i32
%7 = hal.interface.constant.load layout(#pipeline_layout) ordinal(7) : i32
%8 = hal.interface.constant.load layout(#pipeline_layout) ordinal(8) : i32
%9 = hal.interface.constant.load layout(#pipeline_layout) ordinal(9) : i32
%10 = hal.interface.constant.load layout(#pipeline_layout) ordinal(10) : i32
%11 = hal.interface.constant.load layout(#pipeline_layout) ordinal(11) : i32
%12 = hal.interface.constant.load layout(#pipeline_layout) ordinal(12) : i32
%13 = hal.interface.constant.load layout(#pipeline_layout) ordinal(13) : i32
%14 = hal.interface.constant.load layout(#pipeline_layout) ordinal(14) : i32
%15 = hal.interface.constant.load layout(#pipeline_layout) ordinal(15) : i32
%16 = hal.interface.constant.load layout(#pipeline_layout) ordinal(16) : i32
%17 = hal.interface.constant.load layout(#pipeline_layout) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = flow.dispatch.workload.ordinal %37, 0 : index
%64 = flow.dispatch.workload.ordinal %42, 1 : index
%65 = flow.dispatch.workload.ordinal %47, 2 : index
%66 = flow.dispatch.workload.ordinal %52, 3 : index
%67 = flow.dispatch.workload.ordinal %57, 4 : index
%68 = flow.dispatch.workload.ordinal %62, 5 : index
%69 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64}
%70 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66}
%71 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68}
%72 = flow.dispatch.tensor.load %69, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%63, %64, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64} -> tensor<?x?x8x4x16x2x8xi8>
%73 = flow.dispatch.tensor.load %70, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%65, %66, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66} -> tensor<?x?x4x2x4x16x2x8xi8>
%74 = flow.dispatch.tensor.load %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} -> tensor<?x?x8x4x2x4x16x4xi32>
%75 = iree_gpu.multi_mma %72, %73, %74 {indexing_maps = [#map, #map1, #map2], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32>
flow.dispatch.tensor.store %75, %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68}
return
}
}
}
}
}
// -----// IR Dump After GPUGeneralizeNamedOpsPass (iree-codegen-gpu-generalize-named-ops) //----- //
func.func @foo_dispatch_6() {
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = flow.dispatch.workload.ordinal %37, 0 : index
%64 = flow.dispatch.workload.ordinal %42, 1 : index
%65 = flow.dispatch.workload.ordinal %47, 2 : index
%66 = flow.dispatch.workload.ordinal %52, 3 : index
%67 = flow.dispatch.workload.ordinal %57, 4 : index
%68 = flow.dispatch.workload.ordinal %62, 5 : index
%69 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64}
%70 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66}
%71 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68}
%72 = flow.dispatch.tensor.load %69, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%63, %64, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64} -> tensor<?x?x8x4x16x2x8xi8>
%73 = flow.dispatch.tensor.load %70, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%65, %66, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66} -> tensor<?x?x4x2x4x16x2x8xi8>
%74 = flow.dispatch.tensor.load %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} -> tensor<?x?x8x4x2x4x16x4xi32>
%75 = iree_gpu.multi_mma %72, %73, %74 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32>
flow.dispatch.tensor.store %75, %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68}
return
}
// -----// IR Dump After TypePropagationPass (iree-codegen-type-propagation) //----- //
func.func @foo_dispatch_6() {
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = flow.dispatch.workload.ordinal %37, 0 : index
%64 = flow.dispatch.workload.ordinal %42, 1 : index
%65 = flow.dispatch.workload.ordinal %47, 2 : index
%66 = flow.dispatch.workload.ordinal %52, 3 : index
%67 = flow.dispatch.workload.ordinal %57, 4 : index
%68 = flow.dispatch.workload.ordinal %62, 5 : index
%69 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64}
%70 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66}
%71 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68}
%72 = flow.dispatch.tensor.load %69, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%63, %64, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64} -> tensor<?x?x8x4x16x2x8xi8>
%73 = flow.dispatch.tensor.load %70, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%65, %66, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66} -> tensor<?x?x4x2x4x16x2x8xi8>
%74 = flow.dispatch.tensor.load %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} -> tensor<?x?x8x4x2x4x16x4xi32>
%75 = iree_gpu.multi_mma %72, %73, %74 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32>
flow.dispatch.tensor.store %75, %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68}
return
}
// -----// IR Dump After BubbleUpOrdinalOpsPass (iree-codegen-bubble-up-ordinal-ops) //----- //
func.func @foo_dispatch_6() {
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = flow.dispatch.workload.ordinal %37, 0 : index
%64 = flow.dispatch.workload.ordinal %42, 1 : index
%65 = flow.dispatch.workload.ordinal %47, 2 : index
%66 = flow.dispatch.workload.ordinal %52, 3 : index
%67 = flow.dispatch.workload.ordinal %57, 4 : index
%68 = flow.dispatch.workload.ordinal %62, 5 : index
%69 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64}
%70 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66}
%71 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68}
%72 = flow.dispatch.tensor.load %69, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%63, %64, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64} -> tensor<?x?x8x4x16x2x8xi8>
%73 = flow.dispatch.tensor.load %70, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%65, %66, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66} -> tensor<?x?x4x2x4x16x2x8xi8>
%74 = flow.dispatch.tensor.load %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} -> tensor<?x?x8x4x2x4x16x4xi32>
%75 = iree_gpu.multi_mma %72, %73, %74 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32>
flow.dispatch.tensor.store %75, %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68}
return
}
// -----// IR Dump After BufferizeCopyOnlyDispatchesPass (iree-codegen-bufferize-copy-only-dispatches) //----- //
func.func @foo_dispatch_6() {
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = flow.dispatch.workload.ordinal %37, 0 : index
%64 = flow.dispatch.workload.ordinal %42, 1 : index
%65 = flow.dispatch.workload.ordinal %47, 2 : index
%66 = flow.dispatch.workload.ordinal %52, 3 : index
%67 = flow.dispatch.workload.ordinal %57, 4 : index
%68 = flow.dispatch.workload.ordinal %62, 5 : index
%69 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64}
%70 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66}
%71 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68}
%72 = flow.dispatch.tensor.load %69, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%63, %64, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64} -> tensor<?x?x8x4x16x2x8xi8>
%73 = flow.dispatch.tensor.load %70, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%65, %66, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66} -> tensor<?x?x4x2x4x16x2x8xi8>
%74 = flow.dispatch.tensor.load %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} -> tensor<?x?x8x4x2x4x16x4xi32>
%75 = iree_gpu.multi_mma %72, %73, %74 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32>
flow.dispatch.tensor.store %75, %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68}
return
}
// -----// IR Dump After DecomposeSoftmaxPass (iree-codegen-decompose-softmax) //----- //
func.func @foo_dispatch_6() {
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = flow.dispatch.workload.ordinal %37, 0 : index
%64 = flow.dispatch.workload.ordinal %42, 1 : index
%65 = flow.dispatch.workload.ordinal %47, 2 : index
%66 = flow.dispatch.workload.ordinal %52, 3 : index
%67 = flow.dispatch.workload.ordinal %57, 4 : index
%68 = flow.dispatch.workload.ordinal %62, 5 : index
%69 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64}
%70 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66}
%71 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68}
%72 = flow.dispatch.tensor.load %69, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%63, %64, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64} -> tensor<?x?x8x4x16x2x8xi8>
%73 = flow.dispatch.tensor.load %70, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%65, %66, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66} -> tensor<?x?x4x2x4x16x2x8xi8>
%74 = flow.dispatch.tensor.load %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} -> tensor<?x?x8x4x2x4x16x4xi32>
%75 = iree_gpu.multi_mma %72, %73, %74 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32>
flow.dispatch.tensor.store %75, %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68}
return
}
// -----// IR Dump After MaterializeEncodingIntoNopPass (iree-codegen-materialize-encoding-into-nop) //----- //
func.func @foo_dispatch_6() {
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = flow.dispatch.workload.ordinal %37, 0 : index
%64 = flow.dispatch.workload.ordinal %42, 1 : index
%65 = flow.dispatch.workload.ordinal %47, 2 : index
%66 = flow.dispatch.workload.ordinal %52, 3 : index
%67 = flow.dispatch.workload.ordinal %57, 4 : index
%68 = flow.dispatch.workload.ordinal %62, 5 : index
%69 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64}
%70 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66}
%71 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68}
%72 = flow.dispatch.tensor.load %69, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%63, %64, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64} -> tensor<?x?x8x4x16x2x8xi8>
%73 = flow.dispatch.tensor.load %70, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%65, %66, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66} -> tensor<?x?x4x2x4x16x2x8xi8>
%74 = flow.dispatch.tensor.load %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} -> tensor<?x?x8x4x2x4x16x4xi32>
%75 = iree_gpu.multi_mma %72, %73, %74 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32>
flow.dispatch.tensor.store %75, %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68}
return
}
// -----// IR Dump After BufferizeCopyOnlyDispatchesPass (iree-codegen-bufferize-copy-only-dispatches) //----- //
func.func @foo_dispatch_6() {
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = flow.dispatch.workload.ordinal %37, 0 : index
%64 = flow.dispatch.workload.ordinal %42, 1 : index
%65 = flow.dispatch.workload.ordinal %47, 2 : index
%66 = flow.dispatch.workload.ordinal %52, 3 : index
%67 = flow.dispatch.workload.ordinal %57, 4 : index
%68 = flow.dispatch.workload.ordinal %62, 5 : index
%69 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64}
%70 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66}
%71 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68}
%72 = flow.dispatch.tensor.load %69, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%63, %64, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64} -> tensor<?x?x8x4x16x2x8xi8>
%73 = flow.dispatch.tensor.load %70, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%65, %66, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66} -> tensor<?x?x4x2x4x16x2x8xi8>
%74 = flow.dispatch.tensor.load %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} -> tensor<?x?x8x4x2x4x16x4xi32>
%75 = iree_gpu.multi_mma %72, %73, %74 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32>
flow.dispatch.tensor.store %75, %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68}
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @foo_dispatch_6() {
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = flow.dispatch.workload.ordinal %37, 0 : index
%64 = flow.dispatch.workload.ordinal %42, 1 : index
%65 = flow.dispatch.workload.ordinal %47, 2 : index
%66 = flow.dispatch.workload.ordinal %52, 3 : index
%67 = flow.dispatch.workload.ordinal %57, 4 : index
%68 = flow.dispatch.workload.ordinal %62, 5 : index
%69 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64}
%70 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66}
%71 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68}
%72 = flow.dispatch.tensor.load %69, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%63, %64, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64} -> tensor<?x?x8x4x16x2x8xi8>
%73 = flow.dispatch.tensor.load %70, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%65, %66, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66} -> tensor<?x?x4x2x4x16x2x8xi8>
%74 = flow.dispatch.tensor.load %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} -> tensor<?x?x8x4x2x4x16x4xi32>
%75 = iree_gpu.multi_mma %72, %73, %74 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32>
flow.dispatch.tensor.store %75, %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68}
return
}
// -----// IR Dump After MaterializeUserConfigsPass (iree-codegen-materialize-user-configs) //----- //
module {
func.func @foo_dispatch_6() {
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = flow.dispatch.workload.ordinal %37, 0 : index
%64 = flow.dispatch.workload.ordinal %42, 1 : index
%65 = flow.dispatch.workload.ordinal %47, 2 : index
%66 = flow.dispatch.workload.ordinal %52, 3 : index
%67 = flow.dispatch.workload.ordinal %57, 4 : index
%68 = flow.dispatch.workload.ordinal %62, 5 : index
%69 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64}
%70 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66}
%71 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68}
%72 = flow.dispatch.tensor.load %69, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%63, %64, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64} -> tensor<?x?x8x4x16x2x8xi8>
%73 = flow.dispatch.tensor.load %70, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%65, %66, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66} -> tensor<?x?x4x2x4x16x2x8xi8>
%74 = flow.dispatch.tensor.load %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} -> tensor<?x?x8x4x2x4x16x4xi32>
%75 = iree_gpu.multi_mma %72, %73, %74 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32>
flow.dispatch.tensor.store %75, %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68}
return
}
}
// -----// IR Dump After LLVMGPUSelectLoweringStrategyPass (iree-llvmgpu-select-lowering-strategy) //----- //
module {
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = flow.dispatch.workload.ordinal %37, 0 : index
%64 = flow.dispatch.workload.ordinal %42, 1 : index
%65 = flow.dispatch.workload.ordinal %47, 2 : index
%66 = flow.dispatch.workload.ordinal %52, 3 : index
%67 = flow.dispatch.workload.ordinal %57, 4 : index
%68 = flow.dispatch.workload.ordinal %62, 5 : index
%69 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64}
%70 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66}
%71 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68}
%72 = flow.dispatch.tensor.load %69, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%63, %64, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64} -> tensor<?x?x8x4x16x2x8xi8>
%73 = flow.dispatch.tensor.load %70, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%65, %66, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66} -> tensor<?x?x4x2x4x16x2x8xi8>
%74 = flow.dispatch.tensor.load %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} -> tensor<?x?x8x4x2x4x16x4xi32>
%75 = iree_gpu.multi_mma %72, %73, %74 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 1], workgroup = [1, 1, 0]}>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32>
flow.dispatch.tensor.store %75, %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68}
return
}
}
// -----// IR Dump After ConfigureTargetExecutableVariantsPass (iree-hal-configure-target-executable-variants) //----- //
hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>, ukernels = "none"}>) {
hal.executable.export public @foo_dispatch_6 ordinal(0) layout(#hal.pipeline.layout<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1, %arg2, %arg3, %arg4, %arg5, %arg6
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = flow.dispatch.workload.ordinal %37, 0 : index
%64 = flow.dispatch.workload.ordinal %42, 1 : index
%65 = flow.dispatch.workload.ordinal %47, 2 : index
%66 = flow.dispatch.workload.ordinal %52, 3 : index
%67 = flow.dispatch.workload.ordinal %57, 4 : index
%68 = flow.dispatch.workload.ordinal %62, 5 : index
%69 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64}
%70 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66}
%71 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68}
%72 = flow.dispatch.tensor.load %69, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%63, %64, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64} -> tensor<?x?x8x4x16x2x8xi8>
%73 = flow.dispatch.tensor.load %70, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%65, %66, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66} -> tensor<?x?x4x2x4x16x2x8xi8>
%74 = flow.dispatch.tensor.load %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} -> tensor<?x?x8x4x2x4x16x4xi32>
%75 = iree_gpu.multi_mma %72, %73, %74 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 1], workgroup = [1, 1, 0]}>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32>
flow.dispatch.tensor.store %75, %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68}
return
}
}
}
// -----// IR Dump After ConfigureExecutablesPass (iree-hal-configure-executables) //----- //
hal.executable public @foo_dispatch_6 {
hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>, ukernels = "none"}>) {
hal.executable.export public @foo_dispatch_6 ordinal(0) layout(#hal.pipeline.layout<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1, %arg2, %arg3, %arg4, %arg5, %arg6
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = flow.dispatch.workload.ordinal %37, 0 : index
%64 = flow.dispatch.workload.ordinal %42, 1 : index
%65 = flow.dispatch.workload.ordinal %47, 2 : index
%66 = flow.dispatch.workload.ordinal %52, 3 : index
%67 = flow.dispatch.workload.ordinal %57, 4 : index
%68 = flow.dispatch.workload.ordinal %62, 5 : index
%69 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64}
%70 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66}
%71 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68}
%72 = flow.dispatch.tensor.load %69, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%63, %64, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64} -> tensor<?x?x8x4x16x2x8xi8>
%73 = flow.dispatch.tensor.load %70, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%65, %66, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66} -> tensor<?x?x4x2x4x16x2x8xi8>
%74 = flow.dispatch.tensor.load %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} -> tensor<?x?x8x4x2x4x16x4xi32>
%75 = iree_gpu.multi_mma %72, %73, %74 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 1], workgroup = [1, 1, 0]}>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32>
flow.dispatch.tensor.store %75, %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68}
return
}
}
}
}
// -----// IR Dump After LowerExecutableUsingTransformDialectPass (iree-codegen-lower-executable-using-transform-dialect) //----- //
module {
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = flow.dispatch.workload.ordinal %37, 0 : index
%64 = flow.dispatch.workload.ordinal %42, 1 : index
%65 = flow.dispatch.workload.ordinal %47, 2 : index
%66 = flow.dispatch.workload.ordinal %52, 3 : index
%67 = flow.dispatch.workload.ordinal %57, 4 : index
%68 = flow.dispatch.workload.ordinal %62, 5 : index
%69 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64}
%70 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66}
%71 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68}
%72 = flow.dispatch.tensor.load %69, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%63, %64, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64} -> tensor<?x?x8x4x16x2x8xi8>
%73 = flow.dispatch.tensor.load %70, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%65, %66, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66} -> tensor<?x?x4x2x4x16x2x8xi8>
%74 = flow.dispatch.tensor.load %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} -> tensor<?x?x8x4x2x4x16x4xi32>
%75 = iree_gpu.multi_mma %72, %73, %74 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 1], workgroup = [1, 1, 0]}>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32>
flow.dispatch.tensor.store %75, %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68}
return
}
}
// -----// IR Dump After TileAndDistributeToWorkgroupsPass (iree-codegen-tile-and-distribute-to-workgroups) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = iree_gpu.multi_mma %66, %67, %68 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 1], workgroup = [1, 1, 0]}>} : tensor<1x?x8x4x16x2x8xi8>, tensor<1x?x4x2x4x16x2x8xi8> into tensor<1x1x8x4x2x4x16x4xi32>
%70 = arith.extui %14 : i32 to i64
%71 = arith.extui %15 : i32 to i64
%72 = arith.shli %71, %c32_i64 : i64
%73 = arith.ori %70, %72 : i64
%74 = arith.index_castui %73 : i64 to index
%75 = arith.extui %16 : i32 to i64
%76 = arith.extui %17 : i32 to i64
%77 = arith.shli %76, %c32_i64 : i64
%78 = arith.ori %75, %77 : i64
%79 = arith.index_castui %78 : i64 to index
flow.dispatch.tensor.store %69, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%74, %79}
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = iree_gpu.multi_mma %66, %67, %68 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 1], workgroup = [1, 1, 0]}>} : tensor<1x?x8x4x16x2x8xi8>, tensor<1x?x4x2x4x16x2x8xi8> into tensor<1x1x8x4x2x4x16x4xi32>
flow.dispatch.tensor.store %69, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = iree_gpu.multi_mma %66, %67, %68 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 1], workgroup = [1, 1, 0]}>} : tensor<1x?x8x4x16x2x8xi8>, tensor<1x?x4x2x4x16x2x8xi8> into tensor<1x1x8x4x2x4x16x4xi32>
flow.dispatch.tensor.store %69, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = iree_gpu.multi_mma %66, %67, %68 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 1], workgroup = [1, 1, 0]}>} : tensor<1x?x8x4x16x2x8xi8>, tensor<1x?x4x2x4x16x2x8xi8> into tensor<1x1x8x4x2x4x16x4xi32>
flow.dispatch.tensor.store %69, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After GPUPromoteMatmulOperandsPass (iree-codegen-gpu-promote-matmul-operands) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%c1 = arith.constant 1 : index
%dim = tensor.dim %66, %c1 : tensor<1x?x8x4x16x2x8xi8>
%69 = tensor.empty(%dim) : tensor<1x?x8x4x16x2x8xi8>
%70 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%66 : tensor<1x?x8x4x16x2x8xi8>) outs(%69 : tensor<1x?x8x4x16x2x8xi8>) -> tensor<1x?x8x4x16x2x8xi8>
%c1_0 = arith.constant 1 : index
%dim_1 = tensor.dim %67, %c1_0 : tensor<1x?x4x2x4x16x2x8xi8>
%71 = tensor.empty(%dim_1) : tensor<1x?x4x2x4x16x2x8xi8>
%72 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%67 : tensor<1x?x4x2x4x16x2x8xi8>) outs(%71 : tensor<1x?x4x2x4x16x2x8xi8>) -> tensor<1x?x4x2x4x16x2x8xi8>
%73 = iree_gpu.multi_mma %70, %72, %68 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 1], workgroup = [1, 1, 0]}>} : tensor<1x?x8x4x16x2x8xi8>, tensor<1x?x4x2x4x16x2x8xi8> into tensor<1x1x8x4x2x4x16x4xi32>
flow.dispatch.tensor.store %73, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After PackToIntrinsicsPass (iree-gpu-pack-to-intrinsics) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%dim = tensor.dim %66, %c1 : tensor<1x?x8x4x16x2x8xi8>
%69 = tensor.empty(%dim) : tensor<1x?x8x4x16x2x8xi8>
%70 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%66 : tensor<1x?x8x4x16x2x8xi8>) outs(%69 : tensor<1x?x8x4x16x2x8xi8>) -> tensor<1x?x8x4x16x2x8xi8>
%dim_0 = tensor.dim %67, %c1 : tensor<1x?x4x2x4x16x2x8xi8>
%71 = tensor.empty(%dim_0) : tensor<1x?x4x2x4x16x2x8xi8>
%72 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%67 : tensor<1x?x4x2x4x16x2x8xi8>) outs(%71 : tensor<1x?x4x2x4x16x2x8xi8>) -> tensor<1x?x4x2x4x16x2x8xi8>
%73 = iree_gpu.multi_mma %70, %72, %68 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 1], workgroup = [1, 1, 0]}>} : tensor<1x?x8x4x16x2x8xi8>, tensor<1x?x4x2x4x16x2x8xi8> into tensor<1x1x8x4x2x4x16x4xi32>
flow.dispatch.tensor.store %73, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After ConcretizeMmaShapesPass (iree-gpu-concretize-mma-shapes) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%dim = tensor.dim %66, %c1 : tensor<1x?x8x4x16x2x8xi8>
%69 = tensor.empty(%dim) : tensor<1x?x8x4x16x2x8xi8>
%70 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%66 : tensor<1x?x8x4x16x2x8xi8>) outs(%69 : tensor<1x?x8x4x16x2x8xi8>) -> tensor<1x?x8x4x16x2x8xi8>
%dim_0 = tensor.dim %67, %c1 : tensor<1x?x4x2x4x16x2x8xi8>
%71 = tensor.empty(%dim_0) : tensor<1x?x4x2x4x16x2x8xi8>
%72 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%67 : tensor<1x?x4x2x4x16x2x8xi8>) outs(%71 : tensor<1x?x4x2x4x16x2x8xi8>) -> tensor<1x?x4x2x4x16x2x8xi8>
%73 = iree_gpu.multi_mma %70, %72, %68 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 1], workgroup = [1, 1, 0]}>} : tensor<1x?x8x4x16x2x8xi8>, tensor<1x?x4x2x4x16x2x8xi8> into tensor<1x1x8x4x2x4x16x4xi32>
flow.dispatch.tensor.store %73, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After GPUApplyTilingLevelPass (iree-codegen-gpu-apply-tiling-level) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%dim = tensor.dim %66, %c1 : tensor<1x?x8x4x16x2x8xi8>
%69 = tensor.empty(%dim) : tensor<1x?x8x4x16x2x8xi8>
%70 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%66 : tensor<1x?x8x4x16x2x8xi8>) outs(%69 : tensor<1x?x8x4x16x2x8xi8>) -> tensor<1x?x8x4x16x2x8xi8>
%dim_0 = tensor.dim %70, %c1 : tensor<1x?x8x4x16x2x8xi8>
%71 = scf.for %arg0 = %c0 to %dim_0 step %c1 iter_args(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%extracted_slice = tensor.extract_slice %66[0, %arg0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x8x4x16x2x8xi8>
%72 = tensor.empty() : tensor<1x1x8x4x16x2x8xi8>
%73 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x8x4x16x2x8xi8>) outs(%72 : tensor<1x1x8x4x16x2x8xi8>) -> tensor<1x1x8x4x16x2x8xi8>
%extracted_slice_1 = tensor.extract_slice %67[0, %arg0, 0, 0, 0, 0, 0, 0] [1, 1, 4, 2, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x4x2x4x16x2x8xi8>
%74 = tensor.empty() : tensor<1x1x4x2x4x16x2x8xi8>
%75 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_1 : tensor<1x1x4x2x4x16x2x8xi8>) outs(%74 : tensor<1x1x4x2x4x16x2x8xi8>) -> tensor<1x1x4x2x4x16x2x8xi8>
%76 = iree_gpu.multi_mma %73, %75, %arg1 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 1], workgroup = [1, 1, 0]}>} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8> into tensor<1x1x8x4x2x4x16x4xi32>
scf.yield %76 : tensor<1x1x8x4x2x4x16x4xi32>
}
flow.dispatch.tensor.store %71, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = scf.for %arg0 = %c0 to %42 step %c1 iter_args(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%extracted_slice = tensor.extract_slice %66[0, %arg0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x8x4x16x2x8xi8>
%70 = tensor.empty() : tensor<1x1x8x4x16x2x8xi8>
%71 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x8x4x16x2x8xi8>) outs(%70 : tensor<1x1x8x4x16x2x8xi8>) -> tensor<1x1x8x4x16x2x8xi8>
%extracted_slice_0 = tensor.extract_slice %67[0, %arg0, 0, 0, 0, 0, 0, 0] [1, 1, 4, 2, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x4x2x4x16x2x8xi8>
%72 = tensor.empty() : tensor<1x1x4x2x4x16x2x8xi8>
%73 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_0 : tensor<1x1x4x2x4x16x2x8xi8>) outs(%72 : tensor<1x1x4x2x4x16x2x8xi8>) -> tensor<1x1x4x2x4x16x2x8xi8>
%74 = iree_gpu.multi_mma %71, %73, %arg1 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 1], workgroup = [1, 1, 0]}>} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8> into tensor<1x1x8x4x2x4x16x4xi32>
scf.yield %74 : tensor<1x1x8x4x2x4x16x4xi32>
}
flow.dispatch.tensor.store %69, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = scf.for %arg0 = %c0 to %42 step %c1 iter_args(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%extracted_slice = tensor.extract_slice %66[0, %arg0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x8x4x16x2x8xi8>
%70 = tensor.empty() : tensor<1x1x8x4x16x2x8xi8>
%71 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x8x4x16x2x8xi8>) outs(%70 : tensor<1x1x8x4x16x2x8xi8>) -> tensor<1x1x8x4x16x2x8xi8>
%extracted_slice_0 = tensor.extract_slice %67[0, %arg0, 0, 0, 0, 0, 0, 0] [1, 1, 4, 2, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x4x2x4x16x2x8xi8>
%72 = tensor.empty() : tensor<1x1x4x2x4x16x2x8xi8>
%73 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_0 : tensor<1x1x4x2x4x16x2x8xi8>) outs(%72 : tensor<1x1x4x2x4x16x2x8xi8>) -> tensor<1x1x4x2x4x16x2x8xi8>
%74 = iree_gpu.multi_mma %71, %73, %arg1 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 1], workgroup = [1, 1, 0]}>} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8> into tensor<1x1x8x4x2x4x16x4xi32>
scf.yield %74 : tensor<1x1x8x4x2x4x16x4xi32>
}
flow.dispatch.tensor.store %69, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After DecomposePackUnPackOpsPass (iree-codegen-decompose-pack-unpack-ops) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = scf.for %arg0 = %c0 to %42 step %c1 iter_args(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%extracted_slice = tensor.extract_slice %66[0, %arg0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x8x4x16x2x8xi8>
%70 = tensor.empty() : tensor<1x1x8x4x16x2x8xi8>
%71 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x8x4x16x2x8xi8>) outs(%70 : tensor<1x1x8x4x16x2x8xi8>) -> tensor<1x1x8x4x16x2x8xi8>
%extracted_slice_0 = tensor.extract_slice %67[0, %arg0, 0, 0, 0, 0, 0, 0] [1, 1, 4, 2, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x4x2x4x16x2x8xi8>
%72 = tensor.empty() : tensor<1x1x4x2x4x16x2x8xi8>
%73 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_0 : tensor<1x1x4x2x4x16x2x8xi8>) outs(%72 : tensor<1x1x4x2x4x16x2x8xi8>) -> tensor<1x1x4x2x4x16x2x8xi8>
%74 = iree_gpu.multi_mma %71, %73, %arg1 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 1], workgroup = [1, 1, 0]}>} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8> into tensor<1x1x8x4x2x4x16x4xi32>
scf.yield %74 : tensor<1x1x8x4x2x4x16x4xi32>
}
flow.dispatch.tensor.store %69, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After ConcretizeMmaShapesPass (iree-gpu-concretize-mma-shapes) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = scf.for %arg0 = %c0 to %42 step %c1 iter_args(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%extracted_slice = tensor.extract_slice %66[0, %arg0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x8x4x16x2x8xi8>
%70 = tensor.empty() : tensor<1x1x8x4x16x2x8xi8>
%71 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x8x4x16x2x8xi8>) outs(%70 : tensor<1x1x8x4x16x2x8xi8>) -> tensor<1x1x8x4x16x2x8xi8>
%extracted_slice_0 = tensor.extract_slice %67[0, %arg0, 0, 0, 0, 0, 0, 0] [1, 1, 4, 2, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x4x2x4x16x2x8xi8>
%72 = tensor.empty() : tensor<1x1x4x2x4x16x2x8xi8>
%73 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_0 : tensor<1x1x4x2x4x16x2x8xi8>) outs(%72 : tensor<1x1x4x2x4x16x2x8xi8>) -> tensor<1x1x4x2x4x16x2x8xi8>
%74 = iree_gpu.multi_mma %71, %73, %arg1 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 1], workgroup = [1, 1, 0]}>} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8> into tensor<1x1x8x4x2x4x16x4xi32>
scf.yield %74 : tensor<1x1x8x4x2x4x16x4xi32>
}
flow.dispatch.tensor.store %69, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After PropagateReshapesByExpansionPass (iree-codegen-propagate-reshapes-by-expansion) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = scf.for %arg0 = %c0 to %42 step %c1 iter_args(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%extracted_slice = tensor.extract_slice %66[0, %arg0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x8x4x16x2x8xi8>
%70 = tensor.empty() : tensor<1x1x8x4x16x2x8xi8>
%71 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x8x4x16x2x8xi8>) outs(%70 : tensor<1x1x8x4x16x2x8xi8>) -> tensor<1x1x8x4x16x2x8xi8>
%extracted_slice_0 = tensor.extract_slice %67[0, %arg0, 0, 0, 0, 0, 0, 0] [1, 1, 4, 2, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x4x2x4x16x2x8xi8>
%72 = tensor.empty() : tensor<1x1x4x2x4x16x2x8xi8>
%73 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_0 : tensor<1x1x4x2x4x16x2x8xi8>) outs(%72 : tensor<1x1x4x2x4x16x2x8xi8>) -> tensor<1x1x4x2x4x16x2x8xi8>
%74 = iree_gpu.multi_mma %71, %73, %arg1 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 1], workgroup = [1, 1, 0]}>} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8> into tensor<1x1x8x4x2x4x16x4xi32>
scf.yield %74 : tensor<1x1x8x4x2x4x16x4xi32>
}
flow.dispatch.tensor.store %69, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = scf.for %arg0 = %c0 to %42 step %c1 iter_args(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%extracted_slice = tensor.extract_slice %66[0, %arg0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x8x4x16x2x8xi8>
%70 = tensor.empty() : tensor<1x1x8x4x16x2x8xi8>
%71 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x8x4x16x2x8xi8>) outs(%70 : tensor<1x1x8x4x16x2x8xi8>) -> tensor<1x1x8x4x16x2x8xi8>
%extracted_slice_0 = tensor.extract_slice %67[0, %arg0, 0, 0, 0, 0, 0, 0] [1, 1, 4, 2, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x4x2x4x16x2x8xi8>
%72 = tensor.empty() : tensor<1x1x4x2x4x16x2x8xi8>
%73 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_0 : tensor<1x1x4x2x4x16x2x8xi8>) outs(%72 : tensor<1x1x4x2x4x16x2x8xi8>) -> tensor<1x1x4x2x4x16x2x8xi8>
%74 = iree_gpu.multi_mma %71, %73, %arg1 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 1], workgroup = [1, 1, 0]}>} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8> into tensor<1x1x8x4x2x4x16x4xi32>
scf.yield %74 : tensor<1x1x8x4x2x4x16x4xi32>
}
flow.dispatch.tensor.store %69, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = scf.for %arg0 = %c0 to %42 step %c1 iter_args(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%extracted_slice = tensor.extract_slice %66[0, %arg0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x8x4x16x2x8xi8>
%70 = tensor.empty() : tensor<1x1x8x4x16x2x8xi8>
%71 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x8x4x16x2x8xi8>) outs(%70 : tensor<1x1x8x4x16x2x8xi8>) -> tensor<1x1x8x4x16x2x8xi8>
%extracted_slice_0 = tensor.extract_slice %67[0, %arg0, 0, 0, 0, 0, 0, 0] [1, 1, 4, 2, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x4x2x4x16x2x8xi8>
%72 = tensor.empty() : tensor<1x1x4x2x4x16x2x8xi8>
%73 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_0 : tensor<1x1x4x2x4x16x2x8xi8>) outs(%72 : tensor<1x1x4x2x4x16x2x8xi8>) -> tensor<1x1x4x2x4x16x2x8xi8>
%74 = iree_gpu.multi_mma %71, %73, %arg1 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 1], workgroup = [1, 1, 0]}>} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8> into tensor<1x1x8x4x2x4x16x4xi32>
scf.yield %74 : tensor<1x1x8x4x2x4x16x4xi32>
}
flow.dispatch.tensor.store %69, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After ConvertToDestinationPassingStylePass (iree-codegen-convert-to-destination-passing-style) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = scf.for %arg0 = %c0 to %42 step %c1 iter_args(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%extracted_slice = tensor.extract_slice %66[0, %arg0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x8x4x16x2x8xi8>
%70 = tensor.empty() : tensor<1x1x8x4x16x2x8xi8>
%71 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x8x4x16x2x8xi8>) outs(%70 : tensor<1x1x8x4x16x2x8xi8>) -> tensor<1x1x8x4x16x2x8xi8>
%extracted_slice_0 = tensor.extract_slice %67[0, %arg0, 0, 0, 0, 0, 0, 0] [1, 1, 4, 2, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x4x2x4x16x2x8xi8>
%72 = tensor.empty() : tensor<1x1x4x2x4x16x2x8xi8>
%73 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_0 : tensor<1x1x4x2x4x16x2x8xi8>) outs(%72 : tensor<1x1x4x2x4x16x2x8xi8>) -> tensor<1x1x4x2x4x16x2x8xi8>
%74 = iree_gpu.multi_mma %71, %73, %arg1 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 1], workgroup = [1, 1, 0]}>} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8> into tensor<1x1x8x4x2x4x16x4xi32>
scf.yield %74 : tensor<1x1x8x4x2x4x16x4xi32>
}
flow.dispatch.tensor.store %69, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After GPUApplyTilingLevelPass (iree-codegen-gpu-apply-tiling-level) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = scf.for %arg0 = %c0 to %42 step %c1 iter_args(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%70 = tensor.empty() : tensor<1x1x8x4x16x2x8xi8>
%71 = scf.forall (%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8) = (0, 0, 0, 0, 0, 0, 0) to (1, 1, 8, 4, 16, 2, 8) step (1, 1, 1, 1, 1, 2, 8) shared_outs(%arg9 = %70) -> (tensor<1x1x8x4x16x2x8xi8>) {
%75 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg3, %arg0]
%extracted_slice = tensor.extract_slice %66[%arg2, %75, %arg4, %arg5, %arg6, %arg7, %arg8] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%extracted_slice_0 = tensor.extract_slice %arg9[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%76 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x1x1x2x8xi8>) outs(%extracted_slice_0 : tensor<1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x2x8xi8>
scf.forall.in_parallel {
tensor.parallel_insert_slice %76 into %arg9[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8>
}
} {mapping = [#gpu.thread<linear_dim_6>, #gpu.thread<linear_dim_5>, #gpu.thread<linear_dim_4>, #gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%72 = tensor.empty() : tensor<1x1x4x2x4x16x2x8xi8>
%73 = scf.forall (%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9) = (0, 0, 0, 0, 0, 0, 0, 0) to (1, 1, 4, 2, 4, 16, 2, 8) step (1, 1, 1, 1, 1, 1, 2, 8) shared_outs(%arg10 = %72) -> (tensor<1x1x4x2x4x16x2x8xi8>) {
%75 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg3, %arg0]
%extracted_slice = tensor.extract_slice %67[%arg2, %75, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%extracted_slice_0 = tensor.extract_slice %arg10[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%76 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x1x1x1x2x8xi8>) outs(%extracted_slice_0 : tensor<1x1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x1x2x8xi8>
scf.forall.in_parallel {
tensor.parallel_insert_slice %76 into %arg10[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8>
}
} {mapping = [#gpu.thread<linear_dim_7>, #gpu.thread<linear_dim_6>, #gpu.thread<linear_dim_5>, #gpu.thread<linear_dim_4>, #gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%74 = iree_gpu.multi_mma %71, %73, %arg1 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 1], workgroup = [1, 1, 0]}>} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8> into tensor<1x1x8x4x2x4x16x4xi32>
scf.yield %74 : tensor<1x1x8x4x2x4x16x4xi32>
}
flow.dispatch.tensor.store %69, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = scf.for %arg0 = %c0 to %42 step %c1 iter_args(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%70 = tensor.empty() : tensor<1x1x8x4x16x2x8xi8>
%71 = scf.forall (%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8) = (0, 0, 0, 0, 0, 0, 0) to (1, 1, 8, 4, 16, 2, 8) step (1, 1, 1, 1, 1, 2, 8) shared_outs(%arg9 = %70) -> (tensor<1x1x8x4x16x2x8xi8>) {
%75 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg3, %arg0]
%extracted_slice = tensor.extract_slice %66[%arg2, %75, %arg4, %arg5, %arg6, %arg7, %arg8] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%extracted_slice_0 = tensor.extract_slice %arg9[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%76 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x1x1x2x8xi8>) outs(%extracted_slice_0 : tensor<1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x2x8xi8>
scf.forall.in_parallel {
tensor.parallel_insert_slice %76 into %arg9[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8>
}
} {mapping = [#gpu.thread<linear_dim_6>, #gpu.thread<linear_dim_5>, #gpu.thread<linear_dim_4>, #gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%72 = tensor.empty() : tensor<1x1x4x2x4x16x2x8xi8>
%73 = scf.forall (%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9) = (0, 0, 0, 0, 0, 0, 0, 0) to (1, 1, 4, 2, 4, 16, 2, 8) step (1, 1, 1, 1, 1, 1, 2, 8) shared_outs(%arg10 = %72) -> (tensor<1x1x4x2x4x16x2x8xi8>) {
%75 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg3, %arg0]
%extracted_slice = tensor.extract_slice %67[%arg2, %75, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%extracted_slice_0 = tensor.extract_slice %arg10[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%76 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x1x1x1x2x8xi8>) outs(%extracted_slice_0 : tensor<1x1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x1x2x8xi8>
scf.forall.in_parallel {
tensor.parallel_insert_slice %76 into %arg10[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8>
}
} {mapping = [#gpu.thread<linear_dim_7>, #gpu.thread<linear_dim_6>, #gpu.thread<linear_dim_5>, #gpu.thread<linear_dim_4>, #gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%74 = iree_gpu.multi_mma %71, %73, %arg1 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 1], workgroup = [1, 1, 0]}>} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8> into tensor<1x1x8x4x2x4x16x4xi32>
scf.yield %74 : tensor<1x1x8x4x2x4x16x4xi32>
}
flow.dispatch.tensor.store %69, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = scf.for %arg0 = %c0 to %42 step %c1 iter_args(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%70 = tensor.empty() : tensor<1x1x8x4x16x2x8xi8>
%71 = scf.forall (%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8) = (0, 0, 0, 0, 0, 0, 0) to (1, 1, 8, 4, 16, 2, 8) step (1, 1, 1, 1, 1, 2, 8) shared_outs(%arg9 = %70) -> (tensor<1x1x8x4x16x2x8xi8>) {
%75 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg3, %arg0]
%extracted_slice = tensor.extract_slice %66[%arg2, %75, %arg4, %arg5, %arg6, %arg7, %arg8] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%extracted_slice_0 = tensor.extract_slice %arg9[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%76 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x1x1x2x8xi8>) outs(%extracted_slice_0 : tensor<1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x2x8xi8>
scf.forall.in_parallel {
tensor.parallel_insert_slice %76 into %arg9[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8>
}
} {mapping = [#gpu.thread<linear_dim_6>, #gpu.thread<linear_dim_5>, #gpu.thread<linear_dim_4>, #gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%72 = tensor.empty() : tensor<1x1x4x2x4x16x2x8xi8>
%73 = scf.forall (%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9) = (0, 0, 0, 0, 0, 0, 0, 0) to (1, 1, 4, 2, 4, 16, 2, 8) step (1, 1, 1, 1, 1, 1, 2, 8) shared_outs(%arg10 = %72) -> (tensor<1x1x4x2x4x16x2x8xi8>) {
%75 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg3, %arg0]
%extracted_slice = tensor.extract_slice %67[%arg2, %75, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%extracted_slice_0 = tensor.extract_slice %arg10[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%76 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x1x1x1x2x8xi8>) outs(%extracted_slice_0 : tensor<1x1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x1x2x8xi8>
scf.forall.in_parallel {
tensor.parallel_insert_slice %76 into %arg10[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8>
}
} {mapping = [#gpu.thread<linear_dim_7>, #gpu.thread<linear_dim_6>, #gpu.thread<linear_dim_5>, #gpu.thread<linear_dim_4>, #gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%74 = iree_gpu.multi_mma %71, %73, %arg1 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 1], workgroup = [1, 1, 0]}>} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8> into tensor<1x1x8x4x2x4x16x4xi32>
scf.yield %74 : tensor<1x1x8x4x2x4x16x4xi32>
}
flow.dispatch.tensor.store %69, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After GPUApplyTilingLevelPass (iree-codegen-gpu-apply-tiling-level) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = scf.for %arg0 = %c0 to %42 step %c1 iter_args(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%70 = tensor.empty() : tensor<1x1x8x4x16x2x8xi8>
%71 = scf.forall (%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8) = (0, 0, 0, 0, 0, 0, 0) to (1, 1, 8, 4, 16, 2, 8) step (1, 1, 1, 1, 1, 2, 8) shared_outs(%arg9 = %70) -> (tensor<1x1x8x4x16x2x8xi8>) {
%75 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg3, %arg0]
%extracted_slice = tensor.extract_slice %66[%arg2, %75, %arg4, %arg5, %arg6, %arg7, %arg8] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%extracted_slice_0 = tensor.extract_slice %arg9[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%76 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x1x1x2x8xi8>) outs(%extracted_slice_0 : tensor<1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x2x8xi8>
scf.forall.in_parallel {
tensor.parallel_insert_slice %76 into %arg9[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8>
}
} {mapping = [#gpu.thread<linear_dim_6>, #gpu.thread<linear_dim_5>, #gpu.thread<linear_dim_4>, #gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%72 = tensor.empty() : tensor<1x1x4x2x4x16x2x8xi8>
%73 = scf.forall (%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9) = (0, 0, 0, 0, 0, 0, 0, 0) to (1, 1, 4, 2, 4, 16, 2, 8) step (1, 1, 1, 1, 1, 1, 2, 8) shared_outs(%arg10 = %72) -> (tensor<1x1x4x2x4x16x2x8xi8>) {
%75 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg3, %arg0]
%extracted_slice = tensor.extract_slice %67[%arg2, %75, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%extracted_slice_0 = tensor.extract_slice %arg10[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%76 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x1x1x1x2x8xi8>) outs(%extracted_slice_0 : tensor<1x1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x1x2x8xi8>
scf.forall.in_parallel {
tensor.parallel_insert_slice %76 into %arg10[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8>
}
} {mapping = [#gpu.thread<linear_dim_7>, #gpu.thread<linear_dim_6>, #gpu.thread<linear_dim_5>, #gpu.thread<linear_dim_4>, #gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%74 = iree_gpu.multi_mma %71, %73, %arg1 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 1], workgroup = [1, 1, 0]}>} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8> into tensor<1x1x8x4x2x4x16x4xi32>
scf.yield %74 : tensor<1x1x8x4x2x4x16x4xi32>
}
flow.dispatch.tensor.store %69, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After DistributeMmaToLanesPass (iree-gpu-distribute-mma-to-lanes) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = scf.for %arg0 = %c0 to %42 step %c1 iter_args(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%70 = tensor.empty() : tensor<1x1x8x4x16x2x8xi8>
%71 = scf.forall (%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8) = (0, 0, 0, 0, 0, 0, 0) to (1, 1, 8, 4, 16, 2, 8) step (1, 1, 1, 1, 1, 2, 8) shared_outs(%arg9 = %70) -> (tensor<1x1x8x4x16x2x8xi8>) {
%75 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg3, %arg0]
%extracted_slice = tensor.extract_slice %66[%arg2, %75, %arg4, %arg5, %arg6, %arg7, %arg8] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%extracted_slice_0 = tensor.extract_slice %arg9[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%76 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x1x1x2x8xi8>) outs(%extracted_slice_0 : tensor<1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x2x8xi8>
scf.forall.in_parallel {
tensor.parallel_insert_slice %76 into %arg9[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8>
}
} {mapping = [#gpu.thread<linear_dim_6>, #gpu.thread<linear_dim_5>, #gpu.thread<linear_dim_4>, #gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%72 = tensor.empty() : tensor<1x1x4x2x4x16x2x8xi8>
%73 = scf.forall (%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9) = (0, 0, 0, 0, 0, 0, 0, 0) to (1, 1, 4, 2, 4, 16, 2, 8) step (1, 1, 1, 1, 1, 1, 2, 8) shared_outs(%arg10 = %72) -> (tensor<1x1x4x2x4x16x2x8xi8>) {
%75 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg3, %arg0]
%extracted_slice = tensor.extract_slice %67[%arg2, %75, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%extracted_slice_0 = tensor.extract_slice %arg10[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%76 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x1x1x1x2x8xi8>) outs(%extracted_slice_0 : tensor<1x1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x1x2x8xi8>
scf.forall.in_parallel {
tensor.parallel_insert_slice %76 into %arg10[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8>
}
} {mapping = [#gpu.thread<linear_dim_7>, #gpu.thread<linear_dim_6>, #gpu.thread<linear_dim_5>, #gpu.thread<linear_dim_4>, #gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%74 = scf.forall (%arg2) in (256) shared_outs(%arg3 = %arg1) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%75 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg2)
%76:5 = affine.delinearize_index %75 into (%c1, %c4, %c16, %c1, %c1) : index, index, index, index, index
%extracted_slice = tensor.extract_slice %71[0, 0, %76#0, %76#1, %76#2, %76#3, %76#4] [1, 1, 8, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x8x1x1x2x8xi8>
%77 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg2)
%78:6 = affine.delinearize_index %77 into (%c4, %c1, %c4, %c16, %c1, %c1) : index, index, index, index, index, index
%extracted_slice_0 = tensor.extract_slice %73[0, 0, %78#0, %78#1, %78#2, %78#3, %78#4, %78#5] [1, 1, 1, 2, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x2x1x1x2x8xi8>
%79 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg2)
%80:6 = affine.delinearize_index %79 into (%c1, %c4, %c1, %c4, %c16, %c1) : index, index, index, index, index, index
%extracted_slice_1 = tensor.extract_slice %arg3[0, 0, %80#0, %80#1, %80#2, %80#3, %80#4, %80#5] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32>
%81 = iree_gpu.multi_mma %extracted_slice, %extracted_slice_0, %extracted_slice_1 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<1x1x8x1x1x2x8xi8>, tensor<1x1x1x2x1x1x2x8xi8> into tensor<1x1x8x1x2x1x1x4xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %81 into %arg3[0, 0, %80#0, %80#1, %80#2, %80#3, %80#4, %80#5] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32>
}
} {mapping = [#gpu.thread<linear_dim_0>]}
scf.yield %74 : tensor<1x1x8x4x2x4x16x4xi32>
}
flow.dispatch.tensor.store %69, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After NormalizeLoopBoundsPass (iree-codegen-normalize-loop-bounds) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = scf.for %arg0 = %c0 to %42 step %c1 iter_args(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%70 = tensor.empty() : tensor<1x1x8x4x16x2x8xi8>
%71 = scf.forall (%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8) in (1, 1, 8, 4, 16, 1, 1) shared_outs(%arg9 = %70) -> (tensor<1x1x8x4x16x2x8xi8>) {
%75 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg8)
%76 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg7)
%77 = affine.apply affine_map<(d0) -> (d0)>(%arg6)
%78 = affine.apply affine_map<(d0) -> (d0)>(%arg5)
%79 = affine.apply affine_map<(d0) -> (d0)>(%arg4)
%80 = affine.apply affine_map<(d0) -> (d0)>(%arg3)
%81 = affine.apply affine_map<(d0) -> (d0)>(%arg2)
%82 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%80, %arg0]
%extracted_slice = tensor.extract_slice %66[%81, %82, %79, %78, %77, %76, %75] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%extracted_slice_0 = tensor.extract_slice %arg9[%81, %80, %79, %78, %77, %76, %75] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%83 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x1x1x2x8xi8>) outs(%extracted_slice_0 : tensor<1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x2x8xi8>
scf.forall.in_parallel {
tensor.parallel_insert_slice %83 into %arg9[%81, %80, %79, %78, %77, %76, %75] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8>
}
} {mapping = [#gpu.thread<linear_dim_6>, #gpu.thread<linear_dim_5>, #gpu.thread<linear_dim_4>, #gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%72 = tensor.empty() : tensor<1x1x4x2x4x16x2x8xi8>
%73 = scf.forall (%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9) in (1, 1, 4, 2, 4, 16, 1, 1) shared_outs(%arg10 = %72) -> (tensor<1x1x4x2x4x16x2x8xi8>) {
%75 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg9)
%76 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg8)
%77 = affine.apply affine_map<(d0) -> (d0)>(%arg7)
%78 = affine.apply affine_map<(d0) -> (d0)>(%arg6)
%79 = affine.apply affine_map<(d0) -> (d0)>(%arg5)
%80 = affine.apply affine_map<(d0) -> (d0)>(%arg4)
%81 = affine.apply affine_map<(d0) -> (d0)>(%arg3)
%82 = affine.apply affine_map<(d0) -> (d0)>(%arg2)
%83 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%81, %arg0]
%extracted_slice = tensor.extract_slice %67[%82, %83, %80, %79, %78, %77, %76, %75] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%extracted_slice_0 = tensor.extract_slice %arg10[%82, %81, %80, %79, %78, %77, %76, %75] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%84 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x1x1x1x2x8xi8>) outs(%extracted_slice_0 : tensor<1x1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x1x2x8xi8>
scf.forall.in_parallel {
tensor.parallel_insert_slice %84 into %arg10[%82, %81, %80, %79, %78, %77, %76, %75] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8>
}
} {mapping = [#gpu.thread<linear_dim_7>, #gpu.thread<linear_dim_6>, #gpu.thread<linear_dim_5>, #gpu.thread<linear_dim_4>, #gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%74 = scf.forall (%arg2) in (256) shared_outs(%arg3 = %arg1) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%75 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg2)
%76:5 = affine.delinearize_index %75 into (%c1, %c4, %c16, %c1, %c1) : index, index, index, index, index
%extracted_slice = tensor.extract_slice %71[0, 0, %76#0, %76#1, %76#2, %76#3, %76#4] [1, 1, 8, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x8x1x1x2x8xi8>
%77 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg2)
%78:6 = affine.delinearize_index %77 into (%c4, %c1, %c4, %c16, %c1, %c1) : index, index, index, index, index, index
%extracted_slice_0 = tensor.extract_slice %73[0, 0, %78#0, %78#1, %78#2, %78#3, %78#4, %78#5] [1, 1, 1, 2, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x2x1x1x2x8xi8>
%79 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg2)
%80:6 = affine.delinearize_index %79 into (%c1, %c4, %c1, %c4, %c16, %c1) : index, index, index, index, index, index
%extracted_slice_1 = tensor.extract_slice %arg3[0, 0, %80#0, %80#1, %80#2, %80#3, %80#4, %80#5] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32>
%81 = iree_gpu.multi_mma %extracted_slice, %extracted_slice_0, %extracted_slice_1 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<1x1x8x1x1x2x8xi8>, tensor<1x1x1x2x1x1x2x8xi8> into tensor<1x1x8x1x2x1x1x4xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %81 into %arg3[0, 0, %80#0, %80#1, %80#2, %80#3, %80#4, %80#5] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32>
}
} {mapping = [#gpu.thread<linear_dim_0>]}
scf.yield %74 : tensor<1x1x8x4x2x4x16x4xi32>
}
flow.dispatch.tensor.store %69, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = scf.for %arg0 = %c0 to %42 step %c1 iter_args(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%70 = tensor.empty() : tensor<1x1x8x4x16x2x8xi8>
%71 = scf.forall (%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8) in (1, 1, 8, 4, 16, 1, 1) shared_outs(%arg9 = %70) -> (tensor<1x1x8x4x16x2x8xi8>) {
%75 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg8)
%76 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg7)
%77 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg3, %arg0]
%extracted_slice = tensor.extract_slice %66[%arg2, %77, %arg4, %arg5, %arg6, %76, %75] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%extracted_slice_0 = tensor.extract_slice %arg9[%arg2, %arg3, %arg4, %arg5, %arg6, %76, %75] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%78 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x1x1x2x8xi8>) outs(%extracted_slice_0 : tensor<1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x2x8xi8>
scf.forall.in_parallel {
tensor.parallel_insert_slice %78 into %arg9[%arg2, %arg3, %arg4, %arg5, %arg6, %76, %75] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8>
}
} {mapping = [#gpu.thread<linear_dim_6>, #gpu.thread<linear_dim_5>, #gpu.thread<linear_dim_4>, #gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%72 = tensor.empty() : tensor<1x1x4x2x4x16x2x8xi8>
%73 = scf.forall (%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9) in (1, 1, 4, 2, 4, 16, 1, 1) shared_outs(%arg10 = %72) -> (tensor<1x1x4x2x4x16x2x8xi8>) {
%75 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg9)
%76 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg8)
%77 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg3, %arg0]
%extracted_slice = tensor.extract_slice %67[%arg2, %77, %arg4, %arg5, %arg6, %arg7, %76, %75] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%extracted_slice_0 = tensor.extract_slice %arg10[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %76, %75] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%78 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x1x1x1x2x8xi8>) outs(%extracted_slice_0 : tensor<1x1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x1x2x8xi8>
scf.forall.in_parallel {
tensor.parallel_insert_slice %78 into %arg10[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %76, %75] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8>
}
} {mapping = [#gpu.thread<linear_dim_7>, #gpu.thread<linear_dim_6>, #gpu.thread<linear_dim_5>, #gpu.thread<linear_dim_4>, #gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%74 = scf.forall (%arg2) in (256) shared_outs(%arg3 = %arg1) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%75 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg2)
%76:2 = affine.delinearize_index %75 into (%c4, %c16) : index, index
%extracted_slice = tensor.extract_slice %71[0, 0, 0, %76#0, %76#1, 0, 0] [1, 1, 8, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x8x1x1x2x8xi8>
%77 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg2)
%78:3 = affine.delinearize_index %77 into (%c4, %c4, %c16) : index, index, index
%extracted_slice_0 = tensor.extract_slice %73[0, 0, %78#0, 0, %78#1, %78#2, 0, 0] [1, 1, 1, 2, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x2x1x1x2x8xi8>
%79 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg2)
%80:3 = affine.delinearize_index %79 into (%c4, %c4, %c16) : index, index, index
%extracted_slice_1 = tensor.extract_slice %arg3[0, 0, 0, %80#0, 0, %80#1, %80#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32>
%81 = iree_gpu.multi_mma %extracted_slice, %extracted_slice_0, %extracted_slice_1 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<1x1x8x1x1x2x8xi8>, tensor<1x1x1x2x1x1x2x8xi8> into tensor<1x1x8x1x2x1x1x4xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %81 into %arg3[0, 0, 0, %80#0, 0, %80#1, %80#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32>
}
} {mapping = [#gpu.thread<linear_dim_0>]}
scf.yield %74 : tensor<1x1x8x4x2x4x16x4xi32>
}
flow.dispatch.tensor.store %69, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = scf.for %arg0 = %c0 to %42 step %c1 iter_args(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%70 = tensor.empty() : tensor<1x1x8x4x16x2x8xi8>
%71 = scf.forall (%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8) in (1, 1, 8, 4, 16, 1, 1) shared_outs(%arg9 = %70) -> (tensor<1x1x8x4x16x2x8xi8>) {
%75 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg8)
%76 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg7)
%77 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg3, %arg0]
%extracted_slice = tensor.extract_slice %66[%arg2, %77, %arg4, %arg5, %arg6, %76, %75] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%extracted_slice_0 = tensor.extract_slice %arg9[%arg2, %arg3, %arg4, %arg5, %arg6, %76, %75] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%78 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x1x1x2x8xi8>) outs(%extracted_slice_0 : tensor<1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x2x8xi8>
scf.forall.in_parallel {
tensor.parallel_insert_slice %78 into %arg9[%arg2, %arg3, %arg4, %arg5, %arg6, %76, %75] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8>
}
} {mapping = [#gpu.thread<linear_dim_6>, #gpu.thread<linear_dim_5>, #gpu.thread<linear_dim_4>, #gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%72 = tensor.empty() : tensor<1x1x4x2x4x16x2x8xi8>
%73 = scf.forall (%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9) in (1, 1, 4, 2, 4, 16, 1, 1) shared_outs(%arg10 = %72) -> (tensor<1x1x4x2x4x16x2x8xi8>) {
%75 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg9)
%76 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg8)
%77 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg3, %arg0]
%extracted_slice = tensor.extract_slice %67[%arg2, %77, %arg4, %arg5, %arg6, %arg7, %76, %75] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%extracted_slice_0 = tensor.extract_slice %arg10[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %76, %75] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%78 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x1x1x1x2x8xi8>) outs(%extracted_slice_0 : tensor<1x1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x1x2x8xi8>
scf.forall.in_parallel {
tensor.parallel_insert_slice %78 into %arg10[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %76, %75] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8>
}
} {mapping = [#gpu.thread<linear_dim_7>, #gpu.thread<linear_dim_6>, #gpu.thread<linear_dim_5>, #gpu.thread<linear_dim_4>, #gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%74 = scf.forall (%arg2) in (256) shared_outs(%arg3 = %arg1) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%75 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg2)
%76:2 = affine.delinearize_index %75 into (%c4, %c16) : index, index
%extracted_slice = tensor.extract_slice %71[0, 0, 0, %76#0, %76#1, 0, 0] [1, 1, 8, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x8x1x1x2x8xi8>
%77 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg2)
%78:3 = affine.delinearize_index %77 into (%c4, %c4, %c16) : index, index, index
%extracted_slice_0 = tensor.extract_slice %73[0, 0, %78#0, 0, %78#1, %78#2, 0, 0] [1, 1, 1, 2, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x2x1x1x2x8xi8>
%extracted_slice_1 = tensor.extract_slice %arg3[0, 0, 0, %78#0, 0, %78#1, %78#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32>
%79 = iree_gpu.multi_mma %extracted_slice, %extracted_slice_0, %extracted_slice_1 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<1x1x8x1x1x2x8xi8>, tensor<1x1x1x2x1x1x2x8xi8> into tensor<1x1x8x1x2x1x1x4xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %79 into %arg3[0, 0, 0, %78#0, 0, %78#1, %78#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32>
}
} {mapping = [#gpu.thread<linear_dim_0>]}
scf.yield %74 : tensor<1x1x8x4x2x4x16x4xi32>
}
flow.dispatch.tensor.store %69, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After LoopInvariantCodeMotion (loop-invariant-code-motion) //----- //
// Dispatch entry point: data-tiled i8 matmul-like kernel lowered through the
// LLVMGPUTileAndFuse pipeline for one workgroup of 256 threads (subgroup
// size 64). Each workgroup accumulates a 1x1x8x4x2x4x16x4 i32 tile while
// looping over the dynamic reduction extent %42.
// NOTE(review): this dump accompanies a report of bad codegen for
// vector<8xi8> MFMA operands — the innermost 2x8 i8 dims below feed the
// MFMA_I32_16x16x32_I8 intrinsic operands; confirm against the final ISA.
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
// 18 i32 push constants: nine 64-bit quantities split into (lo, hi) pairs
// (binding byte offsets and dynamic tensor dimensions), reassembled below.
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
// Recombine each (lo, hi) i32 pair as lo | (hi << 32), then cast to index.
// %22, %27, %32 become the three subspan byte offsets; (%37, %42),
// (%47, %52), (%57, %62) are the dynamic dims of the three dispatch tensors.
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
// Binding 0 (read-only) holds both i8 operands at different offsets (%22,
// %27); binding 1 (read-write, offset %32) holds the i32 accumulator.
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
// Per-workgroup slices: workgroup_id_y selects the LHS outer row,
// workgroup_id_x the RHS outer row; both keep the full dynamic extent %42
// on dim 1, which the scf.for below reduces over one step at a time.
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
// Staging destinations for the per-iteration LHS/RHS tile copies below
// (presumably promoted to shared memory by later bufferization — TODO
// confirm in subsequent pass dumps).
%69 = tensor.empty() : tensor<1x1x8x4x16x2x8xi8>
%70 = tensor.empty() : tensor<1x1x8x4x16x2x8xi8>
%70 = tensor.empty() : tensor<1x1x4x2x4x16x2x8xi8>
// Reduction loop over %42: copy one K-slice of each operand into the
// staging tensors, then run the data-tiled multi_mma into the accumulator.
%71 = scf.for %arg0 = %c0 to %42 step %c1 iter_args(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
// LHS copy: distributed over an 8x4x16 linear thread space; each thread
// moves a 1x1x1x1x1x2x8 i8 sub-tile (the unit dims use scaled offsets
// d0*2 and d0*8 for the trailing 2x8 element block).
%72 = scf.forall (%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8) in (1, 1, 8, 4, 16, 1, 1) shared_outs(%arg9 = %69) -> (tensor<1x1x8x4x16x2x8xi8>) {
%75 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg8)
%76 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg7)
%77 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg3, %arg0]
%extracted_slice = tensor.extract_slice %66[%arg2, %77, %arg4, %arg5, %arg6, %76, %75] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%extracted_slice_0 = tensor.extract_slice %arg9[%arg2, %arg3, %arg4, %arg5, %arg6, %76, %75] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%78 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x1x1x2x8xi8>) outs(%extracted_slice_0 : tensor<1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x2x8xi8>
scf.forall.in_parallel {
tensor.parallel_insert_slice %78 into %arg9[%arg2, %arg3, %arg4, %arg5, %arg6, %76, %75] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8>
}
} {mapping = [#gpu.thread<linear_dim_6>, #gpu.thread<linear_dim_5>, #gpu.thread<linear_dim_4>, #gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
// RHS copy: same pattern over a 4x2x4x16 linear thread space.
%73 = scf.forall (%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9) in (1, 1, 4, 2, 4, 16, 1, 1) shared_outs(%arg10 = %70) -> (tensor<1x1x4x2x4x16x2x8xi8>) {
%75 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg9)
%76 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg8)
%77 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg3, %arg0]
%extracted_slice = tensor.extract_slice %67[%arg2, %77, %arg4, %arg5, %arg6, %arg7, %76, %75] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%extracted_slice_0 = tensor.extract_slice %arg10[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %76, %75] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%78 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x1x1x1x2x8xi8>) outs(%extracted_slice_0 : tensor<1x1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x1x2x8xi8>
scf.forall.in_parallel {
tensor.parallel_insert_slice %78 into %arg10[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %76, %75] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8>
}
} {mapping = [#gpu.thread<linear_dim_7>, #gpu.thread<linear_dim_6>, #gpu.thread<linear_dim_5>, #gpu.thread<linear_dim_4>, #gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
// MMA: 256 threads; lane id (mod 64, delinearized to 4x16) picks the LHS
// slice, while the full thread id (mod 256 is the identity for ids < 256,
// delinearized to 4x4x16) picks the RHS and accumulator slices.
%74 = scf.forall (%arg2) in (256) shared_outs(%arg3 = %arg1) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%75 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg2)
%76:2 = affine.delinearize_index %75 into (%c4, %c16) : index, index
%extracted_slice = tensor.extract_slice %72[0, 0, 0, %76#0, %76#1, 0, 0] [1, 1, 8, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x8x1x1x2x8xi8>
%77 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg2)
%78:3 = affine.delinearize_index %77 into (%c4, %c4, %c16) : index, index, index
%extracted_slice_0 = tensor.extract_slice %73[0, 0, %78#0, 0, %78#1, %78#2, 0, 0] [1, 1, 1, 2, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x2x1x1x2x8xi8>
%extracted_slice_1 = tensor.extract_slice %arg3[0, 0, 0, %78#0, 0, %78#1, %78#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32>
// Data-tiled MFMA_I32_16x16x32_I8 with unroll_m = 8, unroll_n = 2,
// unroll_n_to_subgroups = 4, unroll_k = 2 (per the kind attribute); the
// trailing 2x8 i8 dims are the per-intrinsic K operands.
%79 = iree_gpu.multi_mma %extracted_slice, %extracted_slice_0, %extracted_slice_1 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<1x1x8x1x1x2x8xi8>, tensor<1x1x1x2x1x1x2x8xi8> into tensor<1x1x8x1x2x1x1x4xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %79 into %arg3[0, 0, 0, %78#0, 0, %78#1, %78#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32>
}
} {mapping = [#gpu.thread<linear_dim_0>]}
scf.yield %74 : tensor<1x1x8x4x2x4x16x4xi32>
}
// Write the final accumulator tile back to the read-write binding.
flow.dispatch.tensor.store %71, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After OptimizeTensorInsertExtractSlicesPass (iree-codegen-optimize-tensor-insert-extract-slices) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = tensor.empty() : tensor<1x1x8x4x16x2x8xi8>
%70 = tensor.empty() : tensor<1x1x4x2x4x16x2x8xi8>
%71 = scf.for %arg0 = %c0 to %42 step %c1 iter_args(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%72 = scf.forall (%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8) in (1, 1, 8, 4, 16, 1, 1) shared_outs(%arg9 = %69) -> (tensor<1x1x8x4x16x2x8xi8>) {
%75 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg8)
%76 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg7)
%77 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg3, %arg0]
%extracted_slice = tensor.extract_slice %66[%arg2, %77, %arg4, %arg5, %arg6, %76, %75] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%extracted_slice_0 = tensor.extract_slice %arg9[%arg2, %arg3, %arg4, %arg5, %arg6, %76, %75] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%78 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x1x1x2x8xi8>) outs(%extracted_slice_0 : tensor<1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x2x8xi8>
scf.forall.in_parallel {
tensor.parallel_insert_slice %78 into %arg9[%arg2, %arg3, %arg4, %arg5, %arg6, %76, %75] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8>
}
} {mapping = [#gpu.thread<linear_dim_6>, #gpu.thread<linear_dim_5>, #gpu.thread<linear_dim_4>, #gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%73 = scf.forall (%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9) in (1, 1, 4, 2, 4, 16, 1, 1) shared_outs(%arg10 = %70) -> (tensor<1x1x4x2x4x16x2x8xi8>) {
%75 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg9)
%76 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg8)
%77 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg3, %arg0]
%extracted_slice = tensor.extract_slice %67[%arg2, %77, %arg4, %arg5, %arg6, %arg7, %76, %75] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%extracted_slice_0 = tensor.extract_slice %arg10[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %76, %75] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%78 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x1x1x1x2x8xi8>) outs(%extracted_slice_0 : tensor<1x1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x1x2x8xi8>
scf.forall.in_parallel {
tensor.parallel_insert_slice %78 into %arg10[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %76, %75] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8>
}
} {mapping = [#gpu.thread<linear_dim_7>, #gpu.thread<linear_dim_6>, #gpu.thread<linear_dim_5>, #gpu.thread<linear_dim_4>, #gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
%74 = scf.forall (%arg2) in (256) shared_outs(%arg3 = %arg1) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%75 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg2)
%76:2 = affine.delinearize_index %75 into (%c4, %c16) : index, index
%extracted_slice = tensor.extract_slice %72[0, 0, 0, %76#0, %76#1, 0, 0] [1, 1, 8, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x8x1x1x2x8xi8>
%77 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg2)
%78:3 = affine.delinearize_index %77 into (%c4, %c4, %c16) : index, index, index
%extracted_slice_0 = tensor.extract_slice %73[0, 0, %78#0, 0, %78#1, %78#2, 0, 0] [1, 1, 1, 2, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x2x1x1x2x8xi8>
%extracted_slice_1 = tensor.extract_slice %arg3[0, 0, 0, %78#0, 0, %78#1, %78#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32>
%79 = iree_gpu.multi_mma %extracted_slice, %extracted_slice_0, %extracted_slice_1 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<1x1x8x1x1x2x8xi8>, tensor<1x1x1x2x1x1x2x8xi8> into tensor<1x1x8x1x2x1x1x4xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %79 into %arg3[0, 0, 0, %78#0, 0, %78#1, %78#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32>
}
} {mapping = [#gpu.thread<linear_dim_0>]}
scf.yield %74 : tensor<1x1x8x4x2x4x16x4xi32>
}
flow.dispatch.tensor.store %71, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After FuseAndHoistParallelLoopsPass (iree-gpu-fuse-and-hoist-parallel-loops) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c2 = arith.constant 2 : index
%c256 = arith.constant 256 : index
%c512 = arith.constant 512 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x8x4x16x2x8xi8>
%70 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x4x2x4x16x2x8xi8>
%71 = scf.forall (%arg0) in (256) shared_outs(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%72 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0)
%73:3 = affine.delinearize_index %72 into (%c4, %c4, %c16) : index, index, index
%extracted_slice = tensor.extract_slice %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32>
%74 = scf.for %arg2 = %c0 to %42 step %c1 iter_args(%arg3 = %extracted_slice) -> (tensor<1x1x8x1x2x1x1x4xi32>) {
%75 = iree_gpu.barrier_region ins(%69 : tensor<1x1x8x4x16x2x8xi8>) {
^bb0(%arg4: tensor<1x1x8x4x16x2x8xi8>):
%80 = scf.for %arg5 = %c0 to %c512 step %c256 iter_args(%arg6 = %arg4) -> (tensor<1x1x8x4x16x2x8xi8>) {
%81 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg5, %arg0)
%82:7 = affine.delinearize_index %81 into (%c1, %c1, %c8, %c4, %c16, %c1, %c1) : index, index, index, index, index, index, index
%83 = affine.apply affine_map<(d0) -> (d0 * 8)>(%82#6)
%84 = affine.apply affine_map<(d0) -> (d0 * 2)>(%82#5)
%85 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%82#1, %arg2]
%extracted_slice_2 = tensor.extract_slice %66[%82#0, %85, %82#2, %82#3, %82#4, %84, %83] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%extracted_slice_3 = tensor.extract_slice %arg6[%82#0, %82#1, %82#2, %82#3, %82#4, %84, %83] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%86 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<1x1x1x1x1x2x8xi8>) outs(%extracted_slice_3 : tensor<1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %86 into %arg6[%82#0, %82#1, %82#2, %82#3, %82#4, %84, %83] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x8x4x16x2x8xi8>
} {unroll_loop}
iree_gpu.yield %80 : tensor<1x1x8x4x16x2x8xi8>
} : tensor<1x1x8x4x16x2x8xi8>
%76 = iree_gpu.barrier_region ins(%70 : tensor<1x1x4x2x4x16x2x8xi8>) {
^bb0(%arg4: tensor<1x1x4x2x4x16x2x8xi8>):
%80 = scf.for %arg5 = %c0 to %c512 step %c256 iter_args(%arg6 = %arg4) -> (tensor<1x1x4x2x4x16x2x8xi8>) {
%81 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg5, %arg0)
%82:8 = affine.delinearize_index %81 into (%c1, %c1, %c4, %c2, %c4, %c16, %c1, %c1) : index, index, index, index, index, index, index, index
%83 = affine.apply affine_map<(d0) -> (d0 * 8)>(%82#7)
%84 = affine.apply affine_map<(d0) -> (d0 * 2)>(%82#6)
%85 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%82#1, %arg2]
%extracted_slice_2 = tensor.extract_slice %67[%82#0, %85, %82#2, %82#3, %82#4, %82#5, %84, %83] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%extracted_slice_3 = tensor.extract_slice %arg6[%82#0, %82#1, %82#2, %82#3, %82#4, %82#5, %84, %83] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%86 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<1x1x1x1x1x1x2x8xi8>) outs(%extracted_slice_3 : tensor<1x1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %86 into %arg6[%82#0, %82#1, %82#2, %82#3, %82#4, %82#5, %84, %83] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x4x2x4x16x2x8xi8>
} {unroll_loop}
iree_gpu.yield %80 : tensor<1x1x4x2x4x16x2x8xi8>
} : tensor<1x1x4x2x4x16x2x8xi8>
%77 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0)
%78:2 = affine.delinearize_index %77 into (%c4, %c16) : index, index
%extracted_slice_0 = tensor.extract_slice %75[0, 0, 0, %78#0, %78#1, 0, 0] [1, 1, 8, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x8x1x1x2x8xi8>
%extracted_slice_1 = tensor.extract_slice %76[0, 0, %73#0, 0, %73#1, %73#2, 0, 0] [1, 1, 1, 2, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x2x1x1x2x8xi8>
%79 = iree_gpu.multi_mma %extracted_slice_0, %extracted_slice_1, %arg3 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<1x1x8x1x1x2x8xi8>, tensor<1x1x1x2x1x1x2x8xi8> into tensor<1x1x8x1x2x1x1x4xi32>
scf.yield %79 : tensor<1x1x8x1x2x1x1x4xi32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %74 into %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32>
}
} {mapping = [#gpu.thread<linear_dim_0>]}
flow.dispatch.tensor.store %71, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c2 = arith.constant 2 : index
%c256 = arith.constant 256 : index
%c512 = arith.constant 512 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x8x4x16x2x8xi8>
%70 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x4x2x4x16x2x8xi8>
%71 = scf.forall (%arg0) in (256) shared_outs(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%72 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0)
%73:3 = affine.delinearize_index %72 into (%c4, %c4, %c16) : index, index, index
%extracted_slice = tensor.extract_slice %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32>
%74 = scf.for %arg2 = %c0 to %42 step %c1 iter_args(%arg3 = %extracted_slice) -> (tensor<1x1x8x1x2x1x1x4xi32>) {
%75 = iree_gpu.barrier_region ins(%69 : tensor<1x1x8x4x16x2x8xi8>) {
^bb0(%arg4: tensor<1x1x8x4x16x2x8xi8>):
%80 = scf.for %arg5 = %c0 to %c512 step %c256 iter_args(%arg6 = %arg4) -> (tensor<1x1x8x4x16x2x8xi8>) {
%81 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg5, %arg0)
%82:3 = affine.delinearize_index %81 into (%c8, %c4, %c16) : index, index, index
%extracted_slice_2 = tensor.extract_slice %66[0, %arg2, %82#0, %82#1, %82#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%extracted_slice_3 = tensor.extract_slice %arg6[0, 0, %82#0, %82#1, %82#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%83 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<1x1x1x1x1x2x8xi8>) outs(%extracted_slice_3 : tensor<1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %83 into %arg6[0, 0, %82#0, %82#1, %82#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x8x4x16x2x8xi8>
} {unroll_loop}
iree_gpu.yield %80 : tensor<1x1x8x4x16x2x8xi8>
} : tensor<1x1x8x4x16x2x8xi8>
%76 = iree_gpu.barrier_region ins(%70 : tensor<1x1x4x2x4x16x2x8xi8>) {
^bb0(%arg4: tensor<1x1x4x2x4x16x2x8xi8>):
%80 = scf.for %arg5 = %c0 to %c512 step %c256 iter_args(%arg6 = %arg4) -> (tensor<1x1x4x2x4x16x2x8xi8>) {
%81 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg5, %arg0)
%82:4 = affine.delinearize_index %81 into (%c4, %c2, %c4, %c16) : index, index, index, index
%extracted_slice_2 = tensor.extract_slice %67[0, %arg2, %82#0, %82#1, %82#2, %82#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%extracted_slice_3 = tensor.extract_slice %arg6[0, 0, %82#0, %82#1, %82#2, %82#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%83 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<1x1x1x1x1x1x2x8xi8>) outs(%extracted_slice_3 : tensor<1x1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %83 into %arg6[0, 0, %82#0, %82#1, %82#2, %82#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x4x2x4x16x2x8xi8>
} {unroll_loop}
iree_gpu.yield %80 : tensor<1x1x4x2x4x16x2x8xi8>
} : tensor<1x1x4x2x4x16x2x8xi8>
%77 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0)
%78:2 = affine.delinearize_index %77 into (%c4, %c16) : index, index
%extracted_slice_0 = tensor.extract_slice %75[0, 0, 0, %78#0, %78#1, 0, 0] [1, 1, 8, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x8x1x1x2x8xi8>
%extracted_slice_1 = tensor.extract_slice %76[0, 0, %73#0, 0, %73#1, %73#2, 0, 0] [1, 1, 1, 2, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x2x1x1x2x8xi8>
%79 = iree_gpu.multi_mma %extracted_slice_0, %extracted_slice_1, %arg3 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<1x1x8x1x1x2x8xi8>, tensor<1x1x1x2x1x1x2x8xi8> into tensor<1x1x8x1x2x1x1x4xi32>
scf.yield %79 : tensor<1x1x8x1x2x1x1x4xi32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %74 into %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32>
}
} {mapping = [#gpu.thread<linear_dim_0>]}
flow.dispatch.tensor.store %71, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c2 = arith.constant 2 : index
%c256 = arith.constant 256 : index
%c512 = arith.constant 512 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x8x4x16x2x8xi8>
%70 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x4x2x4x16x2x8xi8>
%71 = scf.forall (%arg0) in (256) shared_outs(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%72 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0)
%73:3 = affine.delinearize_index %72 into (%c4, %c4, %c16) : index, index, index
%extracted_slice = tensor.extract_slice %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32>
%74 = scf.for %arg2 = %c0 to %42 step %c1 iter_args(%arg3 = %extracted_slice) -> (tensor<1x1x8x1x2x1x1x4xi32>) {
%75 = iree_gpu.barrier_region ins(%69 : tensor<1x1x8x4x16x2x8xi8>) {
^bb0(%arg4: tensor<1x1x8x4x16x2x8xi8>):
%80 = scf.for %arg5 = %c0 to %c512 step %c256 iter_args(%arg6 = %arg4) -> (tensor<1x1x8x4x16x2x8xi8>) {
%81 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg5, %arg0)
%82:3 = affine.delinearize_index %81 into (%c8, %c4, %c16) : index, index, index
%extracted_slice_2 = tensor.extract_slice %66[0, %arg2, %82#0, %82#1, %82#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%extracted_slice_3 = tensor.extract_slice %arg6[0, 0, %82#0, %82#1, %82#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%83 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<1x1x1x1x1x2x8xi8>) outs(%extracted_slice_3 : tensor<1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %83 into %arg6[0, 0, %82#0, %82#1, %82#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x8x4x16x2x8xi8>
} {unroll_loop}
iree_gpu.yield %80 : tensor<1x1x8x4x16x2x8xi8>
} : tensor<1x1x8x4x16x2x8xi8>
%76 = iree_gpu.barrier_region ins(%70 : tensor<1x1x4x2x4x16x2x8xi8>) {
^bb0(%arg4: tensor<1x1x4x2x4x16x2x8xi8>):
%80 = scf.for %arg5 = %c0 to %c512 step %c256 iter_args(%arg6 = %arg4) -> (tensor<1x1x4x2x4x16x2x8xi8>) {
%81 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg5, %arg0)
%82:4 = affine.delinearize_index %81 into (%c4, %c2, %c4, %c16) : index, index, index, index
%extracted_slice_2 = tensor.extract_slice %67[0, %arg2, %82#0, %82#1, %82#2, %82#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%extracted_slice_3 = tensor.extract_slice %arg6[0, 0, %82#0, %82#1, %82#2, %82#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%83 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<1x1x1x1x1x1x2x8xi8>) outs(%extracted_slice_3 : tensor<1x1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %83 into %arg6[0, 0, %82#0, %82#1, %82#2, %82#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x4x2x4x16x2x8xi8>
} {unroll_loop}
iree_gpu.yield %80 : tensor<1x1x4x2x4x16x2x8xi8>
} : tensor<1x1x4x2x4x16x2x8xi8>
%77 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0)
%78:2 = affine.delinearize_index %77 into (%c4, %c16) : index, index
%extracted_slice_0 = tensor.extract_slice %75[0, 0, 0, %78#0, %78#1, 0, 0] [1, 1, 8, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x8x1x1x2x8xi8>
%extracted_slice_1 = tensor.extract_slice %76[0, 0, %73#0, 0, %73#1, %73#2, 0, 0] [1, 1, 1, 2, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x2x1x1x2x8xi8>
%79 = iree_gpu.multi_mma %extracted_slice_0, %extracted_slice_1, %arg3 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<1x1x8x1x1x2x8xi8>, tensor<1x1x1x2x1x1x2x8xi8> into tensor<1x1x8x1x2x1x1x4xi32>
scf.yield %79 : tensor<1x1x8x1x2x1x1x4xi32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %74 into %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32>
}
} {mapping = [#gpu.thread<linear_dim_0>]}
flow.dispatch.tensor.store %71, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After LoopInvariantCodeMotion (loop-invariant-code-motion) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c2 = arith.constant 2 : index
%c256 = arith.constant 256 : index
%c512 = arith.constant 512 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x8x4x16x2x8xi8>
%70 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x4x2x4x16x2x8xi8>
%71 = scf.forall (%arg0) in (256) shared_outs(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%72 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0)
%73:3 = affine.delinearize_index %72 into (%c4, %c4, %c16) : index, index, index
%extracted_slice = tensor.extract_slice %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32>
%74 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0)
%75:2 = affine.delinearize_index %74 into (%c4, %c16) : index, index
%76 = scf.for %arg2 = %c0 to %42 step %c1 iter_args(%arg3 = %extracted_slice) -> (tensor<1x1x8x1x2x1x1x4xi32>) {
%77 = iree_gpu.barrier_region ins(%69 : tensor<1x1x8x4x16x2x8xi8>) {
^bb0(%arg4: tensor<1x1x8x4x16x2x8xi8>):
%80 = scf.for %arg5 = %c0 to %c512 step %c256 iter_args(%arg6 = %arg4) -> (tensor<1x1x8x4x16x2x8xi8>) {
%81 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg5, %arg0)
%82:3 = affine.delinearize_index %81 into (%c8, %c4, %c16) : index, index, index
%extracted_slice_2 = tensor.extract_slice %66[0, %arg2, %82#0, %82#1, %82#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%extracted_slice_3 = tensor.extract_slice %arg6[0, 0, %82#0, %82#1, %82#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%83 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<1x1x1x1x1x2x8xi8>) outs(%extracted_slice_3 : tensor<1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %83 into %arg6[0, 0, %82#0, %82#1, %82#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x8x4x16x2x8xi8>
} {unroll_loop}
iree_gpu.yield %80 : tensor<1x1x8x4x16x2x8xi8>
} : tensor<1x1x8x4x16x2x8xi8>
%78 = iree_gpu.barrier_region ins(%70 : tensor<1x1x4x2x4x16x2x8xi8>) {
^bb0(%arg4: tensor<1x1x4x2x4x16x2x8xi8>):
%80 = scf.for %arg5 = %c0 to %c512 step %c256 iter_args(%arg6 = %arg4) -> (tensor<1x1x4x2x4x16x2x8xi8>) {
%81 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg5, %arg0)
%82:4 = affine.delinearize_index %81 into (%c4, %c2, %c4, %c16) : index, index, index, index
%extracted_slice_2 = tensor.extract_slice %67[0, %arg2, %82#0, %82#1, %82#2, %82#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%extracted_slice_3 = tensor.extract_slice %arg6[0, 0, %82#0, %82#1, %82#2, %82#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%83 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<1x1x1x1x1x1x2x8xi8>) outs(%extracted_slice_3 : tensor<1x1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %83 into %arg6[0, 0, %82#0, %82#1, %82#2, %82#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x4x2x4x16x2x8xi8>
} {unroll_loop}
iree_gpu.yield %80 : tensor<1x1x4x2x4x16x2x8xi8>
} : tensor<1x1x4x2x4x16x2x8xi8>
%extracted_slice_0 = tensor.extract_slice %77[0, 0, 0, %75#0, %75#1, 0, 0] [1, 1, 8, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x8x1x1x2x8xi8>
%extracted_slice_1 = tensor.extract_slice %78[0, 0, %73#0, 0, %73#1, %73#2, 0, 0] [1, 1, 1, 2, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x2x1x1x2x8xi8>
%79 = iree_gpu.multi_mma %extracted_slice_0, %extracted_slice_1, %arg3 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<1x1x8x1x1x2x8xi8>, tensor<1x1x1x2x1x1x2x8xi8> into tensor<1x1x8x1x2x1x1x4xi32>
scf.yield %79 : tensor<1x1x8x1x2x1x1x4xi32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %76 into %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32>
}
} {mapping = [#gpu.thread<linear_dim_0>]}
flow.dispatch.tensor.store %71, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After CombineBarrierRegionsPass (iree-gpu-combine-barrier-regions) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c2 = arith.constant 2 : index
%c256 = arith.constant 256 : index
%c512 = arith.constant 512 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x8x4x16x2x8xi8>
%70 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x4x2x4x16x2x8xi8>
%71 = scf.forall (%arg0) in (256) shared_outs(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%72 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0)
%73:3 = affine.delinearize_index %72 into (%c4, %c4, %c16) : index, index, index
%extracted_slice = tensor.extract_slice %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32>
%74 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0)
%75:2 = affine.delinearize_index %74 into (%c4, %c16) : index, index
%76 = scf.for %arg2 = %c0 to %42 step %c1 iter_args(%arg3 = %extracted_slice) -> (tensor<1x1x8x1x2x1x1x4xi32>) {
%77:2 = iree_gpu.barrier_region ins(%69, %70 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>) {
^bb0(%arg4: tensor<1x1x8x4x16x2x8xi8>, %arg5: tensor<1x1x4x2x4x16x2x8xi8>):
%79 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg4) -> (tensor<1x1x8x4x16x2x8xi8>) {
%81 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0)
%82:3 = affine.delinearize_index %81 into (%c8, %c4, %c16) : index, index, index
%extracted_slice_2 = tensor.extract_slice %66[0, %arg2, %82#0, %82#1, %82#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%extracted_slice_3 = tensor.extract_slice %arg7[0, 0, %82#0, %82#1, %82#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%83 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<1x1x1x1x1x2x8xi8>) outs(%extracted_slice_3 : tensor<1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %83 into %arg7[0, 0, %82#0, %82#1, %82#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x8x4x16x2x8xi8>
} {unroll_loop}
%80 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg5) -> (tensor<1x1x4x2x4x16x2x8xi8>) {
%81 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0)
%82:4 = affine.delinearize_index %81 into (%c4, %c2, %c4, %c16) : index, index, index, index
%extracted_slice_2 = tensor.extract_slice %67[0, %arg2, %82#0, %82#1, %82#2, %82#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%extracted_slice_3 = tensor.extract_slice %arg7[0, 0, %82#0, %82#1, %82#2, %82#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%83 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<1x1x1x1x1x1x2x8xi8>) outs(%extracted_slice_3 : tensor<1x1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %83 into %arg7[0, 0, %82#0, %82#1, %82#2, %82#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x4x2x4x16x2x8xi8>
} {unroll_loop}
iree_gpu.yield %79, %80 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>
} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>
%extracted_slice_0 = tensor.extract_slice %77#0[0, 0, 0, %75#0, %75#1, 0, 0] [1, 1, 8, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x8x1x1x2x8xi8>
%extracted_slice_1 = tensor.extract_slice %77#1[0, 0, %73#0, 0, %73#1, %73#2, 0, 0] [1, 1, 1, 2, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x2x1x1x2x8xi8>
%78 = iree_gpu.multi_mma %extracted_slice_0, %extracted_slice_1, %arg3 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<1x1x8x1x1x2x8xi8>, tensor<1x1x1x2x1x1x2x8xi8> into tensor<1x1x8x1x2x1x1x4xi32>
scf.yield %78 : tensor<1x1x8x1x2x1x1x4xi32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %76 into %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32>
}
} {mapping = [#gpu.thread<linear_dim_0>]}
flow.dispatch.tensor.store %71, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After VectorizeIREEGPUOpsPass (iree-gpu-vectorize-ops) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c0_i32 = arith.constant 0 : i32
%c0_i8 = arith.constant 0 : i8
%c2 = arith.constant 2 : index
%c256 = arith.constant 256 : index
%c512 = arith.constant 512 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x8x4x16x2x8xi8>
%70 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x4x2x4x16x2x8xi8>
%71 = scf.forall (%arg0) in (256) shared_outs(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%72 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0)
%73:3 = affine.delinearize_index %72 into (%c4, %c4, %c16) : index, index, index
%extracted_slice = tensor.extract_slice %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32>
%74 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0)
%75:2 = affine.delinearize_index %74 into (%c4, %c16) : index, index
%76 = scf.for %arg2 = %c0 to %42 step %c1 iter_args(%arg3 = %extracted_slice) -> (tensor<1x1x8x1x2x1x1x4xi32>) {
%77:2 = iree_gpu.barrier_region ins(%69, %70 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>) {
^bb0(%arg4: tensor<1x1x8x4x16x2x8xi8>, %arg5: tensor<1x1x4x2x4x16x2x8xi8>):
%83 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg4) -> (tensor<1x1x8x4x16x2x8xi8>) {
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0)
%86:3 = affine.delinearize_index %85 into (%c8, %c4, %c16) : index, index, index
%extracted_slice_2 = tensor.extract_slice %66[0, %arg2, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%extracted_slice_3 = tensor.extract_slice %arg7[0, 0, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%87 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<1x1x1x1x1x2x8xi8>) outs(%extracted_slice_3 : tensor<1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %87 into %arg7[0, 0, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x8x4x16x2x8xi8>
} {unroll_loop}
%84 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg5) -> (tensor<1x1x4x2x4x16x2x8xi8>) {
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0)
%86:4 = affine.delinearize_index %85 into (%c4, %c2, %c4, %c16) : index, index, index, index
%extracted_slice_2 = tensor.extract_slice %67[0, %arg2, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%extracted_slice_3 = tensor.extract_slice %arg7[0, 0, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%87 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<1x1x1x1x1x1x2x8xi8>) outs(%extracted_slice_3 : tensor<1x1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %87 into %arg7[0, 0, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x4x2x4x16x2x8xi8>
} {unroll_loop}
iree_gpu.yield %83, %84 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>
} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>
%extracted_slice_0 = tensor.extract_slice %77#0[0, 0, 0, %75#0, %75#1, 0, 0] [1, 1, 8, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x8x1x1x2x8xi8>
%extracted_slice_1 = tensor.extract_slice %77#1[0, 0, %73#0, 0, %73#1, %73#2, 0, 0] [1, 1, 1, 2, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x2x1x1x2x8xi8>
%78 = vector.transfer_read %extracted_slice_0[%c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : tensor<1x1x8x1x1x2x8xi8>, vector<1x1x8x1x1x2x8xi8>
%79 = vector.transfer_read %extracted_slice_1[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x1x2x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8>
%80 = vector.transfer_read %arg3[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x8x1x2x1x1x4xi32>, vector<1x1x8x1x2x1x1x4xi32>
%81 = iree_gpu.multi_mma %78, %79, %80 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32>
%82 = vector.transfer_write %81, %arg3[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, tensor<1x1x8x1x2x1x1x4xi32>
scf.yield %82 : tensor<1x1x8x1x2x1x1x4xi32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %76 into %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32>
}
} {mapping = [#gpu.thread<linear_dim_0>]}
flow.dispatch.tensor.store %71, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After DecomposeConvolutionToLowerDimOpsPass (iree-codegen-decompose-convolution-to-lower-dim-ops) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c0_i32 = arith.constant 0 : i32
%c0_i8 = arith.constant 0 : i8
%c2 = arith.constant 2 : index
%c256 = arith.constant 256 : index
%c512 = arith.constant 512 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x8x4x16x2x8xi8>
%70 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x4x2x4x16x2x8xi8>
%71 = scf.forall (%arg0) in (256) shared_outs(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%72 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0)
%73:3 = affine.delinearize_index %72 into (%c4, %c4, %c16) : index, index, index
%extracted_slice = tensor.extract_slice %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32>
%74 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0)
%75:2 = affine.delinearize_index %74 into (%c4, %c16) : index, index
%76 = scf.for %arg2 = %c0 to %42 step %c1 iter_args(%arg3 = %extracted_slice) -> (tensor<1x1x8x1x2x1x1x4xi32>) {
%77:2 = iree_gpu.barrier_region ins(%69, %70 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>) {
^bb0(%arg4: tensor<1x1x8x4x16x2x8xi8>, %arg5: tensor<1x1x4x2x4x16x2x8xi8>):
%83 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg4) -> (tensor<1x1x8x4x16x2x8xi8>) {
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0)
%86:3 = affine.delinearize_index %85 into (%c8, %c4, %c16) : index, index, index
%extracted_slice_2 = tensor.extract_slice %66[0, %arg2, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%extracted_slice_3 = tensor.extract_slice %arg7[0, 0, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%87 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<1x1x1x1x1x2x8xi8>) outs(%extracted_slice_3 : tensor<1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %87 into %arg7[0, 0, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x8x4x16x2x8xi8>
} {unroll_loop}
%84 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg5) -> (tensor<1x1x4x2x4x16x2x8xi8>) {
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0)
%86:4 = affine.delinearize_index %85 into (%c4, %c2, %c4, %c16) : index, index, index, index
%extracted_slice_2 = tensor.extract_slice %67[0, %arg2, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%extracted_slice_3 = tensor.extract_slice %arg7[0, 0, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%87 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<1x1x1x1x1x1x2x8xi8>) outs(%extracted_slice_3 : tensor<1x1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %87 into %arg7[0, 0, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x4x2x4x16x2x8xi8>
} {unroll_loop}
iree_gpu.yield %83, %84 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>
} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>
%extracted_slice_0 = tensor.extract_slice %77#0[0, 0, 0, %75#0, %75#1, 0, 0] [1, 1, 8, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x8x1x1x2x8xi8>
%extracted_slice_1 = tensor.extract_slice %77#1[0, 0, %73#0, 0, %73#1, %73#2, 0, 0] [1, 1, 1, 2, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x2x1x1x2x8xi8>
%78 = vector.transfer_read %extracted_slice_0[%c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : tensor<1x1x8x1x1x2x8xi8>, vector<1x1x8x1x1x2x8xi8>
%79 = vector.transfer_read %extracted_slice_1[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x1x2x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8>
%80 = vector.transfer_read %arg3[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x8x1x2x1x1x4xi32>, vector<1x1x8x1x2x1x1x4xi32>
%81 = iree_gpu.multi_mma %78, %79, %80 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32>
%82 = vector.transfer_write %81, %arg3[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, tensor<1x1x8x1x2x1x1x4xi32>
scf.yield %82 : tensor<1x1x8x1x2x1x1x4xi32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %76 into %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32>
}
} {mapping = [#gpu.thread<linear_dim_0>]}
flow.dispatch.tensor.store %71, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After DecomposeIm2colPass (iree-linalg-ext-decompose-im2col) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c0_i32 = arith.constant 0 : i32
%c0_i8 = arith.constant 0 : i8
%c2 = arith.constant 2 : index
%c256 = arith.constant 256 : index
%c512 = arith.constant 512 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x8x4x16x2x8xi8>
%70 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x4x2x4x16x2x8xi8>
%71 = scf.forall (%arg0) in (256) shared_outs(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%72 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0)
%73:3 = affine.delinearize_index %72 into (%c4, %c4, %c16) : index, index, index
%extracted_slice = tensor.extract_slice %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32>
%74 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0)
%75:2 = affine.delinearize_index %74 into (%c4, %c16) : index, index
%76 = scf.for %arg2 = %c0 to %42 step %c1 iter_args(%arg3 = %extracted_slice) -> (tensor<1x1x8x1x2x1x1x4xi32>) {
%77:2 = iree_gpu.barrier_region ins(%69, %70 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>) {
^bb0(%arg4: tensor<1x1x8x4x16x2x8xi8>, %arg5: tensor<1x1x4x2x4x16x2x8xi8>):
%83 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg4) -> (tensor<1x1x8x4x16x2x8xi8>) {
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0)
%86:3 = affine.delinearize_index %85 into (%c8, %c4, %c16) : index, index, index
%extracted_slice_2 = tensor.extract_slice %66[0, %arg2, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%extracted_slice_3 = tensor.extract_slice %arg7[0, 0, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%87 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<1x1x1x1x1x2x8xi8>) outs(%extracted_slice_3 : tensor<1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %87 into %arg7[0, 0, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x8x4x16x2x8xi8>
} {unroll_loop}
%84 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg5) -> (tensor<1x1x4x2x4x16x2x8xi8>) {
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0)
%86:4 = affine.delinearize_index %85 into (%c4, %c2, %c4, %c16) : index, index, index, index
%extracted_slice_2 = tensor.extract_slice %67[0, %arg2, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%extracted_slice_3 = tensor.extract_slice %arg7[0, 0, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%87 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<1x1x1x1x1x1x2x8xi8>) outs(%extracted_slice_3 : tensor<1x1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %87 into %arg7[0, 0, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x4x2x4x16x2x8xi8>
} {unroll_loop}
iree_gpu.yield %83, %84 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>
} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>
%extracted_slice_0 = tensor.extract_slice %77#0[0, 0, 0, %75#0, %75#1, 0, 0] [1, 1, 8, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x8x1x1x2x8xi8>
%extracted_slice_1 = tensor.extract_slice %77#1[0, 0, %73#0, 0, %73#1, %73#2, 0, 0] [1, 1, 1, 2, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x2x1x1x2x8xi8>
%78 = vector.transfer_read %extracted_slice_0[%c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : tensor<1x1x8x1x1x2x8xi8>, vector<1x1x8x1x1x2x8xi8>
%79 = vector.transfer_read %extracted_slice_1[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x1x2x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8>
%80 = vector.transfer_read %arg3[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x8x1x2x1x1x4xi32>, vector<1x1x8x1x2x1x1x4xi32>
%81 = iree_gpu.multi_mma %78, %79, %80 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32>
%82 = vector.transfer_write %81, %arg3[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, tensor<1x1x8x1x2x1x1x4xi32>
scf.yield %82 : tensor<1x1x8x1x2x1x1x4xi32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %76 into %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32>
}
} {mapping = [#gpu.thread<linear_dim_0>]}
flow.dispatch.tensor.store %71, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After VectorizeIREEVectorExtOpsPass (iree-vector-ext-vectorize-ops) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c0_i32 = arith.constant 0 : i32
%c0_i8 = arith.constant 0 : i8
%c2 = arith.constant 2 : index
%c256 = arith.constant 256 : index
%c512 = arith.constant 512 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x8x4x16x2x8xi8>
%70 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x4x2x4x16x2x8xi8>
%71 = scf.forall (%arg0) in (256) shared_outs(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%72 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0)
%73:3 = affine.delinearize_index %72 into (%c4, %c4, %c16) : index, index, index
%extracted_slice = tensor.extract_slice %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32>
%74 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0)
%75:2 = affine.delinearize_index %74 into (%c4, %c16) : index, index
%76 = scf.for %arg2 = %c0 to %42 step %c1 iter_args(%arg3 = %extracted_slice) -> (tensor<1x1x8x1x2x1x1x4xi32>) {
%77:2 = iree_gpu.barrier_region ins(%69, %70 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>) {
^bb0(%arg4: tensor<1x1x8x4x16x2x8xi8>, %arg5: tensor<1x1x4x2x4x16x2x8xi8>):
%83 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg4) -> (tensor<1x1x8x4x16x2x8xi8>) {
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0)
%86:3 = affine.delinearize_index %85 into (%c8, %c4, %c16) : index, index, index
%extracted_slice_2 = tensor.extract_slice %66[0, %arg2, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%extracted_slice_3 = tensor.extract_slice %arg7[0, 0, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%87 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<1x1x1x1x1x2x8xi8>) outs(%extracted_slice_3 : tensor<1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %87 into %arg7[0, 0, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x8x4x16x2x8xi8>
} {unroll_loop}
%84 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg5) -> (tensor<1x1x4x2x4x16x2x8xi8>) {
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0)
%86:4 = affine.delinearize_index %85 into (%c4, %c2, %c4, %c16) : index, index, index, index
%extracted_slice_2 = tensor.extract_slice %67[0, %arg2, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%extracted_slice_3 = tensor.extract_slice %arg7[0, 0, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%87 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<1x1x1x1x1x1x2x8xi8>) outs(%extracted_slice_3 : tensor<1x1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %87 into %arg7[0, 0, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x4x2x4x16x2x8xi8>
} {unroll_loop}
iree_gpu.yield %83, %84 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>
} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>
%extracted_slice_0 = tensor.extract_slice %77#0[0, 0, 0, %75#0, %75#1, 0, 0] [1, 1, 8, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x8x1x1x2x8xi8>
%extracted_slice_1 = tensor.extract_slice %77#1[0, 0, %73#0, 0, %73#1, %73#2, 0, 0] [1, 1, 1, 2, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x2x1x1x2x8xi8>
%78 = vector.transfer_read %extracted_slice_0[%c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : tensor<1x1x8x1x1x2x8xi8>, vector<1x1x8x1x1x2x8xi8>
%79 = vector.transfer_read %extracted_slice_1[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x1x2x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8>
%80 = vector.transfer_read %arg3[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x8x1x2x1x1x4xi32>, vector<1x1x8x1x2x1x1x4xi32>
%81 = iree_gpu.multi_mma %78, %79, %80 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32>
%82 = vector.transfer_write %81, %arg3[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, tensor<1x1x8x1x2x1x1x4xi32>
scf.yield %82 : tensor<1x1x8x1x2x1x1x4xi32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %76 into %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32>
}
} {mapping = [#gpu.thread<linear_dim_0>]}
flow.dispatch.tensor.store %71, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After GenericVectorizationPass (iree-codegen-generic-vectorization) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c0_i32 = arith.constant 0 : i32
%c0_i8 = arith.constant 0 : i8
%c2 = arith.constant 2 : index
%c256 = arith.constant 256 : index
%c512 = arith.constant 512 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x8x4x16x2x8xi8>
%70 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x4x2x4x16x2x8xi8>
%71 = scf.forall (%arg0) in (256) shared_outs(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%72 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0)
%73:3 = affine.delinearize_index %72 into (%c4, %c4, %c16) : index, index, index
%extracted_slice = tensor.extract_slice %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32>
%74 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0)
%75:2 = affine.delinearize_index %74 into (%c4, %c16) : index, index
%76 = scf.for %arg2 = %c0 to %42 step %c1 iter_args(%arg3 = %extracted_slice) -> (tensor<1x1x8x1x2x1x1x4xi32>) {
%77:2 = iree_gpu.barrier_region ins(%69, %70 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>) {
^bb0(%arg4: tensor<1x1x8x4x16x2x8xi8>, %arg5: tensor<1x1x4x2x4x16x2x8xi8>):
%83 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg4) -> (tensor<1x1x8x4x16x2x8xi8>) {
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0)
%86:3 = affine.delinearize_index %85 into (%c8, %c4, %c16) : index, index, index
%extracted_slice_2 = tensor.extract_slice %66[0, %arg2, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %extracted_slice_2 into %arg7[0, 0, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x8x4x16x2x8xi8>
} {unroll_loop}
%84 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg5) -> (tensor<1x1x4x2x4x16x2x8xi8>) {
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0)
%86:4 = affine.delinearize_index %85 into (%c4, %c2, %c4, %c16) : index, index, index, index
%extracted_slice_2 = tensor.extract_slice %67[0, %arg2, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %extracted_slice_2 into %arg7[0, 0, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x4x2x4x16x2x8xi8>
} {unroll_loop}
iree_gpu.yield %83, %84 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>
} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>
%extracted_slice_0 = tensor.extract_slice %77#0[0, 0, 0, %75#0, %75#1, 0, 0] [1, 1, 8, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x8x1x1x2x8xi8>
%extracted_slice_1 = tensor.extract_slice %77#1[0, 0, %73#0, 0, %73#1, %73#2, 0, 0] [1, 1, 1, 2, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x2x1x1x2x8xi8>
%78 = vector.transfer_read %extracted_slice_0[%c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : tensor<1x1x8x1x1x2x8xi8>, vector<1x1x8x1x1x2x8xi8>
%79 = vector.transfer_read %extracted_slice_1[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x1x2x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8>
%80 = vector.transfer_read %arg3[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x8x1x2x1x1x4xi32>, vector<1x1x8x1x2x1x1x4xi32>
%81 = iree_gpu.multi_mma %78, %79, %80 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32>
%82 = vector.transfer_write %81, %arg3[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, tensor<1x1x8x1x2x1x1x4xi32>
scf.yield %82 : tensor<1x1x8x1x2x1x1x4xi32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %76 into %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32>
}
} {mapping = [#gpu.thread<linear_dim_0>]}
flow.dispatch.tensor.store %71, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c0_i32 = arith.constant 0 : i32
%c0_i8 = arith.constant 0 : i8
%c2 = arith.constant 2 : index
%c256 = arith.constant 256 : index
%c512 = arith.constant 512 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x8x4x16x2x8xi8>
%70 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x4x2x4x16x2x8xi8>
%71 = scf.forall (%arg0) in (256) shared_outs(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%72 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0)
%73:3 = affine.delinearize_index %72 into (%c4, %c4, %c16) : index, index, index
%extracted_slice = tensor.extract_slice %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32>
%74 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0)
%75:2 = affine.delinearize_index %74 into (%c4, %c16) : index, index
%76 = scf.for %arg2 = %c0 to %42 step %c1 iter_args(%arg3 = %extracted_slice) -> (tensor<1x1x8x1x2x1x1x4xi32>) {
%77:2 = iree_gpu.barrier_region ins(%69, %70 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>) {
^bb0(%arg4: tensor<1x1x8x4x16x2x8xi8>, %arg5: tensor<1x1x4x2x4x16x2x8xi8>):
%83 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg4) -> (tensor<1x1x8x4x16x2x8xi8>) {
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0)
%86:3 = affine.delinearize_index %85 into (%c8, %c4, %c16) : index, index, index
%extracted_slice_2 = tensor.extract_slice %66[0, %arg2, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %extracted_slice_2 into %arg7[0, 0, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x8x4x16x2x8xi8>
} {unroll_loop}
%84 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg5) -> (tensor<1x1x4x2x4x16x2x8xi8>) {
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0)
%86:4 = affine.delinearize_index %85 into (%c4, %c2, %c4, %c16) : index, index, index, index
%extracted_slice_2 = tensor.extract_slice %67[0, %arg2, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %extracted_slice_2 into %arg7[0, 0, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x4x2x4x16x2x8xi8>
} {unroll_loop}
iree_gpu.yield %83, %84 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>
} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>
%extracted_slice_0 = tensor.extract_slice %77#0[0, 0, 0, %75#0, %75#1, 0, 0] [1, 1, 8, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x8x1x1x2x8xi8>
%extracted_slice_1 = tensor.extract_slice %77#1[0, 0, %73#0, 0, %73#1, %73#2, 0, 0] [1, 1, 1, 2, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x2x1x1x2x8xi8>
%78 = vector.transfer_read %extracted_slice_0[%c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : tensor<1x1x8x1x1x2x8xi8>, vector<1x1x8x1x1x2x8xi8>
%79 = vector.transfer_read %extracted_slice_1[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x1x2x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8>
%80 = vector.transfer_read %arg3[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x8x1x2x1x1x4xi32>, vector<1x1x8x1x2x1x1x4xi32>
%81 = iree_gpu.multi_mma %78, %79, %80 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32>
%82 = vector.transfer_write %81, %arg3[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, tensor<1x1x8x1x2x1x1x4xi32>
scf.yield %82 : tensor<1x1x8x1x2x1x1x4xi32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %76 into %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32>
}
} {mapping = [#gpu.thread<linear_dim_0>]}
flow.dispatch.tensor.store %71, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c0_i32 = arith.constant 0 : i32
%c0_i8 = arith.constant 0 : i8
%c2 = arith.constant 2 : index
%c256 = arith.constant 256 : index
%c512 = arith.constant 512 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x8x4x16x2x8xi8>
%70 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x4x2x4x16x2x8xi8>
%71 = scf.forall (%arg0) in (256) shared_outs(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%72 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0)
%73:3 = affine.delinearize_index %72 into (%c4, %c4, %c16) : index, index, index
%extracted_slice = tensor.extract_slice %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32>
%74 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0)
%75:2 = affine.delinearize_index %74 into (%c4, %c16) : index, index
%76 = scf.for %arg2 = %c0 to %42 step %c1 iter_args(%arg3 = %extracted_slice) -> (tensor<1x1x8x1x2x1x1x4xi32>) {
%77:2 = iree_gpu.barrier_region ins(%69, %70 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>) {
^bb0(%arg4: tensor<1x1x8x4x16x2x8xi8>, %arg5: tensor<1x1x4x2x4x16x2x8xi8>):
%83 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg4) -> (tensor<1x1x8x4x16x2x8xi8>) {
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0)
%86:3 = affine.delinearize_index %85 into (%c8, %c4, %c16) : index, index, index
%extracted_slice_2 = tensor.extract_slice %66[0, %arg2, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %extracted_slice_2 into %arg7[0, 0, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x8x4x16x2x8xi8>
} {unroll_loop}
%84 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg5) -> (tensor<1x1x4x2x4x16x2x8xi8>) {
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0)
%86:4 = affine.delinearize_index %85 into (%c4, %c2, %c4, %c16) : index, index, index, index
%extracted_slice_2 = tensor.extract_slice %67[0, %arg2, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %extracted_slice_2 into %arg7[0, 0, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x4x2x4x16x2x8xi8>
} {unroll_loop}
iree_gpu.yield %83, %84 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>
} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>
%extracted_slice_0 = tensor.extract_slice %77#0[0, 0, 0, %75#0, %75#1, 0, 0] [1, 1, 8, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x8x1x1x2x8xi8>
%extracted_slice_1 = tensor.extract_slice %77#1[0, 0, %73#0, 0, %73#1, %73#2, 0, 0] [1, 1, 1, 2, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x2x1x1x2x8xi8>
%78 = vector.transfer_read %extracted_slice_0[%c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : tensor<1x1x8x1x1x2x8xi8>, vector<1x1x8x1x1x2x8xi8>
%79 = vector.transfer_read %extracted_slice_1[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x1x2x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8>
%80 = vector.transfer_read %arg3[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x8x1x2x1x1x4xi32>, vector<1x1x8x1x2x1x1x4xi32>
%81 = iree_gpu.multi_mma %78, %79, %80 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32>
%82 = vector.transfer_write %81, %arg3[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, tensor<1x1x8x1x2x1x1x4xi32>
scf.yield %82 : tensor<1x1x8x1x2x1x1x4xi32>
}
scf.forall.in_parallel {
tensor.parallel_insert_slice %76 into %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32>
}
} {mapping = [#gpu.thread<linear_dim_0>]}
flow.dispatch.tensor.store %71, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After OptimizeTensorInsertExtractSlicesPass (iree-codegen-optimize-tensor-insert-extract-slices) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c0_i32 = arith.constant 0 : i32
%c0_i8 = arith.constant 0 : i8
%c2 = arith.constant 2 : index
%c256 = arith.constant 256 : index
%c512 = arith.constant 512 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x8x4x16x2x8xi8>
%70 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x4x2x4x16x2x8xi8>
%71 = scf.forall (%arg0) in (256) shared_outs(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%72 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0)
%73:3 = affine.delinearize_index %72 into (%c4, %c4, %c16) : index, index, index
%extracted_slice = tensor.extract_slice %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32>
%74 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0)
%75:2 = affine.delinearize_index %74 into (%c4, %c16) : index, index
%76 = vector.transfer_read %arg1[%c0, %c0, %c0, %73#0, %c0, %73#1, %73#2, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x8x4x2x4x16x4xi32>, vector<1x1x8x1x2x1x1x4xi32>
%77 = scf.for %arg2 = %c0 to %42 step %c1 iter_args(%arg3 = %76) -> (vector<1x1x8x1x2x1x1x4xi32>) {
%79:2 = iree_gpu.barrier_region ins(%69, %70 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>) {
^bb0(%arg4: tensor<1x1x8x4x16x2x8xi8>, %arg5: tensor<1x1x4x2x4x16x2x8xi8>):
%83 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg4) -> (tensor<1x1x8x4x16x2x8xi8>) {
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0)
%86:3 = affine.delinearize_index %85 into (%c8, %c4, %c16) : index, index, index
%extracted_slice_0 = tensor.extract_slice %66[0, %arg2, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %extracted_slice_0 into %arg7[0, 0, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x8x4x16x2x8xi8>
} {unroll_loop}
%84 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg5) -> (tensor<1x1x4x2x4x16x2x8xi8>) {
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0)
%86:4 = affine.delinearize_index %85 into (%c4, %c2, %c4, %c16) : index, index, index, index
%extracted_slice_0 = tensor.extract_slice %67[0, %arg2, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %extracted_slice_0 into %arg7[0, 0, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x4x2x4x16x2x8xi8>
} {unroll_loop}
iree_gpu.yield %83, %84 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>
} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>
%80 = vector.transfer_read %79#0[%c0, %c0, %c0, %75#0, %75#1, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : tensor<1x1x8x4x16x2x8xi8>, vector<1x1x8x1x1x2x8xi8>
%81 = vector.transfer_read %79#1[%c0, %c0, %73#0, %c0, %73#1, %73#2, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x4x2x4x16x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8>
%82 = iree_gpu.multi_mma %80, %81, %arg3 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32>
scf.yield %82 : vector<1x1x8x1x2x1x1x4xi32>
}
%78 = vector.transfer_write %77, %extracted_slice[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, tensor<1x1x8x1x2x1x1x4xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %78 into %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32>
}
} {mapping = [#gpu.thread<linear_dim_0>]}
flow.dispatch.tensor.store %71, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
// Dispatch entry point: data-tiled i8 x i8 -> i32 matmul for gfx942, lowered
// through the LLVMGPUTileAndFuse pipeline with a 256-thread workgroup and
// subgroup size 64. NOTE(review): comments added for review of this IR dump;
// all operations are byte-identical to the compiler output.
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
// Scalar/index constants used throughout the body.
%c0_i32 = arith.constant 0 : i32
%c0_i8 = arith.constant 0 : i8
%c2 = arith.constant 2 : index
%c256 = arith.constant 256 : index
%c512 = arith.constant 512 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
// Load all 18 i32 push constants declared by the pipeline layout.
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
// Each consecutive (low, high) i32 pair is widened, the high word shifted
// left by 32 and OR'd in, then cast to index. The nine resulting indices:
// %22, %27, %32 are the byte offsets of the three buffer subspans below;
// %37/%42, %47/%52, %57/%62 are the dynamic outer dims of the three tensors.
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
// Buffer bindings: %63 = read-only data-tiled LHS (i8), %64 = read-only
// data-tiled RHS (i8), %65 = read-write data-tiled accumulator (i32).
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
// Workgroup y selects the LHS row tile, workgroup x the RHS column tile.
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
// Workgroup shared-memory staging tensors for one K-tile of LHS and RHS.
%69 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x8x4x16x2x8xi8>
%70 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x4x2x4x16x2x8xi8>
// Thread-level distribution: 256 threads mapped on linear_dim_0, each owning
// one [1,1,8,1,2,1,1,4] i32 slice of the accumulator tile.
%71 = scf.forall (%arg0) in (256) shared_outs(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%72 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0)
// Thread id split as (4 subgroups-in-n, 4, 16) to index the accumulator.
%73:3 = affine.delinearize_index %72 into (%c4, %c4, %c16) : index, index, index
%extracted_slice = tensor.extract_slice %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32>
// Lane position within the 64-wide subgroup, split as (4, 16) for the LHS.
%74 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0)
%75:2 = affine.delinearize_index %74 into (%c4, %c16) : index, index
%76 = vector.transfer_read %arg1[%c0, %c0, %c0, %73#0, %c0, %73#1, %73#2, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x8x4x2x4x16x4xi32>, vector<1x1x8x1x2x1x1x4xi32>
// Reduction loop over the %42 K-tiles, accumulator carried as a vector.
%77 = scf.for %arg2 = %c0 to %42 step %c1 iter_args(%arg3 = %76) -> (vector<1x1x8x1x2x1x1x4xi32>) {
// Cooperative global->shared copy of the current K-tile, synchronized by
// the barrier region; each thread performs 2 copy iterations (0 and 256
// of a 512-element linearized space) per input tensor.
%79:2 = iree_gpu.barrier_region ins(%69, %70 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>) {
^bb0(%arg4: tensor<1x1x8x4x16x2x8xi8>, %arg5: tensor<1x1x4x2x4x16x2x8xi8>):
%83 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg4) -> (tensor<1x1x8x4x16x2x8xi8>) {
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0)
%86:3 = affine.delinearize_index %85 into (%c8, %c4, %c16) : index, index, index
%extracted_slice_0 = tensor.extract_slice %66[0, %arg2, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %extracted_slice_0 into %arg7[0, 0, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x8x4x16x2x8xi8>
} {unroll_loop}
%84 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg5) -> (tensor<1x1x4x2x4x16x2x8xi8>) {
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0)
%86:4 = affine.delinearize_index %85 into (%c4, %c2, %c4, %c16) : index, index, index, index
%extracted_slice_0 = tensor.extract_slice %67[0, %arg2, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %extracted_slice_0 into %arg7[0, 0, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x4x2x4x16x2x8xi8>
} {unroll_loop}
iree_gpu.yield %83, %84 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>
} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>
// Per-thread operand reads from shared memory. Note the vector<...x2x8xi8>
// inner shape: these are the 8-byte-per-lane MFMA operands the gist's
// title refers to ("vector<8xi8> operands to MFMA intrinsics").
%80 = vector.transfer_read %79#0[%c0, %c0, %c0, %75#0, %75#1, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : tensor<1x1x8x4x16x2x8xi8>, vector<1x1x8x1x1x2x8xi8>
%81 = vector.transfer_read %79#1[%c0, %c0, %73#0, %c0, %73#1, %73#2, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x4x2x4x16x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8>
// Data-tiled MFMA contraction: MFMA_I32_16x16x32_I8 intrinsic with
// unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2.
%82 = iree_gpu.multi_mma %80, %81, %arg3 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32>
scf.yield %82 : vector<1x1x8x1x2x1x1x4xi32>
}
// Write the final accumulator vector back into this thread's output slice.
%78 = vector.transfer_write %77, %extracted_slice[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, tensor<1x1x8x1x2x1x1x4xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %78 into %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32>
}
} {mapping = [#gpu.thread<linear_dim_0>]}
// Store the workgroup's accumulator tile back to the read-write binding.
flow.dispatch.tensor.store %71, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After CSE (cse) //----- //
// Same dispatch as the preceding dump, after CSE (textually unchanged by the
// pass here). See the inline comments: push-constant unpacking, three buffer
// bindings, shared-memory staging, and the data-tiled MFMA reduction loop.
// NOTE(review): comments added for review of this IR dump; all operations are
// byte-identical to the compiler output.
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
// Scalar/index constants used throughout the body.
%c0_i32 = arith.constant 0 : i32
%c0_i8 = arith.constant 0 : i8
%c2 = arith.constant 2 : index
%c256 = arith.constant 256 : index
%c512 = arith.constant 512 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
// Load all 18 i32 push constants declared by the pipeline layout.
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
// Each consecutive (low, high) i32 pair is widened, the high word shifted
// left by 32 and OR'd in, then cast to index. The nine resulting indices:
// %22, %27, %32 are the byte offsets of the three buffer subspans below;
// %37/%42, %47/%52, %57/%62 are the dynamic outer dims of the three tensors.
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
// Buffer bindings: %63 = read-only data-tiled LHS (i8), %64 = read-only
// data-tiled RHS (i8), %65 = read-write data-tiled accumulator (i32).
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
// Workgroup y selects the LHS row tile, workgroup x the RHS column tile.
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
// Workgroup shared-memory staging tensors for one K-tile of LHS and RHS.
%69 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x8x4x16x2x8xi8>
%70 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x4x2x4x16x2x8xi8>
// Thread-level distribution: 256 threads mapped on linear_dim_0, each owning
// one [1,1,8,1,2,1,1,4] i32 slice of the accumulator tile.
%71 = scf.forall (%arg0) in (256) shared_outs(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%72 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0)
// Thread id split as (4 subgroups-in-n, 4, 16) to index the accumulator.
%73:3 = affine.delinearize_index %72 into (%c4, %c4, %c16) : index, index, index
%extracted_slice = tensor.extract_slice %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32>
// Lane position within the 64-wide subgroup, split as (4, 16) for the LHS.
%74 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0)
%75:2 = affine.delinearize_index %74 into (%c4, %c16) : index, index
%76 = vector.transfer_read %arg1[%c0, %c0, %c0, %73#0, %c0, %73#1, %73#2, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x8x4x2x4x16x4xi32>, vector<1x1x8x1x2x1x1x4xi32>
// Reduction loop over the %42 K-tiles, accumulator carried as a vector.
%77 = scf.for %arg2 = %c0 to %42 step %c1 iter_args(%arg3 = %76) -> (vector<1x1x8x1x2x1x1x4xi32>) {
// Cooperative global->shared copy of the current K-tile, synchronized by
// the barrier region; each thread performs 2 copy iterations (0 and 256
// of a 512-element linearized space) per input tensor.
%79:2 = iree_gpu.barrier_region ins(%69, %70 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>) {
^bb0(%arg4: tensor<1x1x8x4x16x2x8xi8>, %arg5: tensor<1x1x4x2x4x16x2x8xi8>):
%83 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg4) -> (tensor<1x1x8x4x16x2x8xi8>) {
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0)
%86:3 = affine.delinearize_index %85 into (%c8, %c4, %c16) : index, index, index
%extracted_slice_0 = tensor.extract_slice %66[0, %arg2, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %extracted_slice_0 into %arg7[0, 0, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x8x4x16x2x8xi8>
} {unroll_loop}
%84 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg5) -> (tensor<1x1x4x2x4x16x2x8xi8>) {
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0)
%86:4 = affine.delinearize_index %85 into (%c4, %c2, %c4, %c16) : index, index, index, index
%extracted_slice_0 = tensor.extract_slice %67[0, %arg2, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %extracted_slice_0 into %arg7[0, 0, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x4x2x4x16x2x8xi8>
} {unroll_loop}
iree_gpu.yield %83, %84 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>
} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>
// Per-thread operand reads from shared memory. Note the vector<...x2x8xi8>
// inner shape: these are the 8-byte-per-lane MFMA operands the gist's
// title refers to ("vector<8xi8> operands to MFMA intrinsics").
%80 = vector.transfer_read %79#0[%c0, %c0, %c0, %75#0, %75#1, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : tensor<1x1x8x4x16x2x8xi8>, vector<1x1x8x1x1x2x8xi8>
%81 = vector.transfer_read %79#1[%c0, %c0, %73#0, %c0, %73#1, %73#2, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x4x2x4x16x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8>
// Data-tiled MFMA contraction: MFMA_I32_16x16x32_I8 intrinsic with
// unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2.
%82 = iree_gpu.multi_mma %80, %81, %arg3 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32>
scf.yield %82 : vector<1x1x8x1x2x1x1x4xi32>
}
// Write the final accumulator vector back into this thread's output slice.
%78 = vector.transfer_write %77, %extracted_slice[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, tensor<1x1x8x1x2x1x1x4xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %78 into %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32>
}
} {mapping = [#gpu.thread<linear_dim_0>]}
// Store the workgroup's accumulator tile back to the read-write binding.
flow.dispatch.tensor.store %71, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After CleanupBufferAllocViewPass (iree-codegen-cleanup-buffer-alloc-view) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c0_i32 = arith.constant 0 : i32
%c0_i8 = arith.constant 0 : i8
%c2 = arith.constant 2 : index
%c256 = arith.constant 256 : index
%c512 = arith.constant 512 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x8x4x16x2x8xi8>
%70 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x4x2x4x16x2x8xi8>
%71 = scf.forall (%arg0) in (256) shared_outs(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%72 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0)
%73:3 = affine.delinearize_index %72 into (%c4, %c4, %c16) : index, index, index
%extracted_slice = tensor.extract_slice %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32>
%74 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0)
%75:2 = affine.delinearize_index %74 into (%c4, %c16) : index, index
%76 = vector.transfer_read %arg1[%c0, %c0, %c0, %73#0, %c0, %73#1, %73#2, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x8x4x2x4x16x4xi32>, vector<1x1x8x1x2x1x1x4xi32>
%77 = scf.for %arg2 = %c0 to %42 step %c1 iter_args(%arg3 = %76) -> (vector<1x1x8x1x2x1x1x4xi32>) {
%79:2 = iree_gpu.barrier_region ins(%69, %70 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>) {
^bb0(%arg4: tensor<1x1x8x4x16x2x8xi8>, %arg5: tensor<1x1x4x2x4x16x2x8xi8>):
%83 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg4) -> (tensor<1x1x8x4x16x2x8xi8>) {
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0)
%86:3 = affine.delinearize_index %85 into (%c8, %c4, %c16) : index, index, index
%extracted_slice_0 = tensor.extract_slice %66[0, %arg2, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %extracted_slice_0 into %arg7[0, 0, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x8x4x16x2x8xi8>
} {unroll_loop}
%84 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg5) -> (tensor<1x1x4x2x4x16x2x8xi8>) {
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0)
%86:4 = affine.delinearize_index %85 into (%c4, %c2, %c4, %c16) : index, index, index, index
%extracted_slice_0 = tensor.extract_slice %67[0, %arg2, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %extracted_slice_0 into %arg7[0, 0, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x4x2x4x16x2x8xi8>
} {unroll_loop}
iree_gpu.yield %83, %84 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>
} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>
%80 = vector.transfer_read %79#0[%c0, %c0, %c0, %75#0, %75#1, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : tensor<1x1x8x4x16x2x8xi8>, vector<1x1x8x1x1x2x8xi8>
%81 = vector.transfer_read %79#1[%c0, %c0, %73#0, %c0, %73#1, %73#2, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x4x2x4x16x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8>
%82 = iree_gpu.multi_mma %80, %81, %arg3 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32>
scf.yield %82 : vector<1x1x8x1x2x1x1x4xi32>
}
%78 = vector.transfer_write %77, %extracted_slice[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, tensor<1x1x8x1x2x1x1x4xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %78 into %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32>
}
} {mapping = [#gpu.thread<linear_dim_0>]}
flow.dispatch.tensor.store %71, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After GPUCombineValueBarriersPass (iree-codegen-gpu-combine-value-barriers) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c0_i32 = arith.constant 0 : i32
%c0_i8 = arith.constant 0 : i8
%c2 = arith.constant 2 : index
%c256 = arith.constant 256 : index
%c512 = arith.constant 512 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x8x4x16x2x8xi8>
%70 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x4x2x4x16x2x8xi8>
%71 = scf.forall (%arg0) in (256) shared_outs(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%72 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0)
%73:3 = affine.delinearize_index %72 into (%c4, %c4, %c16) : index, index, index
%extracted_slice = tensor.extract_slice %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32>
%74 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0)
%75:2 = affine.delinearize_index %74 into (%c4, %c16) : index, index
%76 = vector.transfer_read %arg1[%c0, %c0, %c0, %73#0, %c0, %73#1, %73#2, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x8x4x2x4x16x4xi32>, vector<1x1x8x1x2x1x1x4xi32>
%77 = scf.for %arg2 = %c0 to %42 step %c1 iter_args(%arg3 = %76) -> (vector<1x1x8x1x2x1x1x4xi32>) {
%79:2 = iree_gpu.barrier_region ins(%69, %70 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>) {
^bb0(%arg4: tensor<1x1x8x4x16x2x8xi8>, %arg5: tensor<1x1x4x2x4x16x2x8xi8>):
%83 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg4) -> (tensor<1x1x8x4x16x2x8xi8>) {
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0)
%86:3 = affine.delinearize_index %85 into (%c8, %c4, %c16) : index, index, index
%extracted_slice_0 = tensor.extract_slice %66[0, %arg2, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %extracted_slice_0 into %arg7[0, 0, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x8x4x16x2x8xi8>
} {unroll_loop}
%84 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg5) -> (tensor<1x1x4x2x4x16x2x8xi8>) {
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0)
%86:4 = affine.delinearize_index %85 into (%c4, %c2, %c4, %c16) : index, index, index, index
%extracted_slice_0 = tensor.extract_slice %67[0, %arg2, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %extracted_slice_0 into %arg7[0, 0, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x4x2x4x16x2x8xi8>
} {unroll_loop}
iree_gpu.yield %83, %84 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>
} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>
%80 = vector.transfer_read %79#0[%c0, %c0, %c0, %75#0, %75#1, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : tensor<1x1x8x4x16x2x8xi8>, vector<1x1x8x1x1x2x8xi8>
%81 = vector.transfer_read %79#1[%c0, %c0, %73#0, %c0, %73#1, %73#2, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x4x2x4x16x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8>
%82 = iree_gpu.multi_mma %80, %81, %arg3 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32>
scf.yield %82 : vector<1x1x8x1x2x1x1x4xi32>
}
%78 = vector.transfer_write %77, %extracted_slice[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, tensor<1x1x8x1x2x1x1x4xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %78 into %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32>
}
} {mapping = [#gpu.thread<linear_dim_0>]}
flow.dispatch.tensor.store %71, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After EliminateEmptyTensorsPass (iree-eliminate-empty-tensors) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c0_i32 = arith.constant 0 : i32
%c0_i8 = arith.constant 0 : i8
%c2 = arith.constant 2 : index
%c256 = arith.constant 256 : index
%c512 = arith.constant 512 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x8x4x16x2x8xi8>
%70 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x4x2x4x16x2x8xi8>
%71 = scf.forall (%arg0) in (256) shared_outs(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%72 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0)
%73:3 = affine.delinearize_index %72 into (%c4, %c4, %c16) : index, index, index
%extracted_slice = tensor.extract_slice %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32>
%74 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0)
%75:2 = affine.delinearize_index %74 into (%c4, %c16) : index, index
%76 = vector.transfer_read %arg1[%c0, %c0, %c0, %73#0, %c0, %73#1, %73#2, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x8x4x2x4x16x4xi32>, vector<1x1x8x1x2x1x1x4xi32>
%77 = scf.for %arg2 = %c0 to %42 step %c1 iter_args(%arg3 = %76) -> (vector<1x1x8x1x2x1x1x4xi32>) {
%79:2 = iree_gpu.barrier_region ins(%69, %70 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>) {
^bb0(%arg4: tensor<1x1x8x4x16x2x8xi8>, %arg5: tensor<1x1x4x2x4x16x2x8xi8>):
%83 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg4) -> (tensor<1x1x8x4x16x2x8xi8>) {
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0)
%86:3 = affine.delinearize_index %85 into (%c8, %c4, %c16) : index, index, index
%extracted_slice_0 = tensor.extract_slice %66[0, %arg2, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %extracted_slice_0 into %arg7[0, 0, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x8x4x16x2x8xi8>
} {unroll_loop}
%84 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg5) -> (tensor<1x1x4x2x4x16x2x8xi8>) {
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0)
%86:4 = affine.delinearize_index %85 into (%c4, %c2, %c4, %c16) : index, index, index, index
%extracted_slice_0 = tensor.extract_slice %67[0, %arg2, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %extracted_slice_0 into %arg7[0, 0, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x4x2x4x16x2x8xi8>
} {unroll_loop}
iree_gpu.yield %83, %84 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>
} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>
%80 = vector.transfer_read %79#0[%c0, %c0, %c0, %75#0, %75#1, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : tensor<1x1x8x4x16x2x8xi8>, vector<1x1x8x1x1x2x8xi8>
%81 = vector.transfer_read %79#1[%c0, %c0, %73#0, %c0, %73#1, %73#2, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x4x2x4x16x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8>
%82 = iree_gpu.multi_mma %80, %81, %arg3 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32>
scf.yield %82 : vector<1x1x8x1x2x1x1x4xi32>
}
%78 = vector.transfer_write %77, %extracted_slice[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, tensor<1x1x8x1x2x1x1x4xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %78 into %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32>
}
} {mapping = [#gpu.thread<linear_dim_0>]}
flow.dispatch.tensor.store %71, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After EmptyTensorToAllocTensor (empty-tensor-to-alloc-tensor) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c0_i32 = arith.constant 0 : i32
%c0_i8 = arith.constant 0 : i8
%c2 = arith.constant 2 : index
%c256 = arith.constant 256 : index
%c512 = arith.constant 512 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x8x4x16x2x8xi8>
%70 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x4x2x4x16x2x8xi8>
%71 = scf.forall (%arg0) in (256) shared_outs(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%72 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0)
%73:3 = affine.delinearize_index %72 into (%c4, %c4, %c16) : index, index, index
%extracted_slice = tensor.extract_slice %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32>
%74 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0)
%75:2 = affine.delinearize_index %74 into (%c4, %c16) : index, index
%76 = vector.transfer_read %arg1[%c0, %c0, %c0, %73#0, %c0, %73#1, %73#2, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x8x4x2x4x16x4xi32>, vector<1x1x8x1x2x1x1x4xi32>
%77 = scf.for %arg2 = %c0 to %42 step %c1 iter_args(%arg3 = %76) -> (vector<1x1x8x1x2x1x1x4xi32>) {
%79:2 = iree_gpu.barrier_region ins(%69, %70 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>) {
^bb0(%arg4: tensor<1x1x8x4x16x2x8xi8>, %arg5: tensor<1x1x4x2x4x16x2x8xi8>):
%83 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg4) -> (tensor<1x1x8x4x16x2x8xi8>) {
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0)
%86:3 = affine.delinearize_index %85 into (%c8, %c4, %c16) : index, index, index
%extracted_slice_0 = tensor.extract_slice %66[0, %arg2, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %extracted_slice_0 into %arg7[0, 0, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x8x4x16x2x8xi8>
} {unroll_loop}
%84 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg5) -> (tensor<1x1x4x2x4x16x2x8xi8>) {
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0)
%86:4 = affine.delinearize_index %85 into (%c4, %c2, %c4, %c16) : index, index, index, index
%extracted_slice_0 = tensor.extract_slice %67[0, %arg2, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %extracted_slice_0 into %arg7[0, 0, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x4x2x4x16x2x8xi8>
} {unroll_loop}
iree_gpu.yield %83, %84 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>
} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>
%80 = vector.transfer_read %79#0[%c0, %c0, %c0, %75#0, %75#1, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : tensor<1x1x8x4x16x2x8xi8>, vector<1x1x8x1x1x2x8xi8>
%81 = vector.transfer_read %79#1[%c0, %c0, %73#0, %c0, %73#1, %73#2, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x4x2x4x16x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8>
%82 = iree_gpu.multi_mma %80, %81, %arg3 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32>
scf.yield %82 : vector<1x1x8x1x2x1x1x4xi32>
}
%78 = vector.transfer_write %77, %extracted_slice[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, tensor<1x1x8x1x2x1x1x4xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %78 into %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32>
}
} {mapping = [#gpu.thread<linear_dim_0>]}
flow.dispatch.tensor.store %71, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After GPUInferMemorySpacePass (iree-codegen-gpu-infer-memory-space) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c0_i32 = arith.constant 0 : i32
%c0_i8 = arith.constant 0 : i8
%c2 = arith.constant 2 : index
%c256 = arith.constant 256 : index
%c512 = arith.constant 512 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42}
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52}
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8>
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8>
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32>
%69 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x8x4x16x2x8xi8>
%70 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x4x2x4x16x2x8xi8>
%71 = scf.forall (%arg0) in (256) shared_outs(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) {
%72 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0)
%73:3 = affine.delinearize_index %72 into (%c4, %c4, %c16) : index, index, index
%extracted_slice = tensor.extract_slice %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32>
%74 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0)
%75:2 = affine.delinearize_index %74 into (%c4, %c16) : index, index
%76 = vector.transfer_read %arg1[%c0, %c0, %c0, %73#0, %c0, %73#1, %73#2, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x8x4x2x4x16x4xi32>, vector<1x1x8x1x2x1x1x4xi32>
%77 = scf.for %arg2 = %c0 to %42 step %c1 iter_args(%arg3 = %76) -> (vector<1x1x8x1x2x1x1x4xi32>) {
%79:2 = iree_gpu.barrier_region ins(%69, %70 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>) {
^bb0(%arg4: tensor<1x1x8x4x16x2x8xi8>, %arg5: tensor<1x1x4x2x4x16x2x8xi8>):
%83 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg4) -> (tensor<1x1x8x4x16x2x8xi8>) {
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0)
%86:3 = affine.delinearize_index %85 into (%c8, %c4, %c16) : index, index, index
%extracted_slice_0 = tensor.extract_slice %66[0, %arg2, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %extracted_slice_0 into %arg7[0, 0, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x8x4x16x2x8xi8>
} {unroll_loop}
%84 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg5) -> (tensor<1x1x4x2x4x16x2x8xi8>) {
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0)
%86:4 = affine.delinearize_index %85 into (%c4, %c2, %c4, %c16) : index, index, index, index
%extracted_slice_0 = tensor.extract_slice %67[0, %arg2, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8>
%inserted_slice = tensor.insert_slice %extracted_slice_0 into %arg7[0, 0, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8>
scf.yield %inserted_slice : tensor<1x1x4x2x4x16x2x8xi8>
} {unroll_loop}
iree_gpu.yield %83, %84 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>
} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>
%80 = vector.transfer_read %79#0[%c0, %c0, %c0, %75#0, %75#1, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : tensor<1x1x8x4x16x2x8xi8>, vector<1x1x8x1x1x2x8xi8>
%81 = vector.transfer_read %79#1[%c0, %c0, %73#0, %c0, %73#1, %73#2, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x4x2x4x16x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8>
%82 = iree_gpu.multi_mma %80, %81, %arg3 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32>
scf.yield %82 : vector<1x1x8x1x2x1x1x4xi32>
}
%78 = vector.transfer_write %77, %extracted_slice[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, tensor<1x1x8x1x2x1x1x4xi32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %78 into %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32>
}
} {mapping = [#gpu.thread<linear_dim_0>]}
flow.dispatch.tensor.store %71, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62}
return
}
// -----// IR Dump After IREEComprehensiveBufferizePass (iree-codegen-iree-comprehensive-bufferize) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c0_i32 = arith.constant 0 : i32
%c0_i8 = arith.constant 0 : i8
%c2 = arith.constant 2 : index
%c256 = arith.constant 256 : index
%c512 = arith.constant 512 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%37, %42}
memref.assume_alignment %63, 1 : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%47, %52}
memref.assume_alignment %64, 1 : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%57, %62}
memref.assume_alignment %65, 1 : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%subview = memref.subview %63[%workgroup_id_y, 0, 0, 0, 0, 0, 0] [1, %42, 8, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %64[%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0] [1, %42, 4, 2, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %65[%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 2, 4, 16, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloc = memref.alloc() : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>>
scf.forall (%arg0) in (256) {
%66 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0)
%67:3 = affine.delinearize_index %66 into (%c4, %c4, %c16) : index, index, index
%subview_4 = memref.subview %subview_1[0, 0, 0, %67#0, 0, %67#1, %67#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%68 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0)
%69:2 = affine.delinearize_index %68 into (%c4, %c16) : index, index
%70 = vector.transfer_read %subview_1[%c0, %c0, %c0, %67#0, %c0, %67#1, %67#2, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8x1x2x1x1x4xi32>
%71 = scf.for %arg1 = %c0 to %42 step %c1 iter_args(%arg2 = %70) -> (vector<1x1x8x1x2x1x1x4xi32>) {
gpu.barrier
%72 = scf.for %arg3 = %c0 to %c512 step %c256 iter_args(%arg4 = %alloc) -> (memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>>) {
%77 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %arg0)
%78:3 = affine.delinearize_index %77 into (%c8, %c4, %c16) : index, index, index
%subview_6 = memref.subview %subview[0, %arg1, %78#0, %78#1, %78#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_7 = memref.subview %arg4[0, 0, %78#0, %78#1, %78#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_6, %subview_7 : memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
scf.yield %arg4 : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>>
} {unroll_loop}
%73 = scf.for %arg3 = %c0 to %c512 step %c256 iter_args(%arg4 = %alloc_2) -> (memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>>) {
%77 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %arg0)
%78:4 = affine.delinearize_index %77 into (%c4, %c2, %c4, %c16) : index, index, index, index
%subview_6 = memref.subview %subview_0[0, %arg1, %78#0, %78#1, %78#2, %78#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_7 = memref.subview %arg4[0, 0, %78#0, %78#1, %78#2, %78#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_6, %subview_7 : memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
scf.yield %arg4 : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>>
} {unroll_loop}
gpu.barrier
%74 = vector.transfer_read %72[%c0, %c0, %c0, %69#0, %69#1, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x8x1x1x2x8xi8>
%75 = vector.transfer_read %73[%c0, %c0, %67#0, %c0, %67#1, %67#2, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x1x2x1x1x2x8xi8>
%76 = iree_gpu.multi_mma %74, %75, %arg2 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32>
scf.yield %76 : vector<1x1x8x1x2x1x1x4xi32>
}
vector.transfer_write %71, %subview_4[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_5 = memref.subview %subview_1[0, 0, 0, %67#0, 0, %67#1, %67#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.copy %subview_4, %subview_5 : memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
} {mapping = [#gpu.thread<linear_dim_0>]}
%subview_3 = memref.subview %65[%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 2, 4, 16, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.copy %subview_1, %subview_3 : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
return
}
// -----// IR Dump After ResolveShapedTypeResultDims (resolve-shaped-type-result-dims) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c0_i32 = arith.constant 0 : i32
%c0_i8 = arith.constant 0 : i8
%c2 = arith.constant 2 : index
%c256 = arith.constant 256 : index
%c512 = arith.constant 512 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%37, %42}
memref.assume_alignment %63, 1 : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%47, %52}
memref.assume_alignment %64, 1 : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%57, %62}
memref.assume_alignment %65, 1 : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%subview = memref.subview %63[%workgroup_id_y, 0, 0, 0, 0, 0, 0] [1, %42, 8, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %64[%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0] [1, %42, 4, 2, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %65[%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 2, 4, 16, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloc = memref.alloc() : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>>
scf.forall (%arg0) in (256) {
%66 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0)
%67:3 = affine.delinearize_index %66 into (%c4, %c4, %c16) : index, index, index
%subview_4 = memref.subview %subview_1[0, 0, 0, %67#0, 0, %67#1, %67#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%68 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0)
%69:2 = affine.delinearize_index %68 into (%c4, %c16) : index, index
%70 = vector.transfer_read %subview_1[%c0, %c0, %c0, %67#0, %c0, %67#1, %67#2, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8x1x2x1x1x4xi32>
%71 = scf.for %arg1 = %c0 to %42 step %c1 iter_args(%arg2 = %70) -> (vector<1x1x8x1x2x1x1x4xi32>) {
gpu.barrier
%72 = scf.for %arg3 = %c0 to %c512 step %c256 iter_args(%arg4 = %alloc) -> (memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>>) {
%77 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %arg0)
%78:3 = affine.delinearize_index %77 into (%c8, %c4, %c16) : index, index, index
%subview_6 = memref.subview %subview[0, %arg1, %78#0, %78#1, %78#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_7 = memref.subview %arg4[0, 0, %78#0, %78#1, %78#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_6, %subview_7 : memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
scf.yield %arg4 : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>>
} {unroll_loop}
%73 = scf.for %arg3 = %c0 to %c512 step %c256 iter_args(%arg4 = %alloc_2) -> (memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>>) {
%77 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %arg0)
%78:4 = affine.delinearize_index %77 into (%c4, %c2, %c4, %c16) : index, index, index, index
%subview_6 = memref.subview %subview_0[0, %arg1, %78#0, %78#1, %78#2, %78#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_7 = memref.subview %arg4[0, 0, %78#0, %78#1, %78#2, %78#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_6, %subview_7 : memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
scf.yield %arg4 : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>>
} {unroll_loop}
gpu.barrier
%74 = vector.transfer_read %72[%c0, %c0, %c0, %69#0, %69#1, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x8x1x1x2x8xi8>
%75 = vector.transfer_read %73[%c0, %c0, %67#0, %c0, %67#1, %67#2, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x1x2x1x1x2x8xi8>
%76 = iree_gpu.multi_mma %74, %75, %arg2 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32>
scf.yield %76 : vector<1x1x8x1x2x1x1x4xi32>
}
vector.transfer_write %71, %subview_4[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_5 = memref.subview %subview_1[0, 0, 0, %67#0, 0, %67#1, %67#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.copy %subview_4, %subview_5 : memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
} {mapping = [#gpu.thread<linear_dim_0>]}
%subview_3 = memref.subview %65[%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 2, 4, 16, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.copy %subview_1, %subview_3 : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c0_i32 = arith.constant 0 : i32
%c0_i8 = arith.constant 0 : i8
%c2 = arith.constant 2 : index
%c256 = arith.constant 256 : index
%c512 = arith.constant 512 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%37, %42}
memref.assume_alignment %63, 1 : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%47, %52}
memref.assume_alignment %64, 1 : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%57, %62}
memref.assume_alignment %65, 1 : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%subview = memref.subview %63[%workgroup_id_y, 0, 0, 0, 0, 0, 0] [1, %42, 8, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %64[%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0] [1, %42, 4, 2, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %65[%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 2, 4, 16, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloc = memref.alloc() : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>>
scf.forall (%arg0) in (256) {
%66 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0)
%67:3 = affine.delinearize_index %66 into (%c4, %c4, %c16) : index, index, index
%subview_4 = memref.subview %subview_1[0, 0, 0, %67#0, 0, %67#1, %67#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%68 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0)
%69:2 = affine.delinearize_index %68 into (%c4, %c16) : index, index
%70 = vector.transfer_read %subview_1[%c0, %c0, %c0, %67#0, %c0, %67#1, %67#2, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8x1x2x1x1x4xi32>
%71 = scf.for %arg1 = %c0 to %42 step %c1 iter_args(%arg2 = %70) -> (vector<1x1x8x1x2x1x1x4xi32>) {
gpu.barrier
scf.for %arg3 = %c0 to %c512 step %c256 {
%75 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %arg0)
%76:3 = affine.delinearize_index %75 into (%c8, %c4, %c16) : index, index, index
%subview_6 = memref.subview %subview[0, %arg1, %76#0, %76#1, %76#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_7 = memref.subview %alloc[0, 0, %76#0, %76#1, %76#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_6, %subview_7 : memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
} {unroll_loop}
scf.for %arg3 = %c0 to %c512 step %c256 {
%75 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %arg0)
%76:4 = affine.delinearize_index %75 into (%c4, %c2, %c4, %c16) : index, index, index, index
%subview_6 = memref.subview %subview_0[0, %arg1, %76#0, %76#1, %76#2, %76#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_7 = memref.subview %alloc_2[0, 0, %76#0, %76#1, %76#2, %76#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_6, %subview_7 : memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
} {unroll_loop}
gpu.barrier
%72 = vector.transfer_read %alloc[%c0, %c0, %c0, %69#0, %69#1, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x8x1x1x2x8xi8>
%73 = vector.transfer_read %alloc_2[%c0, %c0, %67#0, %c0, %67#1, %67#2, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x1x2x1x1x2x8xi8>
%74 = iree_gpu.multi_mma %72, %73, %arg2 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32>
scf.yield %74 : vector<1x1x8x1x2x1x1x4xi32>
}
vector.transfer_write %71, %subview_4[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_5 = memref.subview %subview_1[0, 0, 0, %67#0, 0, %67#1, %67#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.copy %subview_4, %subview_5 : memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
} {mapping = [#gpu.thread<linear_dim_0>]}
%subview_3 = memref.subview %65[%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 2, 4, 16, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.copy %subview_1, %subview_3 : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c0_i32 = arith.constant 0 : i32
%c0_i8 = arith.constant 0 : i8
%c2 = arith.constant 2 : index
%c256 = arith.constant 256 : index
%c512 = arith.constant 512 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%37, %42}
memref.assume_alignment %63, 1 : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%47, %52}
memref.assume_alignment %64, 1 : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%57, %62}
memref.assume_alignment %65, 1 : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%subview = memref.subview %63[%workgroup_id_y, 0, 0, 0, 0, 0, 0] [1, %42, 8, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %64[%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0] [1, %42, 4, 2, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %65[%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 2, 4, 16, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloc = memref.alloc() : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>>
scf.forall (%arg0) in (256) {
%66 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0)
%67:3 = affine.delinearize_index %66 into (%c4, %c4, %c16) : index, index, index
%subview_3 = memref.subview %subview_1[0, 0, 0, %67#0, 0, %67#1, %67#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%68 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0)
%69:2 = affine.delinearize_index %68 into (%c4, %c16) : index, index
%70 = vector.transfer_read %subview_1[%c0, %c0, %c0, %67#0, %c0, %67#1, %67#2, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8x1x2x1x1x4xi32>
%71 = scf.for %arg1 = %c0 to %42 step %c1 iter_args(%arg2 = %70) -> (vector<1x1x8x1x2x1x1x4xi32>) {
gpu.barrier
scf.for %arg3 = %c0 to %c512 step %c256 {
%75 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %arg0)
%76:3 = affine.delinearize_index %75 into (%c8, %c4, %c16) : index, index, index
%subview_4 = memref.subview %subview[0, %arg1, %76#0, %76#1, %76#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_5 = memref.subview %alloc[0, 0, %76#0, %76#1, %76#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_4, %subview_5 : memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
} {unroll_loop}
scf.for %arg3 = %c0 to %c512 step %c256 {
%75 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %arg0)
%76:4 = affine.delinearize_index %75 into (%c4, %c2, %c4, %c16) : index, index, index, index
%subview_4 = memref.subview %subview_0[0, %arg1, %76#0, %76#1, %76#2, %76#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_5 = memref.subview %alloc_2[0, 0, %76#0, %76#1, %76#2, %76#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_4, %subview_5 : memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
} {unroll_loop}
gpu.barrier
%72 = vector.transfer_read %alloc[%c0, %c0, %c0, %69#0, %69#1, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x8x1x1x2x8xi8>
%73 = vector.transfer_read %alloc_2[%c0, %c0, %67#0, %c0, %67#1, %67#2, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x1x2x1x1x2x8xi8>
%74 = iree_gpu.multi_mma %72, %73, %arg2 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32>
scf.yield %74 : vector<1x1x8x1x2x1x1x4xi32>
}
vector.transfer_write %71, %subview_3[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.copy %subview_3, %subview_3 : memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
} {mapping = [#gpu.thread<linear_dim_0>]}
memref.copy %subview_1, %subview_1 : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c0_i32 = arith.constant 0 : i32
%c0_i8 = arith.constant 0 : i8
%c2 = arith.constant 2 : index
%c256 = arith.constant 256 : index
%c512 = arith.constant 512 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%37, %42}
memref.assume_alignment %63, 1 : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%47, %52}
memref.assume_alignment %64, 1 : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%57, %62}
memref.assume_alignment %65, 1 : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%subview = memref.subview %63[%workgroup_id_y, 0, 0, 0, 0, 0, 0] [1, %42, 8, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %64[%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0] [1, %42, 4, 2, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %65[%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 2, 4, 16, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloc = memref.alloc() : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>>
scf.forall (%arg0) in (256) {
%66 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0)
%67:3 = affine.delinearize_index %66 into (%c4, %c4, %c16) : index, index, index
%subview_3 = memref.subview %subview_1[0, 0, 0, %67#0, 0, %67#1, %67#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%68 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0)
%69:2 = affine.delinearize_index %68 into (%c4, %c16) : index, index
%70 = vector.transfer_read %subview_1[%c0, %c0, %c0, %67#0, %c0, %67#1, %67#2, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8x1x2x1x1x4xi32>
%71 = scf.for %arg1 = %c0 to %42 step %c1 iter_args(%arg2 = %70) -> (vector<1x1x8x1x2x1x1x4xi32>) {
gpu.barrier
scf.for %arg3 = %c0 to %c512 step %c256 {
%75 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %arg0)
%76:3 = affine.delinearize_index %75 into (%c8, %c4, %c16) : index, index, index
%subview_4 = memref.subview %subview[0, %arg1, %76#0, %76#1, %76#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_5 = memref.subview %alloc[0, 0, %76#0, %76#1, %76#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_4, %subview_5 : memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
} {unroll_loop}
scf.for %arg3 = %c0 to %c512 step %c256 {
%75 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %arg0)
%76:4 = affine.delinearize_index %75 into (%c4, %c2, %c4, %c16) : index, index, index, index
%subview_4 = memref.subview %subview_0[0, %arg1, %76#0, %76#1, %76#2, %76#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_5 = memref.subview %alloc_2[0, 0, %76#0, %76#1, %76#2, %76#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_4, %subview_5 : memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
} {unroll_loop}
gpu.barrier
%72 = vector.transfer_read %alloc[%c0, %c0, %c0, %69#0, %69#1, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x8x1x1x2x8xi8>
%73 = vector.transfer_read %alloc_2[%c0, %c0, %67#0, %c0, %67#1, %67#2, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x1x2x1x1x2x8xi8>
%74 = iree_gpu.multi_mma %72, %73, %arg2 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32>
scf.yield %74 : vector<1x1x8x1x2x1x1x4xi32>
}
vector.transfer_write %71, %subview_3[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
} {mapping = [#gpu.thread<linear_dim_0>]}
return
}
// -----// IR Dump After CleanupBufferAllocViewPass (iree-codegen-cleanup-buffer-alloc-view) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c0_i32 = arith.constant 0 : i32
%c0_i8 = arith.constant 0 : i8
%c2 = arith.constant 2 : index
%c256 = arith.constant 256 : index
%c512 = arith.constant 512 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%37, %42}
memref.assume_alignment %63, 1 : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%47, %52}
memref.assume_alignment %64, 1 : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%57, %62}
memref.assume_alignment %65, 1 : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%subview = memref.subview %63[%workgroup_id_y, 0, 0, 0, 0, 0, 0] [1, %42, 8, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %64[%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0] [1, %42, 4, 2, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %65[%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 2, 4, 16, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloc = memref.alloc() : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>>
scf.forall (%arg0) in (256) {
%66 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0)
%67:3 = affine.delinearize_index %66 into (%c4, %c4, %c16) : index, index, index
%subview_3 = memref.subview %subview_1[0, 0, 0, %67#0, 0, %67#1, %67#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%68 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0)
%69:2 = affine.delinearize_index %68 into (%c4, %c16) : index, index
%70 = vector.transfer_read %subview_1[%c0, %c0, %c0, %67#0, %c0, %67#1, %67#2, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8x1x2x1x1x4xi32>
%71 = scf.for %arg1 = %c0 to %42 step %c1 iter_args(%arg2 = %70) -> (vector<1x1x8x1x2x1x1x4xi32>) {
gpu.barrier
scf.for %arg3 = %c0 to %c512 step %c256 {
%75 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %arg0)
%76:3 = affine.delinearize_index %75 into (%c8, %c4, %c16) : index, index, index
%subview_4 = memref.subview %subview[0, %arg1, %76#0, %76#1, %76#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_5 = memref.subview %alloc[0, 0, %76#0, %76#1, %76#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_4, %subview_5 : memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
} {unroll_loop}
scf.for %arg3 = %c0 to %c512 step %c256 {
%75 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %arg0)
%76:4 = affine.delinearize_index %75 into (%c4, %c2, %c4, %c16) : index, index, index, index
%subview_4 = memref.subview %subview_0[0, %arg1, %76#0, %76#1, %76#2, %76#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_5 = memref.subview %alloc_2[0, 0, %76#0, %76#1, %76#2, %76#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_4, %subview_5 : memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
} {unroll_loop}
gpu.barrier
%72 = vector.transfer_read %alloc[%c0, %c0, %c0, %69#0, %69#1, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x8x1x1x2x8xi8>
%73 = vector.transfer_read %alloc_2[%c0, %c0, %67#0, %c0, %67#1, %67#2, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x1x2x1x1x2x8xi8>
%74 = iree_gpu.multi_mma %72, %73, %arg2 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32>
scf.yield %74 : vector<1x1x8x1x2x1x1x4xi32>
}
vector.transfer_write %71, %subview_3[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
} {mapping = [#gpu.thread<linear_dim_0>]}
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c0_i32 = arith.constant 0 : i32
%c0_i8 = arith.constant 0 : i8
%c2 = arith.constant 2 : index
%c256 = arith.constant 256 : index
%c512 = arith.constant 512 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%37, %42}
memref.assume_alignment %63, 1 : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%47, %52}
memref.assume_alignment %64, 1 : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%57, %62}
memref.assume_alignment %65, 1 : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%subview = memref.subview %63[%workgroup_id_y, 0, 0, 0, 0, 0, 0] [1, %42, 8, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %64[%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0] [1, %42, 4, 2, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %65[%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 2, 4, 16, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloc = memref.alloc() : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>>
scf.forall (%arg0) in (256) {
%66 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0)
%67:3 = affine.delinearize_index %66 into (%c4, %c4, %c16) : index, index, index
%subview_3 = memref.subview %subview_1[0, 0, 0, %67#0, 0, %67#1, %67#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%68 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0)
%69:2 = affine.delinearize_index %68 into (%c4, %c16) : index, index
%70 = vector.transfer_read %subview_1[%c0, %c0, %c0, %67#0, %c0, %67#1, %67#2, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8x1x2x1x1x4xi32>
%71 = scf.for %arg1 = %c0 to %42 step %c1 iter_args(%arg2 = %70) -> (vector<1x1x8x1x2x1x1x4xi32>) {
gpu.barrier
scf.for %arg3 = %c0 to %c512 step %c256 {
%75 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %arg0)
%76:3 = affine.delinearize_index %75 into (%c8, %c4, %c16) : index, index, index
%subview_4 = memref.subview %subview[0, %arg1, %76#0, %76#1, %76#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_5 = memref.subview %alloc[0, 0, %76#0, %76#1, %76#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_4, %subview_5 : memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
} {unroll_loop}
scf.for %arg3 = %c0 to %c512 step %c256 {
%75 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %arg0)
%76:4 = affine.delinearize_index %75 into (%c4, %c2, %c4, %c16) : index, index, index, index
%subview_4 = memref.subview %subview_0[0, %arg1, %76#0, %76#1, %76#2, %76#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_5 = memref.subview %alloc_2[0, 0, %76#0, %76#1, %76#2, %76#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_4, %subview_5 : memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
} {unroll_loop}
gpu.barrier
%72 = vector.transfer_read %alloc[%c0, %c0, %c0, %69#0, %69#1, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x8x1x1x2x8xi8>
%73 = vector.transfer_read %alloc_2[%c0, %c0, %67#0, %c0, %67#1, %67#2, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x1x2x1x1x2x8xi8>
%74 = iree_gpu.multi_mma %72, %73, %arg2 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32>
scf.yield %74 : vector<1x1x8x1x2x1x1x4xi32>
}
vector.transfer_write %71, %subview_3[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
} {mapping = [#gpu.thread<linear_dim_0>]}
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c0_i32 = arith.constant 0 : i32
%c0_i8 = arith.constant 0 : i8
%c2 = arith.constant 2 : index
%c256 = arith.constant 256 : index
%c512 = arith.constant 512 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%37, %42}
memref.assume_alignment %63, 1 : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%47, %52}
memref.assume_alignment %64, 1 : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%57, %62}
memref.assume_alignment %65, 1 : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%subview = memref.subview %63[%workgroup_id_y, 0, 0, 0, 0, 0, 0] [1, %42, 8, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %64[%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0] [1, %42, 4, 2, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %65[%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 2, 4, 16, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloc = memref.alloc() : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>>
scf.forall (%arg0) in (256) {
%66 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0)
%67:3 = affine.delinearize_index %66 into (%c4, %c4, %c16) : index, index, index
%subview_3 = memref.subview %subview_1[0, 0, 0, %67#0, 0, %67#1, %67#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%68 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0)
%69:2 = affine.delinearize_index %68 into (%c4, %c16) : index, index
%70 = vector.transfer_read %subview_1[%c0, %c0, %c0, %67#0, %c0, %67#1, %67#2, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8x1x2x1x1x4xi32>
%71 = scf.for %arg1 = %c0 to %42 step %c1 iter_args(%arg2 = %70) -> (vector<1x1x8x1x2x1x1x4xi32>) {
gpu.barrier
scf.for %arg3 = %c0 to %c512 step %c256 {
%75 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %arg0)
%76:3 = affine.delinearize_index %75 into (%c8, %c4, %c16) : index, index, index
%subview_4 = memref.subview %subview[0, %arg1, %76#0, %76#1, %76#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_5 = memref.subview %alloc[0, 0, %76#0, %76#1, %76#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_4, %subview_5 : memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
} {unroll_loop}
scf.for %arg3 = %c0 to %c512 step %c256 {
%75 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %arg0)
%76:4 = affine.delinearize_index %75 into (%c4, %c2, %c4, %c16) : index, index, index, index
%subview_4 = memref.subview %subview_0[0, %arg1, %76#0, %76#1, %76#2, %76#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_5 = memref.subview %alloc_2[0, 0, %76#0, %76#1, %76#2, %76#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_4, %subview_5 : memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
} {unroll_loop}
gpu.barrier
%72 = vector.transfer_read %alloc[%c0, %c0, %c0, %69#0, %69#1, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x8x1x1x2x8xi8>
%73 = vector.transfer_read %alloc_2[%c0, %c0, %67#0, %c0, %67#1, %67#2, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x1x2x1x1x2x8xi8>
%74 = iree_gpu.multi_mma %72, %73, %arg2 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32>
scf.yield %74 : vector<1x1x8x1x2x1x1x4xi32>
}
vector.transfer_write %71, %subview_3[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
} {mapping = [#gpu.thread<linear_dim_0>]}
return
}
// -----// IR Dump After GPUVerifyDistributionPass (iree-codegen-gpu-verify-distribution) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c0_i32 = arith.constant 0 : i32
%c0_i8 = arith.constant 0 : i8
%c2 = arith.constant 2 : index
%c256 = arith.constant 256 : index
%c512 = arith.constant 512 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%37, %42}
memref.assume_alignment %63, 1 : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%47, %52}
memref.assume_alignment %64, 1 : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%57, %62}
memref.assume_alignment %65, 1 : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%subview = memref.subview %63[%workgroup_id_y, 0, 0, 0, 0, 0, 0] [1, %42, 8, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %64[%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0] [1, %42, 4, 2, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %65[%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 2, 4, 16, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloc = memref.alloc() : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>>
scf.forall (%arg0) in (256) {
%66 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0)
%67:3 = affine.delinearize_index %66 into (%c4, %c4, %c16) : index, index, index
%subview_3 = memref.subview %subview_1[0, 0, 0, %67#0, 0, %67#1, %67#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%68 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0)
%69:2 = affine.delinearize_index %68 into (%c4, %c16) : index, index
%70 = vector.transfer_read %subview_1[%c0, %c0, %c0, %67#0, %c0, %67#1, %67#2, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8x1x2x1x1x4xi32>
%71 = scf.for %arg1 = %c0 to %42 step %c1 iter_args(%arg2 = %70) -> (vector<1x1x8x1x2x1x1x4xi32>) {
gpu.barrier
scf.for %arg3 = %c0 to %c512 step %c256 {
%75 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %arg0)
%76:3 = affine.delinearize_index %75 into (%c8, %c4, %c16) : index, index, index
%subview_4 = memref.subview %subview[0, %arg1, %76#0, %76#1, %76#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_5 = memref.subview %alloc[0, 0, %76#0, %76#1, %76#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_4, %subview_5 : memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
} {unroll_loop}
scf.for %arg3 = %c0 to %c512 step %c256 {
%75 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %arg0)
%76:4 = affine.delinearize_index %75 into (%c4, %c2, %c4, %c16) : index, index, index, index
%subview_4 = memref.subview %subview_0[0, %arg1, %76#0, %76#1, %76#2, %76#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_5 = memref.subview %alloc_2[0, 0, %76#0, %76#1, %76#2, %76#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_4, %subview_5 : memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
} {unroll_loop}
gpu.barrier
%72 = vector.transfer_read %alloc[%c0, %c0, %c0, %69#0, %69#1, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x8x1x1x2x8xi8>
%73 = vector.transfer_read %alloc_2[%c0, %c0, %67#0, %c0, %67#1, %67#2, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x1x2x1x1x2x8xi8>
%74 = iree_gpu.multi_mma %72, %73, %arg2 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32>
scf.yield %74 : vector<1x1x8x1x2x1x1x4xi32>
}
vector.transfer_write %71, %subview_3[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
} {mapping = [#gpu.thread<linear_dim_0>]}
return
}
// -----// IR Dump After GPUDistributeForallPass (iree-codegen-gpu-distribute-forall) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%0 = affine.apply affine_map<()[s0, s1, s2] -> (s0 + s1 * 256 + s2 * 256)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%c0_i32 = arith.constant 0 : i32
%c0_i8 = arith.constant 0 : i8
%c2 = arith.constant 2 : index
%c256 = arith.constant 256 : index
%c512 = arith.constant 512 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32_i64 = arith.constant 32 : i64
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%18 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%19 = arith.extui %1 : i32 to i64
%20 = arith.extui %2 : i32 to i64
%21 = arith.shli %20, %c32_i64 : i64
%22 = arith.ori %19, %21 : i64
%23 = arith.index_castui %22 : i64 to index
%24 = arith.extui %3 : i32 to i64
%25 = arith.extui %4 : i32 to i64
%26 = arith.shli %25, %c32_i64 : i64
%27 = arith.ori %24, %26 : i64
%28 = arith.index_castui %27 : i64 to index
%29 = arith.extui %5 : i32 to i64
%30 = arith.extui %6 : i32 to i64
%31 = arith.shli %30, %c32_i64 : i64
%32 = arith.ori %29, %31 : i64
%33 = arith.index_castui %32 : i64 to index
%34 = arith.extui %7 : i32 to i64
%35 = arith.extui %8 : i32 to i64
%36 = arith.shli %35, %c32_i64 : i64
%37 = arith.ori %34, %36 : i64
%38 = arith.index_castui %37 : i64 to index
%39 = arith.extui %9 : i32 to i64
%40 = arith.extui %10 : i32 to i64
%41 = arith.shli %40, %c32_i64 : i64
%42 = arith.ori %39, %41 : i64
%43 = arith.index_castui %42 : i64 to index
%44 = arith.extui %11 : i32 to i64
%45 = arith.extui %12 : i32 to i64
%46 = arith.shli %45, %c32_i64 : i64
%47 = arith.ori %44, %46 : i64
%48 = arith.index_castui %47 : i64 to index
%49 = arith.extui %13 : i32 to i64
%50 = arith.extui %14 : i32 to i64
%51 = arith.shli %50, %c32_i64 : i64
%52 = arith.ori %49, %51 : i64
%53 = arith.index_castui %52 : i64 to index
%54 = arith.extui %15 : i32 to i64
%55 = arith.extui %16 : i32 to i64
%56 = arith.shli %55, %c32_i64 : i64
%57 = arith.ori %54, %56 : i64
%58 = arith.index_castui %57 : i64 to index
%59 = arith.extui %17 : i32 to i64
%60 = arith.extui %18 : i32 to i64
%61 = arith.shli %60, %c32_i64 : i64
%62 = arith.ori %59, %61 : i64
%63 = arith.index_castui %62 : i64 to index
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%23) flags("ReadOnly|Indirect") : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%38, %43}
memref.assume_alignment %64, 1 : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%28) flags("ReadOnly|Indirect") : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%48, %53}
memref.assume_alignment %65, 1 : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%66 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%33) flags(Indirect) : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%58, %63}
memref.assume_alignment %66, 1 : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%subview = memref.subview %64[%workgroup_id_y, 0, 0, 0, 0, 0, 0] [1, %43, 8, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %65[%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0] [1, %43, 4, 2, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %66[%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 2, 4, 16, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloc = memref.alloc() : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>>
%c256_3 = arith.constant 256 : index
%c0_4 = arith.constant 0 : index
%c256_5 = arith.constant 256 : index
%c256_6 = arith.constant 256 : index
scf.for %arg0 = %c0_4 to %c256_5 step %c256_6 {
%67 = affine.apply affine_map<(d0)[s0, s1, s2] -> (d0 + s0 + s1 * 256 + s2 * 256)>(%arg0)[%thread_id_x, %thread_id_y, %thread_id_z]
%68 = affine.delinearize_index %67 into (%c256_3) : index
%69 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%68)
%70:3 = affine.delinearize_index %69 into (%c4, %c4, %c16) : index, index, index
%subview_7 = memref.subview %subview_1[0, 0, 0, %70#0, 0, %70#1, %70#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%71 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%68)
%72:2 = affine.delinearize_index %71 into (%c4, %c16) : index, index
%73 = vector.transfer_read %subview_1[%c0, %c0, %c0, %70#0, %c0, %70#1, %70#2, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8x1x2x1x1x4xi32>
%74 = scf.for %arg1 = %c0 to %43 step %c1 iter_args(%arg2 = %73) -> (vector<1x1x8x1x2x1x1x4xi32>) {
gpu.barrier
scf.for %arg3 = %c0 to %c512 step %c256 {
%78 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %68)
%79:3 = affine.delinearize_index %78 into (%c8, %c4, %c16) : index, index, index
%subview_8 = memref.subview %subview[0, %arg1, %79#0, %79#1, %79#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_9 = memref.subview %alloc[0, 0, %79#0, %79#1, %79#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_8, %subview_9 : memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
} {unroll_loop}
scf.for %arg3 = %c0 to %c512 step %c256 {
%78 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %68)
%79:4 = affine.delinearize_index %78 into (%c4, %c2, %c4, %c16) : index, index, index, index
%subview_8 = memref.subview %subview_0[0, %arg1, %79#0, %79#1, %79#2, %79#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_9 = memref.subview %alloc_2[0, 0, %79#0, %79#1, %79#2, %79#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_8, %subview_9 : memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
} {unroll_loop}
gpu.barrier
%75 = vector.transfer_read %alloc[%c0, %c0, %c0, %72#0, %72#1, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x8x1x1x2x8xi8>
%76 = vector.transfer_read %alloc_2[%c0, %c0, %70#0, %c0, %70#1, %70#2, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x1x2x1x1x2x8xi8>
%77 = iree_gpu.multi_mma %75, %76, %arg2 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32>
scf.yield %77 : vector<1x1x8x1x2x1x1x4xi32>
}
vector.transfer_write %74, %subview_7[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
}
return
}
// -----// IR Dump After VectorizeMemrefCopyPass (iree-codegen-vectorize-memref-copy) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c32_i64 = arith.constant 32 : i64
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%c4 = arith.constant 4 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c2 = arith.constant 2 : index
%c0_i8 = arith.constant 0 : i8
%c0_i32 = arith.constant 0 : i32
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%37, %42}
memref.assume_alignment %63, 1 : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%47, %52}
memref.assume_alignment %64, 1 : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%57, %62}
memref.assume_alignment %65, 1 : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%subview = memref.subview %63[%workgroup_id_y, 0, 0, 0, 0, 0, 0] [1, %42, 8, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %64[%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0] [1, %42, 4, 2, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %65[%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 2, 4, 16, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloc = memref.alloc() : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>>
scf.for %arg0 = %c0 to %c256 step %c256 {
%66 = affine.apply affine_map<(d0)[s0, s1, s2] -> (d0 + s0 + s1 * 256 + s2 * 256)>(%arg0)[%thread_id_x, %thread_id_y, %thread_id_z]
%67 = affine.delinearize_index %66 into (%c256) : index
%68 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%67)
%69:3 = affine.delinearize_index %68 into (%c4, %c4, %c16) : index, index, index
%subview_3 = memref.subview %subview_1[0, 0, 0, %69#0, 0, %69#1, %69#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%70 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%67)
%71:2 = affine.delinearize_index %70 into (%c4, %c16) : index, index
%72 = vector.transfer_read %subview_1[%c0, %c0, %c0, %69#0, %c0, %69#1, %69#2, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8x1x2x1x1x4xi32>
%73 = scf.for %arg1 = %c0 to %42 step %c1 iter_args(%arg2 = %72) -> (vector<1x1x8x1x2x1x1x4xi32>) {
gpu.barrier
scf.for %arg3 = %c0 to %c512 step %c256 {
%77 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %67)
%78:3 = affine.delinearize_index %77 into (%c8, %c4, %c16) : index, index, index
%subview_4 = memref.subview %subview[0, %arg1, %78#0, %78#1, %78#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_5 = memref.subview %alloc[0, 0, %78#0, %78#1, %78#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
%79 = vector.transfer_read %subview_4[%c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x1x1x1x2x8xi8>
vector.transfer_write %79, %subview_5[%c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true]} : vector<1x1x1x1x1x2x8xi8>, memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
} {unroll_loop}
scf.for %arg3 = %c0 to %c512 step %c256 {
%77 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %67)
%78:4 = affine.delinearize_index %77 into (%c4, %c2, %c4, %c16) : index, index, index, index
%subview_4 = memref.subview %subview_0[0, %arg1, %78#0, %78#1, %78#2, %78#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_5 = memref.subview %alloc_2[0, 0, %78#0, %78#1, %78#2, %78#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
%79 = vector.transfer_read %subview_4[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x1x1x1x1x2x8xi8>
vector.transfer_write %79, %subview_5[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x1x1x1x1x2x8xi8>, memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
} {unroll_loop}
gpu.barrier
%74 = vector.transfer_read %alloc[%c0, %c0, %c0, %71#0, %71#1, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x8x1x1x2x8xi8>
%75 = vector.transfer_read %alloc_2[%c0, %c0, %69#0, %c0, %69#1, %69#2, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x1x2x1x1x2x8xi8>
%76 = iree_gpu.multi_mma %74, %75, %arg2 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32>
scf.yield %76 : vector<1x1x8x1x2x1x1x4xi32>
}
vector.transfer_write %73, %subview_3[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
}
return
}
// -----// IR Dump After UnrollToIntrinsicsPass (iree-gpu-unroll-to-intrinsics) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c32_i64 = arith.constant 32 : i64
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%c4 = arith.constant 4 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c2 = arith.constant 2 : index
%c0_i8 = arith.constant 0 : i8
%c0_i32 = arith.constant 0 : i32
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%37, %42}
memref.assume_alignment %63, 1 : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%47, %52}
memref.assume_alignment %64, 1 : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%57, %62}
memref.assume_alignment %65, 1 : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%subview = memref.subview %63[%workgroup_id_y, 0, 0, 0, 0, 0, 0] [1, %42, 8, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %64[%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0] [1, %42, 4, 2, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %65[%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 2, 4, 16, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%alloc = memref.alloc() : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>>
scf.for %arg0 = %c0 to %c256 step %c256 {
%66 = affine.apply affine_map<(d0)[s0, s1, s2] -> (d0 + s0 + s1 * 256 + s2 * 256)>(%arg0)[%thread_id_x, %thread_id_y, %thread_id_z]
%67 = affine.delinearize_index %66 into (%c256) : index
%68 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%67)
%69:3 = affine.delinearize_index %68 into (%c4, %c4, %c16) : index, index, index
%subview_3 = memref.subview %subview_1[0, 0, 0, %69#0, 0, %69#1, %69#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%70 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%67)
%71:2 = affine.delinearize_index %70 into (%c4, %c16) : index, index
%72 = vector.transfer_read %subview_1[%c0, %c0, %c0, %69#0, %c0, %69#1, %69#2, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8x1x2x1x1x4xi32>
%73 = scf.for %arg1 = %c0 to %42 step %c1 iter_args(%arg2 = %72) -> (vector<1x1x8x1x2x1x1x4xi32>) {
gpu.barrier
scf.for %arg3 = %c0 to %c512 step %c256 {
%81 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %67)
%82:3 = affine.delinearize_index %81 into (%c8, %c4, %c16) : index, index, index
%subview_4 = memref.subview %subview[0, %arg1, %82#0, %82#1, %82#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_5 = memref.subview %alloc[0, 0, %82#0, %82#1, %82#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
%83 = vector.transfer_read %subview_4[%c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x1x1x1x2x8xi8>
vector.transfer_write %83, %subview_5[%c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true]} : vector<1x1x1x1x1x2x8xi8>, memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
} {unroll_loop}
scf.for %arg3 = %c0 to %c512 step %c256 {
%81 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %67)
%82:4 = affine.delinearize_index %81 into (%c4, %c2, %c4, %c16) : index, index, index, index
%subview_4 = memref.subview %subview_0[0, %arg1, %82#0, %82#1, %82#2, %82#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_5 = memref.subview %alloc_2[0, 0, %82#0, %82#1, %82#2, %82#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
%83 = vector.transfer_read %subview_4[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x1x1x1x1x2x8xi8>
vector.transfer_write %83, %subview_5[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x1x1x1x1x2x8xi8>, memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
} {unroll_loop}
gpu.barrier
%74 = vector.transfer_read %alloc[%c0, %c0, %c0, %71#0, %71#1, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x8x1x1x2x8xi8>
%75 = vector.transfer_read %alloc_2[%c0, %c0, %69#0, %c0, %69#1, %69#2, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x1x2x1x1x2x8xi8>
%76 = vector.extract %74[0, 0] : vector<8x1x1x2x8xi8> from vector<1x1x8x1x1x2x8xi8>
%77 = vector.extract %75[0, 0] : vector<1x2x1x1x2x8xi8> from vector<1x1x1x2x1x1x2x8xi8>
%78 = vector.extract %arg2[0, 0] : vector<8x1x2x1x1x4xi32> from vector<1x1x8x1x2x1x1x4xi32>
%79 = iree_gpu.multi_mma %76, %77, %78 {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = [], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<8x1x1x2x8xi8>, vector<1x2x1x1x2x8xi8> into vector<8x1x2x1x1x4xi32>
%80 = vector.broadcast %79 : vector<8x1x2x1x1x4xi32> to vector<1x1x8x1x2x1x1x4xi32>
scf.yield %80 : vector<1x1x8x1x2x1x1x4xi32>
}
vector.transfer_write %73, %subview_3[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
}
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
// Dispatch for a data-tiled int8 matmul-like kernel targeting gfx942, lowered via the
// LLVMGPUTileAndFuse pipeline: 256 threads per workgroup, subgroup size 64, shared-memory
// prefetch disabled and bank-conflict reduction padding disabled (see pipeline_options).
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
// Index/scalar constants used throughout; %c0_i8 / %c0_i32 are the transfer_read padding values.
%c32_i64 = arith.constant 32 : i64
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%c4 = arith.constant 4 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c2 = arith.constant 2 : index
%c0_i8 = arith.constant 0 : i8
%c0_i32 = arith.constant 0 : i32
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
// Load the 18 push constants. They are consumed below in pairs (lo, hi) to rebuild
// 64-bit values: buffer byte offsets (%22, %27, %32) and dynamic memref sizes
// (%37/%42, %47/%52, %57/%62).
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
// Recombine each (lo32, hi32) push-constant pair into an i64 (lo | (hi << 32)), then
// cast to index. The same 5-op pattern repeats nine times, once per 64-bit value.
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
// Bindings: %63 = LHS (data-tiled i8), %64 = RHS (data-tiled i8), %65 = i32 accumulator
// output. All have two dynamic leading dims supplied by the decoded push constants.
// assume_alignment of 1 records that only byte alignment is guaranteed at the offset.
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%37, %42}
memref.assume_alignment %63, 1 : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%47, %52}
memref.assume_alignment %64, 1 : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%57, %62}
memref.assume_alignment %65, 1 : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
// Workgroup tiling: workgroup_id_y selects the LHS/output row tile, workgroup_id_x the
// RHS/output column tile. The reduction dim (%42) stays whole and is looped over below.
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%subview = memref.subview %63[%workgroup_id_y, 0, 0, 0, 0, 0, 0] [1, %42, 8, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %64[%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0] [1, %42, 4, 2, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %65[%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 2, 4, 16, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
// Workgroup shared-memory staging buffers for one K-step of the LHS and RHS tiles.
%alloc = memref.alloc() : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>>
// Flatten (tid.x, tid.y, tid.z) into a linear thread id %66/%67 in [0, 256), then split it
// as %69 = (4, 4, 16) coordinates for addressing the accumulator/RHS, and %71 = the
// lane-within-subgroup split (4, 16) of %67 mod 64 for addressing the LHS.
%66 = affine.apply affine_map<()[s0, s1, s2] -> (s0 + s1 * 256 + s2 * 256)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%67 = affine.delinearize_index %66 into (%c256) : index
%68 = affine.apply affine_map<()[s0] -> (s0 mod 256)>()[%67]
%69:3 = affine.delinearize_index %68 into (%c4, %c4, %c16) : index, index, index
%subview_3 = memref.subview %subview_1[0, 0, 0, %69#0, 0, %69#1, %69#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%70 = affine.apply affine_map<()[s0] -> (s0 mod 64)>()[%67]
%71:2 = affine.delinearize_index %70 into (%c4, %c16) : index, index
// Seed the accumulator from the output buffer (read-modify-write accumulation).
%72 = vector.transfer_read %subview_1[%c0, %c0, %c0, %69#0, %c0, %69#1, %69#2, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8x1x2x1x1x4xi32>
// Main reduction loop over the dynamic K extent %42; carries the accumulator vector.
%73 = scf.for %arg0 = %c0 to %42 step %c1 iter_args(%arg1 = %72) -> (vector<1x1x8x1x2x1x1x4xi32>) {
// Barrier before refilling shared memory: previous iteration's reads must be done.
gpu.barrier
// Cooperative global->shared copy of the LHS K-slice: 512 elements / 256 threads,
// so each thread copies two 2x8 i8 chunks (the {unroll_loop} step-256 loop).
scf.for %arg2 = %c0 to %c512 step %c256 {
%81 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%arg2)[%67]
%82:3 = affine.delinearize_index %81 into (%c8, %c4, %c16) : index, index, index
%subview_4 = memref.subview %subview[0, %arg0, %82#0, %82#1, %82#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_5 = memref.subview %alloc[0, 0, %82#0, %82#1, %82#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
%83 = vector.transfer_read %subview_4[%c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x1x1x1x2x8xi8>
vector.transfer_write %83, %subview_5[%c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true]} : vector<1x1x1x1x1x2x8xi8>, memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
} {unroll_loop}
// Cooperative global->shared copy of the RHS K-slice, same 2-chunks-per-thread scheme.
scf.for %arg2 = %c0 to %c512 step %c256 {
%81 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%arg2)[%67]
%82:4 = affine.delinearize_index %81 into (%c4, %c2, %c4, %c16) : index, index, index, index
%subview_4 = memref.subview %subview_0[0, %arg0, %82#0, %82#1, %82#2, %82#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_5 = memref.subview %alloc_2[0, 0, %82#0, %82#1, %82#2, %82#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
%83 = vector.transfer_read %subview_4[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x1x1x1x1x2x8xi8>
vector.transfer_write %83, %subview_5[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x1x1x1x1x2x8xi8>, memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>>
} {unroll_loop}
// Barrier after the copies: shared tiles must be fully written before the MMA reads.
gpu.barrier
// Per-thread reads of the staged operands (vector<...2x8xi8> elements feed the MFMA).
%74 = vector.transfer_read %alloc[%c0, %c0, %c0, %71#0, %71#1, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x8x1x1x2x8xi8>
%75 = vector.transfer_read %alloc_2[%c0, %c0, %69#0, %c0, %69#1, %69#2, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x1x2x1x1x2x8xi8>
// Drop the two unit leading dims before the MMA, re-broadcast them after.
%76 = vector.extract %74[0, 0] : vector<8x1x1x2x8xi8> from vector<1x1x8x1x1x2x8xi8>
%77 = vector.extract %75[0, 0] : vector<1x2x1x1x2x8xi8> from vector<1x1x1x2x1x1x2x8xi8>
%78 = vector.extract %arg1[0, 0] : vector<8x1x2x1x1x4xi32> from vector<1x1x8x1x2x1x1x4xi32>
// Data-tiled MFMA_I32_16x16x32_I8 multiply-accumulate (unroll_m=8, unroll_n=2,
// unroll_n_to_subgroups=4, unroll_k=2). This is the op that takes the vector<8xi8>
// operand granules the gist title refers to.
%79 = iree_gpu.multi_mma %76, %77, %78 {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = [], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<8x1x1x2x8xi8>, vector<1x2x1x1x2x8xi8> into vector<8x1x2x1x1x4xi32>
%80 = vector.broadcast %79 : vector<8x1x2x1x1x4xi32> to vector<1x1x8x1x2x1x1x4xi32>
scf.yield %80 : vector<1x1x8x1x2x1x1x4xi32>
}
// Write the final accumulator back to this thread's slice of the output tile.
vector.transfer_write %73, %subview_3[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} {
%c32_i64 = arith.constant 32 : i64
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%c4 = arith.constant 4 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%c512 = arith.constant 512 : index
%c256 = arith.constant 256 : index
%c2 = arith.constant 2 : index
%c0_i8 = arith.constant 0 : i8
%c0_i32 = arith.constant 0 : i32
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32
%18 = arith.extui %0 : i32 to i64
%19 = arith.extui %1 : i32 to i64
%20 = arith.shli %19, %c32_i64 : i64
%21 = arith.ori %18, %20 : i64
%22 = arith.index_castui %21 : i64 to index
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %4 : i32 to i64
%29 = arith.extui %5 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
%33 = arith.extui %6 : i32 to i64
%34 = arith.extui %7 : i32 to i64
%35 = arith.shli %34, %c32_i64 : i64
%36 = arith.ori %33, %35 : i64
%37 = arith.index_castui %36 : i64 to index
%38 = arith.extui %8 : i32 to i64
%39 = arith.extui %9 : i32 to i64
%40 = arith.shli %39, %c32_i64 : i64
%41 = arith.ori %38, %40 : i64
%42 = arith.index_castui %41 : i64 to index
%43 = arith.extui %10 : i32 to i64
%44 = arith.extui %11 : i32 to i64
%45 = arith.shli %44, %c32_i64 : i64
%46 = arith.ori %43, %45 : i64
%47 = arith.index_castui %46 : i64 to index
%48 = arith.extui %12 : i32 to i64
%49 = arith.extui %13 : i32 to i64
%50 = arith.shli %49, %c32_i64 : i64
%51 = arith.ori %48, %50 : i64
%52 = arith.index_castui %51 : i64 to index
%53 = arith.extui %14 : i32 to i64
%54 = arith.extui %15 : i32 to i64
%55 = arith.shli %54, %c32_i64 : i64
%56 = arith.ori %53, %55 : i64
%57 = arith.index_castui %56 : i64 to index
%58 = arith.extui %16 : i32 to i64
%59 = arith.extui %17 : i32 to i64
%60 = arith.shli %59, %c32_i64 : i64
%61 = arith.ori %58, %60 : i64
%62 = arith.index_castui %61 : i64 to index
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%37, %42}
memref.assume_alignment %63, 1 : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%47, %52}
memref.assume_alignment %64, 1 : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%57, %62}
memref.assume_alignment %65, 1 : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%subview = memref.subview %63[%workgroup_id_y, 0, 0, 0, 0, 0, 0] [1, %42, 8, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %64[%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0] [1, %42, 4, 2, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %65[%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 2, 4, 16, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x4x2x4x
View raw

(Sorry about that, but we can’t show files that are this big right now.)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment