tools/iree-compile /tmp/x/module_foo_dispatch_6.mlir --iree-hal-target-backends=rocm --iree-hip-target=gfx942 --iree-opt-data-tiling --iree-global-opt-experimental-rocm-data-tiling --iree-global-opt-enable-early-materialization=true -o /tmp/a.vmfb --compile-from=executable-sources -mlir-disable-threading -mlir-print-ir-after-all 2>/tmp/log
Last active
October 7, 2024 15:16
-
-
Save bjacob/a493b64ba99f8820448cdfcd13f188f6 to your computer and use it in GitHub Desktop.
Bad codegen for `vector<8xi8>` operands to MFMA intrinsics
This file has been truncated, but you can view the full file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// -----// IR Dump After AssignLegacyTargetDevicesPass (iree-hal-assign-legacy-target-devices) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2) -> (d0, d2)> | |
#map1 = affine_map<(d0, d1, d2) -> (d1, d2)> | |
#map2 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", {legacy_sync}, [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {hal.device.targets = [#device_target_hip]} { | |
hal.executable public @foo_dispatch_6 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @foo_dispatch_6 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1, %arg2, %arg3, %arg4, %arg5, %arg6 | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @foo_dispatch_6() { | |
%c32_i64 = arith.constant 32 : i64 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(#pipeline_layout) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(#pipeline_layout) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(#pipeline_layout) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(#pipeline_layout) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(#pipeline_layout) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(#pipeline_layout) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(#pipeline_layout) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(#pipeline_layout) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(#pipeline_layout) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(#pipeline_layout) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(#pipeline_layout) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(#pipeline_layout) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(#pipeline_layout) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(#pipeline_layout) ordinal(17) : i32 | |
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
%63 = flow.dispatch.workload.ordinal %37, 0 : index | |
%64 = flow.dispatch.workload.ordinal %42, 1 : index | |
%65 = flow.dispatch.workload.ordinal %47, 2 : index | |
%66 = flow.dispatch.workload.ordinal %52, 3 : index | |
%67 = flow.dispatch.workload.ordinal %57, 4 : index | |
%68 = flow.dispatch.workload.ordinal %62, 5 : index | |
%69 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64} | |
%70 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66} | |
%71 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} | |
%72 = flow.dispatch.tensor.load %69, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%63, %64, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64} -> tensor<?x?x8x4x16x2x8xi8> | |
%73 = flow.dispatch.tensor.load %70, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%65, %66, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66} -> tensor<?x?x4x2x4x16x2x8xi8> | |
%74 = flow.dispatch.tensor.load %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%75 = iree_gpu.multi_mma %72, %73, %74 {indexing_maps = [#map, #map1, #map2], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
flow.dispatch.tensor.store %75, %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After MaterializeTargetDevicesPass (iree-hal-materialize-target-devices) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2) -> (d0, d2)> | |
#map1 = affine_map<(d0, d1, d2) -> (d1, d2)> | |
#map2 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", {legacy_sync}, [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
hal.executable public @foo_dispatch_6 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @foo_dispatch_6 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1, %arg2, %arg3, %arg4, %arg5, %arg6 | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @foo_dispatch_6() { | |
%c32_i64 = arith.constant 32 : i64 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(#pipeline_layout) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(#pipeline_layout) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(#pipeline_layout) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(#pipeline_layout) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(#pipeline_layout) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(#pipeline_layout) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(#pipeline_layout) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(#pipeline_layout) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(#pipeline_layout) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(#pipeline_layout) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(#pipeline_layout) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(#pipeline_layout) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(#pipeline_layout) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(#pipeline_layout) ordinal(17) : i32 | |
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
%63 = flow.dispatch.workload.ordinal %37, 0 : index | |
%64 = flow.dispatch.workload.ordinal %42, 1 : index | |
%65 = flow.dispatch.workload.ordinal %47, 2 : index | |
%66 = flow.dispatch.workload.ordinal %52, 3 : index | |
%67 = flow.dispatch.workload.ordinal %57, 4 : index | |
%68 = flow.dispatch.workload.ordinal %62, 5 : index | |
%69 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64} | |
%70 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66} | |
%71 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} | |
%72 = flow.dispatch.tensor.load %69, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%63, %64, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64} -> tensor<?x?x8x4x16x2x8xi8> | |
%73 = flow.dispatch.tensor.load %70, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%65, %66, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66} -> tensor<?x?x4x2x4x16x2x8xi8> | |
%74 = flow.dispatch.tensor.load %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%75 = iree_gpu.multi_mma %72, %73, %74 {indexing_maps = [#map, #map1, #map2], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
flow.dispatch.tensor.store %75, %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After ResolveDevicePromisesPass (iree-hal-resolve-device-promises) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2) -> (d0, d2)> | |
#map1 = affine_map<(d0, d1, d2) -> (d1, d2)> | |
#map2 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", {legacy_sync}, [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
hal.executable public @foo_dispatch_6 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @foo_dispatch_6 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1, %arg2, %arg3, %arg4, %arg5, %arg6 | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @foo_dispatch_6() { | |
%c32_i64 = arith.constant 32 : i64 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(#pipeline_layout) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(#pipeline_layout) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(#pipeline_layout) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(#pipeline_layout) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(#pipeline_layout) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(#pipeline_layout) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(#pipeline_layout) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(#pipeline_layout) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(#pipeline_layout) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(#pipeline_layout) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(#pipeline_layout) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(#pipeline_layout) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(#pipeline_layout) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(#pipeline_layout) ordinal(17) : i32 | |
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
%63 = flow.dispatch.workload.ordinal %37, 0 : index | |
%64 = flow.dispatch.workload.ordinal %42, 1 : index | |
%65 = flow.dispatch.workload.ordinal %47, 2 : index | |
%66 = flow.dispatch.workload.ordinal %52, 3 : index | |
%67 = flow.dispatch.workload.ordinal %57, 4 : index | |
%68 = flow.dispatch.workload.ordinal %62, 5 : index | |
%69 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64} | |
%70 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66} | |
%71 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} | |
%72 = flow.dispatch.tensor.load %69, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%63, %64, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64} -> tensor<?x?x8x4x16x2x8xi8> | |
%73 = flow.dispatch.tensor.load %70, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%65, %66, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66} -> tensor<?x?x4x2x4x16x2x8xi8> | |
%74 = flow.dispatch.tensor.load %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%75 = iree_gpu.multi_mma %72, %73, %74 {indexing_maps = [#map, #map1, #map2], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
flow.dispatch.tensor.store %75, %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After ResolveDeviceAliasesPass (iree-hal-resolve-device-aliases) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2) -> (d0, d2)> | |
#map1 = affine_map<(d0, d1, d2) -> (d1, d2)> | |
#map2 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", {legacy_sync}, [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
hal.executable public @foo_dispatch_6 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @foo_dispatch_6 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1, %arg2, %arg3, %arg4, %arg5, %arg6 | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @foo_dispatch_6() { | |
%c32_i64 = arith.constant 32 : i64 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(#pipeline_layout) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(#pipeline_layout) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(#pipeline_layout) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(#pipeline_layout) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(#pipeline_layout) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(#pipeline_layout) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(#pipeline_layout) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(#pipeline_layout) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(#pipeline_layout) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(#pipeline_layout) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(#pipeline_layout) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(#pipeline_layout) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(#pipeline_layout) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(#pipeline_layout) ordinal(17) : i32 | |
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
%63 = flow.dispatch.workload.ordinal %37, 0 : index | |
%64 = flow.dispatch.workload.ordinal %42, 1 : index | |
%65 = flow.dispatch.workload.ordinal %47, 2 : index | |
%66 = flow.dispatch.workload.ordinal %52, 3 : index | |
%67 = flow.dispatch.workload.ordinal %57, 4 : index | |
%68 = flow.dispatch.workload.ordinal %62, 5 : index | |
%69 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64} | |
%70 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66} | |
%71 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} | |
%72 = flow.dispatch.tensor.load %69, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%63, %64, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64} -> tensor<?x?x8x4x16x2x8xi8> | |
%73 = flow.dispatch.tensor.load %70, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%65, %66, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66} -> tensor<?x?x4x2x4x16x2x8xi8> | |
%74 = flow.dispatch.tensor.load %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%75 = iree_gpu.multi_mma %72, %73, %74 {indexing_maps = [#map, #map1, #map2], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
flow.dispatch.tensor.store %75, %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After VerifyDevicesPass (iree-hal-verify-devices) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2) -> (d0, d2)> | |
#map1 = affine_map<(d0, d1, d2) -> (d1, d2)> | |
#map2 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
#pipeline_layout = #hal.pipeline.layout<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect> | |
#device_target_hip = #hal.device.target<"hip", {legacy_sync}, [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
hal.executable public @foo_dispatch_6 { | |
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { | |
hal.executable.export public @foo_dispatch_6 ordinal(0) layout(#pipeline_layout) { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1, %arg2, %arg3, %arg4, %arg5, %arg6 | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @foo_dispatch_6() { | |
%c32_i64 = arith.constant 32 : i64 | |
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(#pipeline_layout) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(#pipeline_layout) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(#pipeline_layout) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(#pipeline_layout) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(#pipeline_layout) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(#pipeline_layout) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(#pipeline_layout) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(#pipeline_layout) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(#pipeline_layout) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(#pipeline_layout) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(#pipeline_layout) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(#pipeline_layout) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(#pipeline_layout) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(#pipeline_layout) ordinal(17) : i32 | |
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
%63 = flow.dispatch.workload.ordinal %37, 0 : index | |
%64 = flow.dispatch.workload.ordinal %42, 1 : index | |
%65 = flow.dispatch.workload.ordinal %47, 2 : index | |
%66 = flow.dispatch.workload.ordinal %52, 3 : index | |
%67 = flow.dispatch.workload.ordinal %57, 4 : index | |
%68 = flow.dispatch.workload.ordinal %62, 5 : index | |
%69 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64} | |
%70 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66} | |
%71 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} | |
%72 = flow.dispatch.tensor.load %69, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%63, %64, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64} -> tensor<?x?x8x4x16x2x8xi8> | |
%73 = flow.dispatch.tensor.load %70, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%65, %66, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66} -> tensor<?x?x4x2x4x16x2x8xi8> | |
%74 = flow.dispatch.tensor.load %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%75 = iree_gpu.multi_mma %72, %73, %74 {indexing_maps = [#map, #map1, #map2], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
flow.dispatch.tensor.store %75, %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} | |
return | |
} | |
} | |
} | |
} | |
} | |
// -----// IR Dump After GPUGeneralizeNamedOpsPass (iree-codegen-gpu-generalize-named-ops) //----- // | |
func.func @foo_dispatch_6() { | |
%c32_i64 = arith.constant 32 : i64 | |
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32 | |
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
%63 = flow.dispatch.workload.ordinal %37, 0 : index | |
%64 = flow.dispatch.workload.ordinal %42, 1 : index | |
%65 = flow.dispatch.workload.ordinal %47, 2 : index | |
%66 = flow.dispatch.workload.ordinal %52, 3 : index | |
%67 = flow.dispatch.workload.ordinal %57, 4 : index | |
%68 = flow.dispatch.workload.ordinal %62, 5 : index | |
%69 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64} | |
%70 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66} | |
%71 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} | |
%72 = flow.dispatch.tensor.load %69, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%63, %64, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64} -> tensor<?x?x8x4x16x2x8xi8> | |
%73 = flow.dispatch.tensor.load %70, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%65, %66, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66} -> tensor<?x?x4x2x4x16x2x8xi8> | |
%74 = flow.dispatch.tensor.load %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%75 = iree_gpu.multi_mma %72, %73, %74 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
flow.dispatch.tensor.store %75, %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} | |
return | |
} | |
// -----// IR Dump After TypePropagationPass (iree-codegen-type-propagation) //----- // | |
func.func @foo_dispatch_6() { | |
%c32_i64 = arith.constant 32 : i64 | |
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32 | |
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
%63 = flow.dispatch.workload.ordinal %37, 0 : index | |
%64 = flow.dispatch.workload.ordinal %42, 1 : index | |
%65 = flow.dispatch.workload.ordinal %47, 2 : index | |
%66 = flow.dispatch.workload.ordinal %52, 3 : index | |
%67 = flow.dispatch.workload.ordinal %57, 4 : index | |
%68 = flow.dispatch.workload.ordinal %62, 5 : index | |
%69 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64} | |
%70 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66} | |
%71 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} | |
%72 = flow.dispatch.tensor.load %69, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%63, %64, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64} -> tensor<?x?x8x4x16x2x8xi8> | |
%73 = flow.dispatch.tensor.load %70, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%65, %66, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66} -> tensor<?x?x4x2x4x16x2x8xi8> | |
%74 = flow.dispatch.tensor.load %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%75 = iree_gpu.multi_mma %72, %73, %74 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
flow.dispatch.tensor.store %75, %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} | |
return | |
} | |
// -----// IR Dump After BubbleUpOrdinalOpsPass (iree-codegen-bubble-up-ordinal-ops) //----- // | |
func.func @foo_dispatch_6() { | |
%c32_i64 = arith.constant 32 : i64 | |
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32 | |
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
%63 = flow.dispatch.workload.ordinal %37, 0 : index | |
%64 = flow.dispatch.workload.ordinal %42, 1 : index | |
%65 = flow.dispatch.workload.ordinal %47, 2 : index | |
%66 = flow.dispatch.workload.ordinal %52, 3 : index | |
%67 = flow.dispatch.workload.ordinal %57, 4 : index | |
%68 = flow.dispatch.workload.ordinal %62, 5 : index | |
%69 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64} | |
%70 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66} | |
%71 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} | |
%72 = flow.dispatch.tensor.load %69, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%63, %64, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64} -> tensor<?x?x8x4x16x2x8xi8> | |
%73 = flow.dispatch.tensor.load %70, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%65, %66, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66} -> tensor<?x?x4x2x4x16x2x8xi8> | |
%74 = flow.dispatch.tensor.load %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%75 = iree_gpu.multi_mma %72, %73, %74 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
flow.dispatch.tensor.store %75, %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} | |
return | |
} | |
// -----// IR Dump After BufferizeCopyOnlyDispatchesPass (iree-codegen-bufferize-copy-only-dispatches) //----- // | |
func.func @foo_dispatch_6() { | |
%c32_i64 = arith.constant 32 : i64 | |
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32 | |
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
%63 = flow.dispatch.workload.ordinal %37, 0 : index | |
%64 = flow.dispatch.workload.ordinal %42, 1 : index | |
%65 = flow.dispatch.workload.ordinal %47, 2 : index | |
%66 = flow.dispatch.workload.ordinal %52, 3 : index | |
%67 = flow.dispatch.workload.ordinal %57, 4 : index | |
%68 = flow.dispatch.workload.ordinal %62, 5 : index | |
%69 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64} | |
%70 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66} | |
%71 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} | |
%72 = flow.dispatch.tensor.load %69, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%63, %64, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64} -> tensor<?x?x8x4x16x2x8xi8> | |
%73 = flow.dispatch.tensor.load %70, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%65, %66, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66} -> tensor<?x?x4x2x4x16x2x8xi8> | |
%74 = flow.dispatch.tensor.load %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%75 = iree_gpu.multi_mma %72, %73, %74 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
flow.dispatch.tensor.store %75, %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} | |
return | |
} | |
// -----// IR Dump After DecomposeSoftmaxPass (iree-codegen-decompose-softmax) //----- // | |
func.func @foo_dispatch_6() { | |
%c32_i64 = arith.constant 32 : i64 | |
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32 | |
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
%63 = flow.dispatch.workload.ordinal %37, 0 : index | |
%64 = flow.dispatch.workload.ordinal %42, 1 : index | |
%65 = flow.dispatch.workload.ordinal %47, 2 : index | |
%66 = flow.dispatch.workload.ordinal %52, 3 : index | |
%67 = flow.dispatch.workload.ordinal %57, 4 : index | |
%68 = flow.dispatch.workload.ordinal %62, 5 : index | |
%69 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64} | |
%70 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66} | |
%71 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} | |
%72 = flow.dispatch.tensor.load %69, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%63, %64, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64} -> tensor<?x?x8x4x16x2x8xi8> | |
%73 = flow.dispatch.tensor.load %70, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%65, %66, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66} -> tensor<?x?x4x2x4x16x2x8xi8> | |
%74 = flow.dispatch.tensor.load %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%75 = iree_gpu.multi_mma %72, %73, %74 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
flow.dispatch.tensor.store %75, %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} | |
return | |
} | |
// -----// IR Dump After MaterializeEncodingIntoNopPass (iree-codegen-materialize-encoding-into-nop) //----- // | |
func.func @foo_dispatch_6() { | |
%c32_i64 = arith.constant 32 : i64 | |
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32 | |
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
%63 = flow.dispatch.workload.ordinal %37, 0 : index | |
%64 = flow.dispatch.workload.ordinal %42, 1 : index | |
%65 = flow.dispatch.workload.ordinal %47, 2 : index | |
%66 = flow.dispatch.workload.ordinal %52, 3 : index | |
%67 = flow.dispatch.workload.ordinal %57, 4 : index | |
%68 = flow.dispatch.workload.ordinal %62, 5 : index | |
%69 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64} | |
%70 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66} | |
%71 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} | |
%72 = flow.dispatch.tensor.load %69, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%63, %64, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64} -> tensor<?x?x8x4x16x2x8xi8> | |
%73 = flow.dispatch.tensor.load %70, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%65, %66, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66} -> tensor<?x?x4x2x4x16x2x8xi8> | |
%74 = flow.dispatch.tensor.load %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%75 = iree_gpu.multi_mma %72, %73, %74 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
flow.dispatch.tensor.store %75, %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} | |
return | |
} | |
// -----// IR Dump After BufferizeCopyOnlyDispatchesPass (iree-codegen-bufferize-copy-only-dispatches) //----- // | |
func.func @foo_dispatch_6() { | |
%c32_i64 = arith.constant 32 : i64 | |
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32 | |
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
%63 = flow.dispatch.workload.ordinal %37, 0 : index | |
%64 = flow.dispatch.workload.ordinal %42, 1 : index | |
%65 = flow.dispatch.workload.ordinal %47, 2 : index | |
%66 = flow.dispatch.workload.ordinal %52, 3 : index | |
%67 = flow.dispatch.workload.ordinal %57, 4 : index | |
%68 = flow.dispatch.workload.ordinal %62, 5 : index | |
%69 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64} | |
%70 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66} | |
%71 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} | |
%72 = flow.dispatch.tensor.load %69, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%63, %64, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64} -> tensor<?x?x8x4x16x2x8xi8> | |
%73 = flow.dispatch.tensor.load %70, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%65, %66, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66} -> tensor<?x?x4x2x4x16x2x8xi8> | |
%74 = flow.dispatch.tensor.load %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%75 = iree_gpu.multi_mma %72, %73, %74 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
flow.dispatch.tensor.store %75, %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} | |
return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
// Dispatch function @foo_dispatch_6 as printed after canonicalization.
// It loads 18 i32 interface constants, combines them pairwise into nine index
// values (three binding offsets and six dynamic tensor dimensions), loads two
// i8 operand tensors and one i32 accumulator tensor, performs a single
// iree_gpu.multi_mma, and stores the result back to the accumulator binding.
func.func @foo_dispatch_6() { | |
// Shift amount used to place the high i32 half of each 64-bit value.
%c32_i64 = arith.constant 32 : i64 | |
// Interface constants, ordinals 0 through 17, each loaded as an i32.
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32 | |
// Each group of five ops below computes index(lo | (hi << 32)) from one
// (even, odd) pair of constants: zero-extend both to i64, shift the odd one
// left by 32, OR them together, and cast the i64 to index.
// Constants 0/1 -> %22, used as the offset of the first binding(0) subspan.
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
// Constants 2/3 -> %27, used as the offset of the second binding(0) subspan.
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
// Constants 4/5 -> %32, used as the offset of the binding(1) subspan.
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
// Constants 6/7 -> %37, which becomes workload ordinal 0.
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
// Constants 8/9 -> %42, which becomes workload ordinal 1.
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
// Constants 10/11 -> %47, which becomes workload ordinal 2.
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
// Constants 12/13 -> %52, which becomes workload ordinal 3.
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
// Constants 14/15 -> %57, which becomes workload ordinal 4.
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
// Constants 16/17 -> %62, which becomes workload ordinal 5.
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
// Tag the six reassembled values as workload ordinals 0..5. They supply the
// two dynamic leading dimensions of each of the three tensors below.
%63 = flow.dispatch.workload.ordinal %37, 0 : index | |
%64 = flow.dispatch.workload.ordinal %42, 1 : index | |
%65 = flow.dispatch.workload.ordinal %47, 2 : index | |
%66 = flow.dispatch.workload.ordinal %52, 3 : index | |
%67 = flow.dispatch.workload.ordinal %57, 4 : index | |
%68 = flow.dispatch.workload.ordinal %62, 5 : index | |
// Buffer views. %69 and %70 are both read-only views of binding(0) and differ
// in offset (%22 vs %27) and tensor type; %71 is a read-write view of
// binding(1) at offset %32.
%69 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64} | |
%70 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66} | |
%71 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} | |
// Load each view in full (all offsets 0, all strides 1): two i8 operands and
// the i32 accumulator.
%72 = flow.dispatch.tensor.load %69, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%63, %64, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64} -> tensor<?x?x8x4x16x2x8xi8> | |
%73 = flow.dispatch.tensor.load %70, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%65, %66, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66} -> tensor<?x?x4x2x4x16x2x8xi8> | |
%74 = flow.dispatch.tensor.load %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} -> tensor<?x?x8x4x2x4x16x4xi32> | |
// The multiply-accumulate itself: operands indexed (d0, d2) and (d1, d2),
// accumulator indexed (d0, d1), with d2 the only reduction iterator. The kind
// attribute selects the MFMA_I32_16x16x32_I8 intrinsic with unroll_m = 8,
// unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2.
%75 = iree_gpu.multi_mma %72, %73, %74 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
// Write the result over the same binding(1) view the accumulator was read from.
flow.dispatch.tensor.store %75, %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} | |
return | |
} | |
// -----// IR Dump After MaterializeUserConfigsPass (iree-codegen-materialize-user-configs) //----- // | |
// Module snapshot containing the single dispatch function @foo_dispatch_6.
// In this snapshot the function has no attributes and the multi_mma op carries
// only indexing_maps, iterator_types and kind.
module { | |
// Reads 18 i32 interface constants, rebuilds nine index values from them,
// loads three tensors, runs one iree_gpu.multi_mma and stores the result.
func.func @foo_dispatch_6() { | |
// Constant 32: the shift that positions the upper half of each 64-bit value.
%c32_i64 = arith.constant 32 : i64 | |
// Raw i32 interface constants (ordinals 0..17).
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32 | |
// Nine identical five-op groups follow. Group k takes constants 2k (low half)
// and 2k+1 (high half), zero-extends both to i64, computes
// low | (high << 32), and casts the result to index.
// Group 0: %22 (offset for subspan %69).
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
// Group 1: %27 (offset for subspan %70).
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
// Group 2: %32 (offset for subspan %71).
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
// Group 3: %37 (source of %63).
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
// Group 4: %42 (source of %64).
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
// Group 5: %47 (source of %65).
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
// Group 6: %52 (source of %66).
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
// Group 7: %57 (source of %67).
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
// Group 8: %62 (source of %68).
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
// Workload ordinals 0..5: (%63, %64) size the first operand, (%65, %66) the
// second operand, and (%67, %68) the accumulator.
%63 = flow.dispatch.workload.ordinal %37, 0 : index | |
%64 = flow.dispatch.workload.ordinal %42, 1 : index | |
%65 = flow.dispatch.workload.ordinal %47, 2 : index | |
%66 = flow.dispatch.workload.ordinal %52, 3 : index | |
%67 = flow.dispatch.workload.ordinal %57, 4 : index | |
%68 = flow.dispatch.workload.ordinal %62, 5 : index | |
// Subspans: two read-only views into binding(0) at different offsets, and one
// read-write view into binding(1).
%69 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64} | |
%70 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66} | |
%71 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} | |
// Whole-tensor loads (zero offsets, unit strides) of both i8 operands and the
// i32 accumulator.
%72 = flow.dispatch.tensor.load %69, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%63, %64, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64} -> tensor<?x?x8x4x16x2x8xi8> | |
%73 = flow.dispatch.tensor.load %70, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%65, %66, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66} -> tensor<?x?x4x2x4x16x2x8xi8> | |
%74 = flow.dispatch.tensor.load %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} -> tensor<?x?x8x4x2x4x16x4xi32> | |
// Single multi_mma over iterators (parallel, parallel, reduction) using the
// data-tiled MFMA_I32_16x16x32_I8 layout; i8 operands accumulate into i32.
%75 = iree_gpu.multi_mma %72, %73, %74 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
// Store the result to %71, the view the accumulator %74 was loaded from.
flow.dispatch.tensor.store %75, %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} | |
return | |
} | |
} | |
// -----// IR Dump After LLVMGPUSelectLoweringStrategyPass (iree-llvmgpu-select-lowering-strategy) //----- // | |
// Module snapshot of @foo_dispatch_6 with lowering strategy attributes attached.
// The function carries a translation_info attribute (LLVMGPUTileAndFuse,
// workgroup_size = [256, 1, 1], subgroup_size = 64) and the multi_mma op
// carries a lowering_config attribute.
module { | |
// translation_info also sets gpu_pipeline_options: prefetch_shared_memory =
// false and no_reduce_shared_memory_bank_conflicts = true.
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} { | |
// Shift count for combining two i32 halves into one i64.
%c32_i64 = arith.constant 32 : i64 | |
// The 18 i32 interface constants, in ordinal order.
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32 | |
// Pairwise reassembly: every five ops turn constants (2k, 2k+1) into one
// index value equal to zext(c[2k]) | (zext(c[2k+1]) << 32).
// Pair (0, 1) -> %22: offset of the first operand's subspan.
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
// Pair (2, 3) -> %27: offset of the second operand's subspan.
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
// Pair (4, 5) -> %32: offset of the accumulator's subspan.
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
// Pair (6, 7) -> %37: first dynamic dimension of the first operand.
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
// Pair (8, 9) -> %42: second dynamic dimension of the first operand.
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
// Pair (10, 11) -> %47: first dynamic dimension of the second operand.
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
// Pair (12, 13) -> %52: second dynamic dimension of the second operand.
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
// Pair (14, 15) -> %57: first dynamic dimension of the accumulator.
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
// Pair (16, 17) -> %62: second dynamic dimension of the accumulator.
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
// Mark the six dimension values as workload ordinals 0..5.
%63 = flow.dispatch.workload.ordinal %37, 0 : index | |
%64 = flow.dispatch.workload.ordinal %42, 1 : index | |
%65 = flow.dispatch.workload.ordinal %47, 2 : index | |
%66 = flow.dispatch.workload.ordinal %52, 3 : index | |
%67 = flow.dispatch.workload.ordinal %57, 4 : index | |
%68 = flow.dispatch.workload.ordinal %62, 5 : index | |
// Views over the two bindings. Note that both i8 operands come from
// binding(0) (offsets %22 and %27); the i32 accumulator is binding(1).
%69 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64} | |
%70 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66} | |
%71 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} | |
// Materialize the full contents of each view as a tensor value.
%72 = flow.dispatch.tensor.load %69, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%63, %64, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64} -> tensor<?x?x8x4x16x2x8xi8> | |
%73 = flow.dispatch.tensor.load %70, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%65, %66, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66} -> tensor<?x?x4x2x4x16x2x8xi8> | |
%74 = flow.dispatch.tensor.load %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} -> tensor<?x?x8x4x2x4x16x4xi32> | |
// multi_mma with the data-tiled MFMA_I32_16x16x32_I8 layout. Its
// lowering_config lists promote_operands = [0, 1], reduction = [0, 0, 1] and
// workgroup = [1, 1, 0], one entry per iterator (d0, d1, d2) for the latter two.
%75 = iree_gpu.multi_mma %72, %73, %74 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 1], workgroup = [1, 1, 0]}>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
// Result goes back into the read-write binding(1) view.
flow.dispatch.tensor.store %75, %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} | |
return | |
} | |
} | |
// -----// IR Dump After ConfigureTargetExecutableVariantsPass (iree-hal-configure-target-executable-variants) //----- // | |
hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>, ukernels = "none"}>) { | |
hal.executable.export public @foo_dispatch_6 ordinal(0) layout(#hal.pipeline.layout<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1, %arg2, %arg3, %arg4, %arg5, %arg6 | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} { | |
%c32_i64 = arith.constant 32 : i64 | |
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32 | |
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
%63 = flow.dispatch.workload.ordinal %37, 0 : index | |
%64 = flow.dispatch.workload.ordinal %42, 1 : index | |
%65 = flow.dispatch.workload.ordinal %47, 2 : index | |
%66 = flow.dispatch.workload.ordinal %52, 3 : index | |
%67 = flow.dispatch.workload.ordinal %57, 4 : index | |
%68 = flow.dispatch.workload.ordinal %62, 5 : index | |
%69 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64} | |
%70 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66} | |
%71 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} | |
%72 = flow.dispatch.tensor.load %69, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%63, %64, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64} -> tensor<?x?x8x4x16x2x8xi8> | |
%73 = flow.dispatch.tensor.load %70, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%65, %66, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66} -> tensor<?x?x4x2x4x16x2x8xi8> | |
%74 = flow.dispatch.tensor.load %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%75 = iree_gpu.multi_mma %72, %73, %74 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 1], workgroup = [1, 1, 0]}>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
flow.dispatch.tensor.store %75, %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} | |
return | |
} | |
} | |
} | |
// -----// IR Dump After ConfigureExecutablesPass (iree-hal-configure-executables) //----- // | |
hal.executable public @foo_dispatch_6 { | |
hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>, ukernels = "none"}>) { | |
hal.executable.export public @foo_dispatch_6 ordinal(0) layout(#hal.pipeline.layout<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) { | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index): | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1, %arg2, %arg3, %arg4, %arg5, %arg6 | |
hal.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} { | |
%c32_i64 = arith.constant 32 : i64 | |
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32 | |
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
%63 = flow.dispatch.workload.ordinal %37, 0 : index | |
%64 = flow.dispatch.workload.ordinal %42, 1 : index | |
%65 = flow.dispatch.workload.ordinal %47, 2 : index | |
%66 = flow.dispatch.workload.ordinal %52, 3 : index | |
%67 = flow.dispatch.workload.ordinal %57, 4 : index | |
%68 = flow.dispatch.workload.ordinal %62, 5 : index | |
%69 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64} | |
%70 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66} | |
%71 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} | |
%72 = flow.dispatch.tensor.load %69, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%63, %64, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64} -> tensor<?x?x8x4x16x2x8xi8> | |
%73 = flow.dispatch.tensor.load %70, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%65, %66, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66} -> tensor<?x?x4x2x4x16x2x8xi8> | |
%74 = flow.dispatch.tensor.load %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%75 = iree_gpu.multi_mma %72, %73, %74 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 1], workgroup = [1, 1, 0]}>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
flow.dispatch.tensor.store %75, %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} | |
return | |
} | |
} | |
} | |
} | |
// -----// IR Dump After LowerExecutableUsingTransformDialectPass (iree-codegen-lower-executable-using-transform-dialect) //----- // | |
module { | |
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} { | |
%c32_i64 = arith.constant 32 : i64 | |
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32 | |
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
%63 = flow.dispatch.workload.ordinal %37, 0 : index | |
%64 = flow.dispatch.workload.ordinal %42, 1 : index | |
%65 = flow.dispatch.workload.ordinal %47, 2 : index | |
%66 = flow.dispatch.workload.ordinal %52, 3 : index | |
%67 = flow.dispatch.workload.ordinal %57, 4 : index | |
%68 = flow.dispatch.workload.ordinal %62, 5 : index | |
%69 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64} | |
%70 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66} | |
%71 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} | |
%72 = flow.dispatch.tensor.load %69, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%63, %64, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%63, %64} -> tensor<?x?x8x4x16x2x8xi8> | |
%73 = flow.dispatch.tensor.load %70, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%65, %66, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%65, %66} -> tensor<?x?x4x2x4x16x2x8xi8> | |
%74 = flow.dispatch.tensor.load %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%75 = iree_gpu.multi_mma %72, %73, %74 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 1], workgroup = [1, 1, 0]}>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
flow.dispatch.tensor.store %75, %71, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%67, %68, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%67, %68} | |
return | |
} | |
} | |
// -----// IR Dump After TileAndDistributeToWorkgroupsPass (iree-codegen-tile-and-distribute-to-workgroups) //----- // | |
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} { | |
%c32_i64 = arith.constant 32 : i64 | |
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32 | |
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} | |
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} | |
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8> | |
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8> | |
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32> | |
%69 = iree_gpu.multi_mma %66, %67, %68 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 1], workgroup = [1, 1, 0]}>} : tensor<1x?x8x4x16x2x8xi8>, tensor<1x?x4x2x4x16x2x8xi8> into tensor<1x1x8x4x2x4x16x4xi32> | |
%70 = arith.extui %14 : i32 to i64 | |
%71 = arith.extui %15 : i32 to i64 | |
%72 = arith.shli %71, %c32_i64 : i64 | |
%73 = arith.ori %70, %72 : i64 | |
%74 = arith.index_castui %73 : i64 to index | |
%75 = arith.extui %16 : i32 to i64 | |
%76 = arith.extui %17 : i32 to i64 | |
%77 = arith.shli %76, %c32_i64 : i64 | |
%78 = arith.ori %75, %77 : i64 | |
%79 = arith.index_castui %78 : i64 to index | |
flow.dispatch.tensor.store %69, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%74, %79} | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} { | |
%c32_i64 = arith.constant 32 : i64 | |
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32 | |
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} | |
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} | |
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8> | |
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8> | |
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32> | |
%69 = iree_gpu.multi_mma %66, %67, %68 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 1], workgroup = [1, 1, 0]}>} : tensor<1x?x8x4x16x2x8xi8>, tensor<1x?x4x2x4x16x2x8xi8> into tensor<1x1x8x4x2x4x16x4xi32> | |
flow.dispatch.tensor.store %69, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} { | |
%c32_i64 = arith.constant 32 : i64 | |
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32 | |
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} | |
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} | |
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8> | |
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8> | |
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32> | |
%69 = iree_gpu.multi_mma %66, %67, %68 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 1], workgroup = [1, 1, 0]}>} : tensor<1x?x8x4x16x2x8xi8>, tensor<1x?x4x2x4x16x2x8xi8> into tensor<1x1x8x4x2x4x16x4xi32> | |
flow.dispatch.tensor.store %69, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
// Dispatch function @foo_dispatch_6 (translation_info: LLVMGPUTileAndFuse,
// workgroup_size = [256, 1, 1], subgroup_size = 64).
// Overall shape of the body:
//   1. load 18 i32 interface constants (ordinals 0..17);
//   2. fuse each consecutive (even, odd) pair into one index value;
//   3. create three binding subspans using those index values as offsets and
//      dynamic dimensions;
//   4. load one slice per operand selected by the workgroup ids, run a single
//      iree_gpu.multi_mma, and store the result back to the third subspan.
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} { | |
// Shift amount used to place the odd-ordinal constant in the upper 32 bits.
%c32_i64 = arith.constant 32 : i64 | |
// Raw i32 interface constants, ordinals 0 through 17.
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32 | |
// Nine repetitions of the same five-op pattern: zero-extend both i32 halves
// to i64, shift the odd-ordinal half left by 32, OR the halves together, and
// cast the i64 to index. Results: %22, %27, %32 (used below as subspan
// offsets) and %37, %42, %47, %52, %57, %62 (used below as dynamic dims).
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
// %63 and %64 are two read-only views of binding(0) at offsets %22 and %27;
// %65 is a read-write view of binding(1) at offset %32.
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} | |
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} | |
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
// Per-workgroup slices: %66 is indexed by workgroup_id_y, %67 by
// workgroup_id_x, and %68 by (workgroup_id_y, workgroup_id_x).
// NOTE(review): the %67 load passes %42 as the size of dimension 1 although
// %64 was declared with dynamic dims {%47, %52}; presumably %42 and %52 carry
// the same runtime value (the extent shared with %66) -- confirm.
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8> | |
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8> | |
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32> | |
// Data-tiled multi_mma on the two i8 operands with %68 as the i32 accumulator
// (intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2,
// unroll_n_to_subgroups = 4, unroll_k = 2). Iterators are
// (parallel, parallel, reduction); the maps read (d0, d2) and (d1, d2) and
// write (d0, d1).
%69 = iree_gpu.multi_mma %66, %67, %68 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 1], workgroup = [1, 1, 0]}>} : tensor<1x?x8x4x16x2x8xi8>, tensor<1x?x4x2x4x16x2x8xi8> into tensor<1x1x8x4x2x4x16x4xi32> | |
// Write the result back to the same (workgroup_id_y, workgroup_id_x) slice of
// %65 that %68 was loaded from.
flow.dispatch.tensor.store %69, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
return | |
} | |
// -----// IR Dump After GPUPromoteMatmulOperandsPass (iree-codegen-gpu-promote-matmul-operands) //----- // | |
// Dispatch function @foo_dispatch_6 (translation_info: LLVMGPUTileAndFuse,
// workgroup_size = [256, 1, 1], subgroup_size = 64).
// In this dump both i8 operands of the multi_mma are first copied into fresh
// tensors (tensor.empty + linalg.copy tagged #iree_gpu.derived_thread_config),
// and the multi_mma consumes the copies. The copied operands are operands 0
// and 1, matching promote_operands = [0, 1] in the multi_mma lowering_config.
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} { | |
// Shift amount used to place the odd-ordinal constant in the upper 32 bits.
%c32_i64 = arith.constant 32 : i64 | |
// Raw i32 interface constants, ordinals 0 through 17.
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32 | |
// Nine repetitions of the same five-op pattern: zero-extend both i32 halves
// to i64, shift the odd-ordinal half left by 32, OR the halves together, and
// cast the i64 to index. Results: %22, %27, %32 (used below as subspan
// offsets) and %37, %42, %47, %52, %57, %62 (used below as dynamic dims).
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
// %63 and %64 are two read-only views of binding(0) at offsets %22 and %27;
// %65 is a read-write view of binding(1) at offset %32.
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} | |
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} | |
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
// Per-workgroup slices: %66 is indexed by workgroup_id_y, %67 by
// workgroup_id_x, and %68 by (workgroup_id_y, workgroup_id_x).
// NOTE(review): the %67 load passes %42 as the size of dimension 1 although
// %64 was declared with dynamic dims {%47, %52}; presumably %42 and %52 carry
// the same runtime value (the extent shared with %66) -- confirm.
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8> | |
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8> | |
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32> | |
// Copy of operand 0: query the dynamic dim 1 of %66, create an empty tensor
// of the same type, and linalg.copy %66 into it.
%c1 = arith.constant 1 : index | |
%dim = tensor.dim %66, %c1 : tensor<1x?x8x4x16x2x8xi8> | |
%69 = tensor.empty(%dim) : tensor<1x?x8x4x16x2x8xi8> | |
%70 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%66 : tensor<1x?x8x4x16x2x8xi8>) outs(%69 : tensor<1x?x8x4x16x2x8xi8>) -> tensor<1x?x8x4x16x2x8xi8> | |
// Copy of operand 1, same pattern; %c1_0 is a second constant 1 with the same
// value as %c1.
%c1_0 = arith.constant 1 : index | |
%dim_1 = tensor.dim %67, %c1_0 : tensor<1x?x4x2x4x16x2x8xi8> | |
%71 = tensor.empty(%dim_1) : tensor<1x?x4x2x4x16x2x8xi8> | |
%72 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%67 : tensor<1x?x4x2x4x16x2x8xi8>) outs(%71 : tensor<1x?x4x2x4x16x2x8xi8>) -> tensor<1x?x4x2x4x16x2x8xi8> | |
// Data-tiled multi_mma on the copied operands %70 and %72 with %68 as the i32
// accumulator (intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2,
// unroll_n_to_subgroups = 4, unroll_k = 2).
%73 = iree_gpu.multi_mma %70, %72, %68 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 1], workgroup = [1, 1, 0]}>} : tensor<1x?x8x4x16x2x8xi8>, tensor<1x?x4x2x4x16x2x8xi8> into tensor<1x1x8x4x2x4x16x4xi32> | |
// Write the result back to the same (workgroup_id_y, workgroup_id_x) slice of
// %65 that %68 was loaded from.
flow.dispatch.tensor.store %73, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
return | |
} | |
// -----// IR Dump After PackToIntrinsicsPass (iree-gpu-pack-to-intrinsics) //----- // | |
// Dispatch function @foo_dispatch_6 (translation_info: LLVMGPUTileAndFuse,
// workgroup_size = [256, 1, 1], subgroup_size = 64).
// The body loads 18 i32 interface constants, fuses them pairwise into index
// values, builds three binding subspans, copies both i8 operands into fresh
// tensors, runs one iree_gpu.multi_mma, and stores the result. In this dump a
// single %c1 constant sits at the top of the function and feeds both
// tensor.dim queries.
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} { | |
// %c1 is the dimension index used by both tensor.dim ops below; %c32_i64 is
// the shift amount that places the odd-ordinal constant in the upper 32 bits.
%c1 = arith.constant 1 : index | |
%c32_i64 = arith.constant 32 : i64 | |
// Raw i32 interface constants, ordinals 0 through 17.
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32 | |
// Nine repetitions of the same five-op pattern: zero-extend both i32 halves
// to i64, shift the odd-ordinal half left by 32, OR the halves together, and
// cast the i64 to index. Results: %22, %27, %32 (used below as subspan
// offsets) and %37, %42, %47, %52, %57, %62 (used below as dynamic dims).
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
// %63 and %64 are two read-only views of binding(0) at offsets %22 and %27;
// %65 is a read-write view of binding(1) at offset %32.
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} | |
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} | |
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
// Per-workgroup slices: %66 is indexed by workgroup_id_y, %67 by
// workgroup_id_x, and %68 by (workgroup_id_y, workgroup_id_x).
// NOTE(review): the %67 load passes %42 as the size of dimension 1 although
// %64 was declared with dynamic dims {%47, %52}; presumably %42 and %52 carry
// the same runtime value (the extent shared with %66) -- confirm.
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8> | |
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8> | |
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32> | |
// Copy of operand 0: query the dynamic dim 1 of %66, create an empty tensor
// of the same type, and linalg.copy %66 into it.
%dim = tensor.dim %66, %c1 : tensor<1x?x8x4x16x2x8xi8> | |
%69 = tensor.empty(%dim) : tensor<1x?x8x4x16x2x8xi8> | |
%70 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%66 : tensor<1x?x8x4x16x2x8xi8>) outs(%69 : tensor<1x?x8x4x16x2x8xi8>) -> tensor<1x?x8x4x16x2x8xi8> | |
// Copy of operand 1, same pattern, reusing %c1 as the dimension index.
%dim_0 = tensor.dim %67, %c1 : tensor<1x?x4x2x4x16x2x8xi8> | |
%71 = tensor.empty(%dim_0) : tensor<1x?x4x2x4x16x2x8xi8> | |
%72 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%67 : tensor<1x?x4x2x4x16x2x8xi8>) outs(%71 : tensor<1x?x4x2x4x16x2x8xi8>) -> tensor<1x?x4x2x4x16x2x8xi8> | |
// Data-tiled multi_mma on the copied operands %70 and %72 with %68 as the i32
// accumulator (intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2,
// unroll_n_to_subgroups = 4, unroll_k = 2).
%73 = iree_gpu.multi_mma %70, %72, %68 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 1], workgroup = [1, 1, 0]}>} : tensor<1x?x8x4x16x2x8xi8>, tensor<1x?x4x2x4x16x2x8xi8> into tensor<1x1x8x4x2x4x16x4xi32> | |
// Write the result back to the same (workgroup_id_y, workgroup_id_x) slice of
// %65 that %68 was loaded from.
flow.dispatch.tensor.store %73, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
return | |
} | |
// -----// IR Dump After ConcretizeMmaShapesPass (iree-gpu-concretize-mma-shapes) //----- // | |
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} { | |
%c1 = arith.constant 1 : index | |
%c32_i64 = arith.constant 32 : i64 | |
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32 | |
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} | |
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} | |
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8> | |
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8> | |
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32> | |
%dim = tensor.dim %66, %c1 : tensor<1x?x8x4x16x2x8xi8> | |
%69 = tensor.empty(%dim) : tensor<1x?x8x4x16x2x8xi8> | |
%70 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%66 : tensor<1x?x8x4x16x2x8xi8>) outs(%69 : tensor<1x?x8x4x16x2x8xi8>) -> tensor<1x?x8x4x16x2x8xi8> | |
%dim_0 = tensor.dim %67, %c1 : tensor<1x?x4x2x4x16x2x8xi8> | |
%71 = tensor.empty(%dim_0) : tensor<1x?x4x2x4x16x2x8xi8> | |
%72 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%67 : tensor<1x?x4x2x4x16x2x8xi8>) outs(%71 : tensor<1x?x4x2x4x16x2x8xi8>) -> tensor<1x?x4x2x4x16x2x8xi8> | |
%73 = iree_gpu.multi_mma %70, %72, %68 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 1], workgroup = [1, 1, 0]}>} : tensor<1x?x8x4x16x2x8xi8>, tensor<1x?x4x2x4x16x2x8xi8> into tensor<1x1x8x4x2x4x16x4xi32> | |
flow.dispatch.tensor.store %73, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
return | |
} | |
// -----// IR Dump After GPUApplyTilingLevelPass (iree-codegen-gpu-apply-tiling-level) //----- // | |
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} { | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%c32_i64 = arith.constant 32 : i64 | |
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32 | |
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} | |
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} | |
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8> | |
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8> | |
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32> | |
%dim = tensor.dim %66, %c1 : tensor<1x?x8x4x16x2x8xi8> | |
%69 = tensor.empty(%dim) : tensor<1x?x8x4x16x2x8xi8> | |
%70 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%66 : tensor<1x?x8x4x16x2x8xi8>) outs(%69 : tensor<1x?x8x4x16x2x8xi8>) -> tensor<1x?x8x4x16x2x8xi8> | |
%dim_0 = tensor.dim %70, %c1 : tensor<1x?x8x4x16x2x8xi8> | |
%71 = scf.for %arg0 = %c0 to %dim_0 step %c1 iter_args(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) { | |
%extracted_slice = tensor.extract_slice %66[0, %arg0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x8x4x16x2x8xi8> | |
%72 = tensor.empty() : tensor<1x1x8x4x16x2x8xi8> | |
%73 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x8x4x16x2x8xi8>) outs(%72 : tensor<1x1x8x4x16x2x8xi8>) -> tensor<1x1x8x4x16x2x8xi8> | |
%extracted_slice_1 = tensor.extract_slice %67[0, %arg0, 0, 0, 0, 0, 0, 0] [1, 1, 4, 2, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x4x2x4x16x2x8xi8> | |
%74 = tensor.empty() : tensor<1x1x4x2x4x16x2x8xi8> | |
%75 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_1 : tensor<1x1x4x2x4x16x2x8xi8>) outs(%74 : tensor<1x1x4x2x4x16x2x8xi8>) -> tensor<1x1x4x2x4x16x2x8xi8> | |
%76 = iree_gpu.multi_mma %73, %75, %arg1 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 1], workgroup = [1, 1, 0]}>} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8> into tensor<1x1x8x4x2x4x16x4xi32> | |
scf.yield %76 : tensor<1x1x8x4x2x4x16x4xi32> | |
} | |
flow.dispatch.tensor.store %71, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} { | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%c32_i64 = arith.constant 32 : i64 | |
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32 | |
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} | |
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} | |
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8> | |
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8> | |
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32> | |
%69 = scf.for %arg0 = %c0 to %42 step %c1 iter_args(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) { | |
%extracted_slice = tensor.extract_slice %66[0, %arg0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x8x4x16x2x8xi8> | |
%70 = tensor.empty() : tensor<1x1x8x4x16x2x8xi8> | |
%71 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x8x4x16x2x8xi8>) outs(%70 : tensor<1x1x8x4x16x2x8xi8>) -> tensor<1x1x8x4x16x2x8xi8> | |
%extracted_slice_0 = tensor.extract_slice %67[0, %arg0, 0, 0, 0, 0, 0, 0] [1, 1, 4, 2, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x4x2x4x16x2x8xi8> | |
%72 = tensor.empty() : tensor<1x1x4x2x4x16x2x8xi8> | |
%73 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_0 : tensor<1x1x4x2x4x16x2x8xi8>) outs(%72 : tensor<1x1x4x2x4x16x2x8xi8>) -> tensor<1x1x4x2x4x16x2x8xi8> | |
%74 = iree_gpu.multi_mma %71, %73, %arg1 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 1], workgroup = [1, 1, 0]}>} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8> into tensor<1x1x8x4x2x4x16x4xi32> | |
scf.yield %74 : tensor<1x1x8x4x2x4x16x4xi32> | |
} | |
flow.dispatch.tensor.store %69, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} { | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%c32_i64 = arith.constant 32 : i64 | |
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32 | |
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} | |
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} | |
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8> | |
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8> | |
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32> | |
%69 = scf.for %arg0 = %c0 to %42 step %c1 iter_args(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) { | |
%extracted_slice = tensor.extract_slice %66[0, %arg0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x8x4x16x2x8xi8> | |
%70 = tensor.empty() : tensor<1x1x8x4x16x2x8xi8> | |
%71 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x8x4x16x2x8xi8>) outs(%70 : tensor<1x1x8x4x16x2x8xi8>) -> tensor<1x1x8x4x16x2x8xi8> | |
%extracted_slice_0 = tensor.extract_slice %67[0, %arg0, 0, 0, 0, 0, 0, 0] [1, 1, 4, 2, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x4x2x4x16x2x8xi8> | |
%72 = tensor.empty() : tensor<1x1x4x2x4x16x2x8xi8> | |
%73 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_0 : tensor<1x1x4x2x4x16x2x8xi8>) outs(%72 : tensor<1x1x4x2x4x16x2x8xi8>) -> tensor<1x1x4x2x4x16x2x8xi8> | |
%74 = iree_gpu.multi_mma %71, %73, %arg1 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 1], workgroup = [1, 1, 0]}>} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8> into tensor<1x1x8x4x2x4x16x4xi32> | |
scf.yield %74 : tensor<1x1x8x4x2x4x16x4xi32> | |
} | |
flow.dispatch.tensor.store %69, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
return | |
} | |
// -----// IR Dump After DecomposePackUnPackOpsPass (iree-codegen-decompose-pack-unpack-ops) //----- // | |
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} { | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%c32_i64 = arith.constant 32 : i64 | |
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32 | |
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} | |
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} | |
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8> | |
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8> | |
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32> | |
%69 = scf.for %arg0 = %c0 to %42 step %c1 iter_args(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) { | |
%extracted_slice = tensor.extract_slice %66[0, %arg0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x8x4x16x2x8xi8> | |
%70 = tensor.empty() : tensor<1x1x8x4x16x2x8xi8> | |
%71 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x8x4x16x2x8xi8>) outs(%70 : tensor<1x1x8x4x16x2x8xi8>) -> tensor<1x1x8x4x16x2x8xi8> | |
%extracted_slice_0 = tensor.extract_slice %67[0, %arg0, 0, 0, 0, 0, 0, 0] [1, 1, 4, 2, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x4x2x4x16x2x8xi8> | |
%72 = tensor.empty() : tensor<1x1x4x2x4x16x2x8xi8> | |
%73 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_0 : tensor<1x1x4x2x4x16x2x8xi8>) outs(%72 : tensor<1x1x4x2x4x16x2x8xi8>) -> tensor<1x1x4x2x4x16x2x8xi8> | |
%74 = iree_gpu.multi_mma %71, %73, %arg1 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 1], workgroup = [1, 1, 0]}>} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8> into tensor<1x1x8x4x2x4x16x4xi32> | |
scf.yield %74 : tensor<1x1x8x4x2x4x16x4xi32> | |
} | |
flow.dispatch.tensor.store %69, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
return | |
} | |
// -----// IR Dump After ConcretizeMmaShapesPass (iree-gpu-concretize-mma-shapes) //----- // | |
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} { | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%c32_i64 = arith.constant 32 : i64 | |
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32 | |
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} | |
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} | |
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8> | |
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8> | |
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32> | |
%69 = scf.for %arg0 = %c0 to %42 step %c1 iter_args(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) { | |
%extracted_slice = tensor.extract_slice %66[0, %arg0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x8x4x16x2x8xi8> | |
%70 = tensor.empty() : tensor<1x1x8x4x16x2x8xi8> | |
%71 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x8x4x16x2x8xi8>) outs(%70 : tensor<1x1x8x4x16x2x8xi8>) -> tensor<1x1x8x4x16x2x8xi8> | |
%extracted_slice_0 = tensor.extract_slice %67[0, %arg0, 0, 0, 0, 0, 0, 0] [1, 1, 4, 2, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x4x2x4x16x2x8xi8> | |
%72 = tensor.empty() : tensor<1x1x4x2x4x16x2x8xi8> | |
%73 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_0 : tensor<1x1x4x2x4x16x2x8xi8>) outs(%72 : tensor<1x1x4x2x4x16x2x8xi8>) -> tensor<1x1x4x2x4x16x2x8xi8> | |
%74 = iree_gpu.multi_mma %71, %73, %arg1 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 1], workgroup = [1, 1, 0]}>} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8> into tensor<1x1x8x4x2x4x16x4xi32> | |
scf.yield %74 : tensor<1x1x8x4x2x4x16x4xi32> | |
} | |
flow.dispatch.tensor.store %69, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
return | |
} | |
// -----// IR Dump After PropagateReshapesByExpansionPass (iree-codegen-propagate-reshapes-by-expansion) //----- // | |
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} { | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%c32_i64 = arith.constant 32 : i64 | |
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32 | |
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} | |
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} | |
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8> | |
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8> | |
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32> | |
%69 = scf.for %arg0 = %c0 to %42 step %c1 iter_args(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) { | |
%extracted_slice = tensor.extract_slice %66[0, %arg0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x8x4x16x2x8xi8> | |
%70 = tensor.empty() : tensor<1x1x8x4x16x2x8xi8> | |
%71 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x8x4x16x2x8xi8>) outs(%70 : tensor<1x1x8x4x16x2x8xi8>) -> tensor<1x1x8x4x16x2x8xi8> | |
%extracted_slice_0 = tensor.extract_slice %67[0, %arg0, 0, 0, 0, 0, 0, 0] [1, 1, 4, 2, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x4x2x4x16x2x8xi8> | |
%72 = tensor.empty() : tensor<1x1x4x2x4x16x2x8xi8> | |
%73 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_0 : tensor<1x1x4x2x4x16x2x8xi8>) outs(%72 : tensor<1x1x4x2x4x16x2x8xi8>) -> tensor<1x1x4x2x4x16x2x8xi8> | |
%74 = iree_gpu.multi_mma %71, %73, %arg1 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 1], workgroup = [1, 1, 0]}>} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8> into tensor<1x1x8x4x2x4x16x4xi32> | |
scf.yield %74 : tensor<1x1x8x4x2x4x16x4xi32> | |
} | |
flow.dispatch.tensor.store %69, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} { | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%c32_i64 = arith.constant 32 : i64 | |
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32 | |
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} | |
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} | |
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8> | |
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8> | |
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32> | |
%69 = scf.for %arg0 = %c0 to %42 step %c1 iter_args(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) { | |
%extracted_slice = tensor.extract_slice %66[0, %arg0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x8x4x16x2x8xi8> | |
%70 = tensor.empty() : tensor<1x1x8x4x16x2x8xi8> | |
%71 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x8x4x16x2x8xi8>) outs(%70 : tensor<1x1x8x4x16x2x8xi8>) -> tensor<1x1x8x4x16x2x8xi8> | |
%extracted_slice_0 = tensor.extract_slice %67[0, %arg0, 0, 0, 0, 0, 0, 0] [1, 1, 4, 2, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x4x2x4x16x2x8xi8> | |
%72 = tensor.empty() : tensor<1x1x4x2x4x16x2x8xi8> | |
%73 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_0 : tensor<1x1x4x2x4x16x2x8xi8>) outs(%72 : tensor<1x1x4x2x4x16x2x8xi8>) -> tensor<1x1x4x2x4x16x2x8xi8> | |
%74 = iree_gpu.multi_mma %71, %73, %arg1 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 1], workgroup = [1, 1, 0]}>} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8> into tensor<1x1x8x4x2x4x16x4xi32> | |
scf.yield %74 : tensor<1x1x8x4x2x4x16x4xi32> | |
} | |
flow.dispatch.tensor.store %69, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} { | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%c32_i64 = arith.constant 32 : i64 | |
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32 | |
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} | |
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} | |
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8> | |
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8> | |
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32> | |
%69 = scf.for %arg0 = %c0 to %42 step %c1 iter_args(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) { | |
%extracted_slice = tensor.extract_slice %66[0, %arg0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x8x4x16x2x8xi8> | |
%70 = tensor.empty() : tensor<1x1x8x4x16x2x8xi8> | |
%71 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x8x4x16x2x8xi8>) outs(%70 : tensor<1x1x8x4x16x2x8xi8>) -> tensor<1x1x8x4x16x2x8xi8> | |
%extracted_slice_0 = tensor.extract_slice %67[0, %arg0, 0, 0, 0, 0, 0, 0] [1, 1, 4, 2, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x4x2x4x16x2x8xi8> | |
%72 = tensor.empty() : tensor<1x1x4x2x4x16x2x8xi8> | |
%73 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_0 : tensor<1x1x4x2x4x16x2x8xi8>) outs(%72 : tensor<1x1x4x2x4x16x2x8xi8>) -> tensor<1x1x4x2x4x16x2x8xi8> | |
%74 = iree_gpu.multi_mma %71, %73, %arg1 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 1], workgroup = [1, 1, 0]}>} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8> into tensor<1x1x8x4x2x4x16x4xi32> | |
scf.yield %74 : tensor<1x1x8x4x2x4x16x4xi32> | |
} | |
flow.dispatch.tensor.store %69, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
return | |
} | |
// -----// IR Dump After ConvertToDestinationPassingStylePass (iree-codegen-convert-to-destination-passing-style) //----- // | |
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} { | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%c32_i64 = arith.constant 32 : i64 | |
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32 | |
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} | |
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} | |
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8> | |
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8> | |
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32> | |
%69 = scf.for %arg0 = %c0 to %42 step %c1 iter_args(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) { | |
%extracted_slice = tensor.extract_slice %66[0, %arg0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x8x4x16x2x8xi8> | |
%70 = tensor.empty() : tensor<1x1x8x4x16x2x8xi8> | |
%71 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x8x4x16x2x8xi8>) outs(%70 : tensor<1x1x8x4x16x2x8xi8>) -> tensor<1x1x8x4x16x2x8xi8> | |
%extracted_slice_0 = tensor.extract_slice %67[0, %arg0, 0, 0, 0, 0, 0, 0] [1, 1, 4, 2, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x4x2x4x16x2x8xi8> | |
%72 = tensor.empty() : tensor<1x1x4x2x4x16x2x8xi8> | |
%73 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_0 : tensor<1x1x4x2x4x16x2x8xi8>) outs(%72 : tensor<1x1x4x2x4x16x2x8xi8>) -> tensor<1x1x4x2x4x16x2x8xi8> | |
%74 = iree_gpu.multi_mma %71, %73, %arg1 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 1], workgroup = [1, 1, 0]}>} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8> into tensor<1x1x8x4x2x4x16x4xi32> | |
scf.yield %74 : tensor<1x1x8x4x2x4x16x4xi32> | |
} | |
flow.dispatch.tensor.store %69, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
return | |
} | |
// -----// IR Dump After GPUApplyTilingLevelPass (iree-codegen-gpu-apply-tiling-level) //----- // | |
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} { | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%c32_i64 = arith.constant 32 : i64 | |
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32 | |
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} | |
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} | |
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8> | |
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8> | |
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32> | |
%69 = scf.for %arg0 = %c0 to %42 step %c1 iter_args(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) { | |
%70 = tensor.empty() : tensor<1x1x8x4x16x2x8xi8> | |
%71 = scf.forall (%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8) = (0, 0, 0, 0, 0, 0, 0) to (1, 1, 8, 4, 16, 2, 8) step (1, 1, 1, 1, 1, 2, 8) shared_outs(%arg9 = %70) -> (tensor<1x1x8x4x16x2x8xi8>) { | |
%75 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg3, %arg0] | |
%extracted_slice = tensor.extract_slice %66[%arg2, %75, %arg4, %arg5, %arg6, %arg7, %arg8] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8> | |
%extracted_slice_0 = tensor.extract_slice %arg9[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8> | |
%76 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x1x1x2x8xi8>) outs(%extracted_slice_0 : tensor<1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x2x8xi8> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %76 into %arg9[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8> | |
} | |
} {mapping = [#gpu.thread<linear_dim_6>, #gpu.thread<linear_dim_5>, #gpu.thread<linear_dim_4>, #gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]} | |
%72 = tensor.empty() : tensor<1x1x4x2x4x16x2x8xi8> | |
%73 = scf.forall (%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9) = (0, 0, 0, 0, 0, 0, 0, 0) to (1, 1, 4, 2, 4, 16, 2, 8) step (1, 1, 1, 1, 1, 1, 2, 8) shared_outs(%arg10 = %72) -> (tensor<1x1x4x2x4x16x2x8xi8>) { | |
%75 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg3, %arg0] | |
%extracted_slice = tensor.extract_slice %67[%arg2, %75, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8> | |
%extracted_slice_0 = tensor.extract_slice %arg10[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8> | |
%76 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x1x1x1x2x8xi8>) outs(%extracted_slice_0 : tensor<1x1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x1x2x8xi8> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %76 into %arg10[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8> | |
} | |
} {mapping = [#gpu.thread<linear_dim_7>, #gpu.thread<linear_dim_6>, #gpu.thread<linear_dim_5>, #gpu.thread<linear_dim_4>, #gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]} | |
%74 = iree_gpu.multi_mma %71, %73, %arg1 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 1], workgroup = [1, 1, 0]}>} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8> into tensor<1x1x8x4x2x4x16x4xi32> | |
scf.yield %74 : tensor<1x1x8x4x2x4x16x4xi32> | |
} | |
flow.dispatch.tensor.store %69, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} { | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%c32_i64 = arith.constant 32 : i64 | |
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32 | |
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} | |
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} | |
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8> | |
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8> | |
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32> | |
%69 = scf.for %arg0 = %c0 to %42 step %c1 iter_args(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) { | |
%70 = tensor.empty() : tensor<1x1x8x4x16x2x8xi8> | |
%71 = scf.forall (%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8) = (0, 0, 0, 0, 0, 0, 0) to (1, 1, 8, 4, 16, 2, 8) step (1, 1, 1, 1, 1, 2, 8) shared_outs(%arg9 = %70) -> (tensor<1x1x8x4x16x2x8xi8>) { | |
%75 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg3, %arg0] | |
%extracted_slice = tensor.extract_slice %66[%arg2, %75, %arg4, %arg5, %arg6, %arg7, %arg8] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8> | |
%extracted_slice_0 = tensor.extract_slice %arg9[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8> | |
%76 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x1x1x2x8xi8>) outs(%extracted_slice_0 : tensor<1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x2x8xi8> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %76 into %arg9[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8> | |
} | |
} {mapping = [#gpu.thread<linear_dim_6>, #gpu.thread<linear_dim_5>, #gpu.thread<linear_dim_4>, #gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]} | |
%72 = tensor.empty() : tensor<1x1x4x2x4x16x2x8xi8> | |
%73 = scf.forall (%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9) = (0, 0, 0, 0, 0, 0, 0, 0) to (1, 1, 4, 2, 4, 16, 2, 8) step (1, 1, 1, 1, 1, 1, 2, 8) shared_outs(%arg10 = %72) -> (tensor<1x1x4x2x4x16x2x8xi8>) { | |
%75 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg3, %arg0] | |
%extracted_slice = tensor.extract_slice %67[%arg2, %75, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8> | |
%extracted_slice_0 = tensor.extract_slice %arg10[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8> | |
%76 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x1x1x1x2x8xi8>) outs(%extracted_slice_0 : tensor<1x1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x1x2x8xi8> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %76 into %arg10[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8> | |
} | |
} {mapping = [#gpu.thread<linear_dim_7>, #gpu.thread<linear_dim_6>, #gpu.thread<linear_dim_5>, #gpu.thread<linear_dim_4>, #gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]} | |
%74 = iree_gpu.multi_mma %71, %73, %arg1 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 1], workgroup = [1, 1, 0]}>} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8> into tensor<1x1x8x4x2x4x16x4xi32> | |
scf.yield %74 : tensor<1x1x8x4x2x4x16x4xi32> | |
} | |
flow.dispatch.tensor.store %69, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} { | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%c32_i64 = arith.constant 32 : i64 | |
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32 | |
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} | |
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} | |
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8> | |
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8> | |
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32> | |
%69 = scf.for %arg0 = %c0 to %42 step %c1 iter_args(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) { | |
%70 = tensor.empty() : tensor<1x1x8x4x16x2x8xi8> | |
%71 = scf.forall (%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8) = (0, 0, 0, 0, 0, 0, 0) to (1, 1, 8, 4, 16, 2, 8) step (1, 1, 1, 1, 1, 2, 8) shared_outs(%arg9 = %70) -> (tensor<1x1x8x4x16x2x8xi8>) { | |
%75 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg3, %arg0] | |
%extracted_slice = tensor.extract_slice %66[%arg2, %75, %arg4, %arg5, %arg6, %arg7, %arg8] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8> | |
%extracted_slice_0 = tensor.extract_slice %arg9[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8> | |
%76 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x1x1x2x8xi8>) outs(%extracted_slice_0 : tensor<1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x2x8xi8> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %76 into %arg9[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8> | |
} | |
} {mapping = [#gpu.thread<linear_dim_6>, #gpu.thread<linear_dim_5>, #gpu.thread<linear_dim_4>, #gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]} | |
%72 = tensor.empty() : tensor<1x1x4x2x4x16x2x8xi8> | |
%73 = scf.forall (%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9) = (0, 0, 0, 0, 0, 0, 0, 0) to (1, 1, 4, 2, 4, 16, 2, 8) step (1, 1, 1, 1, 1, 1, 2, 8) shared_outs(%arg10 = %72) -> (tensor<1x1x4x2x4x16x2x8xi8>) { | |
%75 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg3, %arg0] | |
%extracted_slice = tensor.extract_slice %67[%arg2, %75, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8> | |
%extracted_slice_0 = tensor.extract_slice %arg10[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8> | |
%76 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x1x1x1x2x8xi8>) outs(%extracted_slice_0 : tensor<1x1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x1x2x8xi8> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %76 into %arg10[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8> | |
} | |
} {mapping = [#gpu.thread<linear_dim_7>, #gpu.thread<linear_dim_6>, #gpu.thread<linear_dim_5>, #gpu.thread<linear_dim_4>, #gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]} | |
%74 = iree_gpu.multi_mma %71, %73, %arg1 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 1], workgroup = [1, 1, 0]}>} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8> into tensor<1x1x8x4x2x4x16x4xi32> | |
scf.yield %74 : tensor<1x1x8x4x2x4x16x4xi32> | |
} | |
flow.dispatch.tensor.store %69, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
return | |
} | |
// -----// IR Dump After GPUApplyTilingLevelPass (iree-codegen-gpu-apply-tiling-level) //----- // | |
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} { | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%c32_i64 = arith.constant 32 : i64 | |
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32 | |
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} | |
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} | |
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8> | |
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8> | |
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32> | |
%69 = scf.for %arg0 = %c0 to %42 step %c1 iter_args(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) { | |
%70 = tensor.empty() : tensor<1x1x8x4x16x2x8xi8> | |
%71 = scf.forall (%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8) = (0, 0, 0, 0, 0, 0, 0) to (1, 1, 8, 4, 16, 2, 8) step (1, 1, 1, 1, 1, 2, 8) shared_outs(%arg9 = %70) -> (tensor<1x1x8x4x16x2x8xi8>) { | |
%75 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg3, %arg0] | |
%extracted_slice = tensor.extract_slice %66[%arg2, %75, %arg4, %arg5, %arg6, %arg7, %arg8] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8> | |
%extracted_slice_0 = tensor.extract_slice %arg9[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8> | |
%76 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x1x1x2x8xi8>) outs(%extracted_slice_0 : tensor<1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x2x8xi8> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %76 into %arg9[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8> | |
} | |
} {mapping = [#gpu.thread<linear_dim_6>, #gpu.thread<linear_dim_5>, #gpu.thread<linear_dim_4>, #gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]} | |
%72 = tensor.empty() : tensor<1x1x4x2x4x16x2x8xi8> | |
%73 = scf.forall (%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9) = (0, 0, 0, 0, 0, 0, 0, 0) to (1, 1, 4, 2, 4, 16, 2, 8) step (1, 1, 1, 1, 1, 1, 2, 8) shared_outs(%arg10 = %72) -> (tensor<1x1x4x2x4x16x2x8xi8>) { | |
%75 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg3, %arg0] | |
%extracted_slice = tensor.extract_slice %67[%arg2, %75, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8> | |
%extracted_slice_0 = tensor.extract_slice %arg10[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8> | |
%76 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x1x1x1x2x8xi8>) outs(%extracted_slice_0 : tensor<1x1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x1x2x8xi8> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %76 into %arg10[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8> | |
} | |
} {mapping = [#gpu.thread<linear_dim_7>, #gpu.thread<linear_dim_6>, #gpu.thread<linear_dim_5>, #gpu.thread<linear_dim_4>, #gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]} | |
%74 = iree_gpu.multi_mma %71, %73, %arg1 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>, lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 1], workgroup = [1, 1, 0]}>} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8> into tensor<1x1x8x4x2x4x16x4xi32> | |
scf.yield %74 : tensor<1x1x8x4x2x4x16x4xi32> | |
} | |
flow.dispatch.tensor.store %69, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
return | |
} | |
// -----// IR Dump After DistributeMmaToLanesPass (iree-gpu-distribute-mma-to-lanes) //----- // | |
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} { | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%c32_i64 = arith.constant 32 : i64 | |
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32 | |
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} | |
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} | |
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8> | |
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8> | |
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32> | |
%69 = scf.for %arg0 = %c0 to %42 step %c1 iter_args(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) { | |
%70 = tensor.empty() : tensor<1x1x8x4x16x2x8xi8> | |
%71 = scf.forall (%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8) = (0, 0, 0, 0, 0, 0, 0) to (1, 1, 8, 4, 16, 2, 8) step (1, 1, 1, 1, 1, 2, 8) shared_outs(%arg9 = %70) -> (tensor<1x1x8x4x16x2x8xi8>) { | |
%75 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg3, %arg0] | |
%extracted_slice = tensor.extract_slice %66[%arg2, %75, %arg4, %arg5, %arg6, %arg7, %arg8] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8> | |
%extracted_slice_0 = tensor.extract_slice %arg9[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8> | |
%76 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x1x1x2x8xi8>) outs(%extracted_slice_0 : tensor<1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x2x8xi8> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %76 into %arg9[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8> | |
} | |
} {mapping = [#gpu.thread<linear_dim_6>, #gpu.thread<linear_dim_5>, #gpu.thread<linear_dim_4>, #gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]} | |
%72 = tensor.empty() : tensor<1x1x4x2x4x16x2x8xi8> | |
%73 = scf.forall (%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9) = (0, 0, 0, 0, 0, 0, 0, 0) to (1, 1, 4, 2, 4, 16, 2, 8) step (1, 1, 1, 1, 1, 1, 2, 8) shared_outs(%arg10 = %72) -> (tensor<1x1x4x2x4x16x2x8xi8>) { | |
%75 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg3, %arg0] | |
%extracted_slice = tensor.extract_slice %67[%arg2, %75, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8> | |
%extracted_slice_0 = tensor.extract_slice %arg10[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8> | |
%76 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x1x1x1x2x8xi8>) outs(%extracted_slice_0 : tensor<1x1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x1x2x8xi8> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %76 into %arg10[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8> | |
} | |
} {mapping = [#gpu.thread<linear_dim_7>, #gpu.thread<linear_dim_6>, #gpu.thread<linear_dim_5>, #gpu.thread<linear_dim_4>, #gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]} | |
%74 = scf.forall (%arg2) in (256) shared_outs(%arg3 = %arg1) -> (tensor<1x1x8x4x2x4x16x4xi32>) { | |
%75 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg2) | |
%76:5 = affine.delinearize_index %75 into (%c1, %c4, %c16, %c1, %c1) : index, index, index, index, index | |
%extracted_slice = tensor.extract_slice %71[0, 0, %76#0, %76#1, %76#2, %76#3, %76#4] [1, 1, 8, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x8x1x1x2x8xi8> | |
%77 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg2) | |
%78:6 = affine.delinearize_index %77 into (%c4, %c1, %c4, %c16, %c1, %c1) : index, index, index, index, index, index | |
%extracted_slice_0 = tensor.extract_slice %73[0, 0, %78#0, %78#1, %78#2, %78#3, %78#4, %78#5] [1, 1, 1, 2, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x2x1x1x2x8xi8> | |
%79 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg2) | |
%80:6 = affine.delinearize_index %79 into (%c1, %c4, %c1, %c4, %c16, %c1) : index, index, index, index, index, index | |
%extracted_slice_1 = tensor.extract_slice %arg3[0, 0, %80#0, %80#1, %80#2, %80#3, %80#4, %80#5] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32> | |
%81 = iree_gpu.multi_mma %extracted_slice, %extracted_slice_0, %extracted_slice_1 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<1x1x8x1x1x2x8xi8>, tensor<1x1x1x2x1x1x2x8xi8> into tensor<1x1x8x1x2x1x1x4xi32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %81 into %arg3[0, 0, %80#0, %80#1, %80#2, %80#3, %80#4, %80#5] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32> | |
} | |
} {mapping = [#gpu.thread<linear_dim_0>]} | |
scf.yield %74 : tensor<1x1x8x4x2x4x16x4xi32> | |
} | |
flow.dispatch.tensor.store %69, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
return | |
} | |
// -----// IR Dump After NormalizeLoopBoundsPass (iree-codegen-normalize-loop-bounds) //----- // | |
// @foo_dispatch_6 as printed after the loop-bound normalization pass.
// The translation_info attribute selects the LLVMGPUTileAndFuse pipeline with a
// 256x1x1 workgroup and a subgroup size of 64. The body loads three data-tiled
// operands (two i8 inputs, one i32 in/out tensor), loops over dynamic extent %42,
// copies one tile of each input per iteration, and feeds them to
// iree_gpu.multi_mma (MFMA_I32_16x16x32_I8) across 256 threads.
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} { | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%c32_i64 = arith.constant 32 : i64 | |
// Load the 18 i32 dispatch constants (ordinals 0-17) declared by the pipeline layout.
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32 | |
// Each consecutive pair of i32 constants is recombined into one 64-bit value,
// lo | (hi << 32), and cast to index. The nine results are used below as:
//   %22, %27, %32 : byte offsets of the three binding subspans
//   %37, %42      : dynamic dims of the first i8 operand
//   %47, %52      : dynamic dims of the second i8 operand
//   %57, %62      : dynamic dims of the i32 in/out operand
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
// Operand views. Both read-only i8 operands (%63, %64) are subspans of binding 0
// at different offsets; the read-write i32 operand (%65) is binding 1.
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} | |
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} | |
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
// Per-workgroup loads: workgroup_id_y selects the outermost index of %63,
// workgroup_id_x selects the outermost index of %64, and the pair selects one
// 8x4x2x4x16x4 i32 tile of %65.
// NOTE(review): the load from %64 uses %42 as its dim-1 size although that
// dimension is declared with %52; presumably the two are equal at runtime - confirm.
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8> | |
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8> | |
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32> | |
// Loop over dim 1 of the two i8 operands (0 .. %42, step 1), carrying the i32
// tile as the iter_arg %arg1.
%69 = scf.for %arg0 = %c0 to %42 step %c1 iter_args(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) { | |
%70 = tensor.empty() : tensor<1x1x8x4x16x2x8xi8> | |
// Copy of the first operand's tile for iteration %arg0. The forall uses the
// normalized "in (...)" form (zero lower bound, unit step); the scaling that
// the steps of 2 and 8 used to provide is now expressed by the affine.apply
// multipliers on %arg7 and %arg8. The remaining affine.apply ops are identity maps.
%71 = scf.forall (%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8) in (1, 1, 8, 4, 16, 1, 1) shared_outs(%arg9 = %70) -> (tensor<1x1x8x4x16x2x8xi8>) { | |
%75 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg8) | |
%76 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg7) | |
%77 = affine.apply affine_map<(d0) -> (d0)>(%arg6) | |
%78 = affine.apply affine_map<(d0) -> (d0)>(%arg5) | |
%79 = affine.apply affine_map<(d0) -> (d0)>(%arg4) | |
%80 = affine.apply affine_map<(d0) -> (d0)>(%arg3) | |
%81 = affine.apply affine_map<(d0) -> (d0)>(%arg2) | |
// Source index along dim 1 = forall index + outer loop index %arg0.
%82 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%80, %arg0] | |
%extracted_slice = tensor.extract_slice %66[%81, %82, %79, %78, %77, %76, %75] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8> | |
%extracted_slice_0 = tensor.extract_slice %arg9[%81, %80, %79, %78, %77, %76, %75] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8> | |
// Each forall iteration copies one 2x8 i8 slice.
%83 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x1x1x2x8xi8>) outs(%extracted_slice_0 : tensor<1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x2x8xi8> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %83 into %arg9[%81, %80, %79, %78, %77, %76, %75] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8> | |
} | |
} {mapping = [#gpu.thread<linear_dim_6>, #gpu.thread<linear_dim_5>, #gpu.thread<linear_dim_4>, #gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]} | |
%72 = tensor.empty() : tensor<1x1x4x2x4x16x2x8xi8> | |
// Same copy pattern for the second operand (one extra tile dimension).
%73 = scf.forall (%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9) in (1, 1, 4, 2, 4, 16, 1, 1) shared_outs(%arg10 = %72) -> (tensor<1x1x4x2x4x16x2x8xi8>) { | |
%75 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg9) | |
%76 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg8) | |
%77 = affine.apply affine_map<(d0) -> (d0)>(%arg7) | |
%78 = affine.apply affine_map<(d0) -> (d0)>(%arg6) | |
%79 = affine.apply affine_map<(d0) -> (d0)>(%arg5) | |
%80 = affine.apply affine_map<(d0) -> (d0)>(%arg4) | |
%81 = affine.apply affine_map<(d0) -> (d0)>(%arg3) | |
%82 = affine.apply affine_map<(d0) -> (d0)>(%arg2) | |
%83 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%81, %arg0] | |
%extracted_slice = tensor.extract_slice %67[%82, %83, %80, %79, %78, %77, %76, %75] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8> | |
%extracted_slice_0 = tensor.extract_slice %arg10[%82, %81, %80, %79, %78, %77, %76, %75] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8> | |
%84 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x1x1x1x2x8xi8>) outs(%extracted_slice_0 : tensor<1x1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x1x2x8xi8> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %84 into %arg10[%82, %81, %80, %79, %78, %77, %76, %75] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8> | |
} | |
} {mapping = [#gpu.thread<linear_dim_7>, #gpu.thread<linear_dim_6>, #gpu.thread<linear_dim_5>, #gpu.thread<linear_dim_4>, #gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]} | |
// MMA stage: one forall iteration per thread of the 256-wide workgroup
// (mapped to linear_dim_0). The linear id is reduced (mod 64 for the first
// operand, mod 256 for the other two) and delinearized to pick each
// thread's slice of the two copied tiles and of the i32 tile %arg3.
%74 = scf.forall (%arg2) in (256) shared_outs(%arg3 = %arg1) -> (tensor<1x1x8x4x2x4x16x4xi32>) { | |
%75 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg2) | |
%76:5 = affine.delinearize_index %75 into (%c1, %c4, %c16, %c1, %c1) : index, index, index, index, index | |
%extracted_slice = tensor.extract_slice %71[0, 0, %76#0, %76#1, %76#2, %76#3, %76#4] [1, 1, 8, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x8x1x1x2x8xi8> | |
%77 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg2) | |
%78:6 = affine.delinearize_index %77 into (%c4, %c1, %c4, %c16, %c1, %c1) : index, index, index, index, index, index | |
%extracted_slice_0 = tensor.extract_slice %73[0, 0, %78#0, %78#1, %78#2, %78#3, %78#4, %78#5] [1, 1, 1, 2, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x2x1x1x2x8xi8> | |
// %79 repeats the computation of %77; it is still a separate op in this dump.
%79 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg2) | |
%80:6 = affine.delinearize_index %79 into (%c1, %c4, %c1, %c4, %c16, %c1) : index, index, index, index, index, index | |
%extracted_slice_1 = tensor.extract_slice %arg3[0, 0, %80#0, %80#1, %80#2, %80#3, %80#4, %80#5] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32> | |
// Data-tiled MMA on the per-thread slices. Layout attribute: intrinsic
// MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2,
// unroll_n_to_subgroups = 4, unroll_k = 2. The result has the type of the
// third operand and is written back to the same position in %arg3.
%81 = iree_gpu.multi_mma %extracted_slice, %extracted_slice_0, %extracted_slice_1 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<1x1x8x1x1x2x8xi8>, tensor<1x1x1x2x1x1x2x8xi8> into tensor<1x1x8x1x2x1x1x4xi32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %81 into %arg3[0, 0, %80#0, %80#1, %80#2, %80#3, %80#4, %80#5] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32> | |
} | |
} {mapping = [#gpu.thread<linear_dim_0>]} | |
scf.yield %74 : tensor<1x1x8x4x2x4x16x4xi32> | |
} | |
// Write the final i32 tile back to the location it was loaded from.
flow.dispatch.tensor.store %69, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
// @foo_dispatch_6 as printed after the canonicalize pass.
// Same structure as before canonicalization: decode 18 i32 dispatch constants
// into nine index values, load the three data-tiled operands, then loop over
// %42 copying one tile of each i8 operand and running iree_gpu.multi_mma on
// per-thread slices. In this dump the identity affine.apply ops are gone and
// the delinearize_index ops no longer carry unit-sized basis entries.
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} { | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%c32_i64 = arith.constant 32 : i64 | |
// Load the 18 i32 dispatch constants (ordinals 0-17).
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32 | |
// Pairwise recombination lo | (hi << 32) -> index. Results: %22/%27/%32 are
// the subspan offsets; %37,%42 / %47,%52 / %57,%62 are the dynamic dims of
// the three operands.
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
// Operand views: two read-only i8 subspans of binding 0 (different offsets)
// and one read-write i32 subspan of binding 1.
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} | |
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} | |
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
// Per-workgroup loads, indexed by workgroup_id_y (first operand),
// workgroup_id_x (second operand) and both (i32 tile).
// NOTE(review): the %64 load takes %42 as its dim-1 size while the type
// declares that dim with %52 - presumably equal at runtime; confirm.
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8> | |
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8> | |
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32> | |
// Loop over 0 .. %42 (step 1) with the i32 tile carried as %arg1.
%69 = scf.for %arg0 = %c0 to %42 step %c1 iter_args(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) { | |
%70 = tensor.empty() : tensor<1x1x8x4x16x2x8xi8> | |
// Tile copy of the first operand. Only the two scaling maps (x8, x2) and
// the dim-1 offset (%arg3 + %arg0) remain as affine.apply ops; the other
// slice indices use the forall induction variables directly.
%71 = scf.forall (%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8) in (1, 1, 8, 4, 16, 1, 1) shared_outs(%arg9 = %70) -> (tensor<1x1x8x4x16x2x8xi8>) { | |
%75 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg8) | |
%76 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg7) | |
%77 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg3, %arg0] | |
%extracted_slice = tensor.extract_slice %66[%arg2, %77, %arg4, %arg5, %arg6, %76, %75] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8> | |
%extracted_slice_0 = tensor.extract_slice %arg9[%arg2, %arg3, %arg4, %arg5, %arg6, %76, %75] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8> | |
// Each forall iteration copies one 2x8 i8 slice.
%78 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x1x1x2x8xi8>) outs(%extracted_slice_0 : tensor<1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x2x8xi8> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %78 into %arg9[%arg2, %arg3, %arg4, %arg5, %arg6, %76, %75] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8> | |
} | |
} {mapping = [#gpu.thread<linear_dim_6>, #gpu.thread<linear_dim_5>, #gpu.thread<linear_dim_4>, #gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]} | |
%72 = tensor.empty() : tensor<1x1x4x2x4x16x2x8xi8> | |
// Same copy pattern for the second operand (one extra tile dimension).
%73 = scf.forall (%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9) in (1, 1, 4, 2, 4, 16, 1, 1) shared_outs(%arg10 = %72) -> (tensor<1x1x4x2x4x16x2x8xi8>) { | |
%75 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg9) | |
%76 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg8) | |
%77 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg3, %arg0] | |
%extracted_slice = tensor.extract_slice %67[%arg2, %77, %arg4, %arg5, %arg6, %arg7, %76, %75] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8> | |
%extracted_slice_0 = tensor.extract_slice %arg10[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %76, %75] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8> | |
%78 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x1x1x1x2x8xi8>) outs(%extracted_slice_0 : tensor<1x1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x1x2x8xi8> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %78 into %arg10[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %76, %75] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8> | |
} | |
} {mapping = [#gpu.thread<linear_dim_7>, #gpu.thread<linear_dim_6>, #gpu.thread<linear_dim_5>, #gpu.thread<linear_dim_4>, #gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]} | |
// MMA stage over the 256 threads (linear_dim_0). The delinearizations now
// use only non-unit bases - (4, 16) for the first operand, (4, 4, 16) for
// the other two - and the slice positions that had unit-sized bases are
// literal 0.
%74 = scf.forall (%arg2) in (256) shared_outs(%arg3 = %arg1) -> (tensor<1x1x8x4x2x4x16x4xi32>) { | |
%75 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg2) | |
%76:2 = affine.delinearize_index %75 into (%c4, %c16) : index, index | |
%extracted_slice = tensor.extract_slice %71[0, 0, 0, %76#0, %76#1, 0, 0] [1, 1, 8, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x8x1x1x2x8xi8> | |
%77 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg2) | |
%78:3 = affine.delinearize_index %77 into (%c4, %c4, %c16) : index, index, index | |
%extracted_slice_0 = tensor.extract_slice %73[0, 0, %78#0, 0, %78#1, %78#2, 0, 0] [1, 1, 1, 2, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x2x1x1x2x8xi8> | |
// %79/%80 are textually identical to %77/%78 in this dump (not yet deduplicated).
%79 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg2) | |
%80:3 = affine.delinearize_index %79 into (%c4, %c4, %c16) : index, index, index | |
%extracted_slice_1 = tensor.extract_slice %arg3[0, 0, 0, %80#0, 0, %80#1, %80#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32> | |
// Data-tiled MMA on the per-thread slices (intrinsic MFMA_I32_16x16x32_I8,
// unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2).
// The result is inserted back at the position the third operand came from.
%81 = iree_gpu.multi_mma %extracted_slice, %extracted_slice_0, %extracted_slice_1 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<1x1x8x1x1x2x8xi8>, tensor<1x1x1x2x1x1x2x8xi8> into tensor<1x1x8x1x2x1x1x4xi32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %81 into %arg3[0, 0, 0, %80#0, 0, %80#1, %80#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32> | |
} | |
} {mapping = [#gpu.thread<linear_dim_0>]} | |
scf.yield %74 : tensor<1x1x8x4x2x4x16x4xi32> | |
} | |
// Write the final i32 tile back to the location it was loaded from.
flow.dispatch.tensor.store %69, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} { | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%c32_i64 = arith.constant 32 : i64 | |
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32 | |
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} | |
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} | |
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8> | |
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8> | |
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32> | |
%69 = scf.for %arg0 = %c0 to %42 step %c1 iter_args(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) { | |
%70 = tensor.empty() : tensor<1x1x8x4x16x2x8xi8> | |
%71 = scf.forall (%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8) in (1, 1, 8, 4, 16, 1, 1) shared_outs(%arg9 = %70) -> (tensor<1x1x8x4x16x2x8xi8>) { | |
%75 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg8) | |
%76 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg7) | |
%77 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg3, %arg0] | |
%extracted_slice = tensor.extract_slice %66[%arg2, %77, %arg4, %arg5, %arg6, %76, %75] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8> | |
%extracted_slice_0 = tensor.extract_slice %arg9[%arg2, %arg3, %arg4, %arg5, %arg6, %76, %75] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8> | |
%78 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x1x1x2x8xi8>) outs(%extracted_slice_0 : tensor<1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x2x8xi8> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %78 into %arg9[%arg2, %arg3, %arg4, %arg5, %arg6, %76, %75] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8> | |
} | |
} {mapping = [#gpu.thread<linear_dim_6>, #gpu.thread<linear_dim_5>, #gpu.thread<linear_dim_4>, #gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]} | |
%72 = tensor.empty() : tensor<1x1x4x2x4x16x2x8xi8> | |
%73 = scf.forall (%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9) in (1, 1, 4, 2, 4, 16, 1, 1) shared_outs(%arg10 = %72) -> (tensor<1x1x4x2x4x16x2x8xi8>) { | |
%75 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg9) | |
%76 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg8) | |
%77 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg3, %arg0] | |
%extracted_slice = tensor.extract_slice %67[%arg2, %77, %arg4, %arg5, %arg6, %arg7, %76, %75] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8> | |
%extracted_slice_0 = tensor.extract_slice %arg10[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %76, %75] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8> | |
%78 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x1x1x1x2x8xi8>) outs(%extracted_slice_0 : tensor<1x1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x1x2x8xi8> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %78 into %arg10[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %76, %75] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8> | |
} | |
} {mapping = [#gpu.thread<linear_dim_7>, #gpu.thread<linear_dim_6>, #gpu.thread<linear_dim_5>, #gpu.thread<linear_dim_4>, #gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]} | |
%74 = scf.forall (%arg2) in (256) shared_outs(%arg3 = %arg1) -> (tensor<1x1x8x4x2x4x16x4xi32>) { | |
%75 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg2) | |
%76:2 = affine.delinearize_index %75 into (%c4, %c16) : index, index | |
%extracted_slice = tensor.extract_slice %71[0, 0, 0, %76#0, %76#1, 0, 0] [1, 1, 8, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x8x1x1x2x8xi8> | |
%77 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg2) | |
%78:3 = affine.delinearize_index %77 into (%c4, %c4, %c16) : index, index, index | |
%extracted_slice_0 = tensor.extract_slice %73[0, 0, %78#0, 0, %78#1, %78#2, 0, 0] [1, 1, 1, 2, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x2x1x1x2x8xi8> | |
%extracted_slice_1 = tensor.extract_slice %arg3[0, 0, 0, %78#0, 0, %78#1, %78#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32> | |
%79 = iree_gpu.multi_mma %extracted_slice, %extracted_slice_0, %extracted_slice_1 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<1x1x8x1x1x2x8xi8>, tensor<1x1x1x2x1x1x2x8xi8> into tensor<1x1x8x1x2x1x1x4xi32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %79 into %arg3[0, 0, 0, %78#0, 0, %78#1, %78#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32> | |
} | |
} {mapping = [#gpu.thread<linear_dim_0>]} | |
scf.yield %74 : tensor<1x1x8x4x2x4x16x4xi32> | |
} | |
flow.dispatch.tensor.store %69, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
return | |
} | |
// -----// IR Dump After LoopInvariantCodeMotion (loop-invariant-code-motion) //----- // | |
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} { | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%c32_i64 = arith.constant 32 : i64 | |
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32 | |
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} | |
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} | |
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8> | |
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8> | |
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32> | |
%69 = tensor.empty() : tensor<1x1x8x4x16x2x8xi8> | |
%70 = tensor.empty() : tensor<1x1x4x2x4x16x2x8xi8> | |
%71 = scf.for %arg0 = %c0 to %42 step %c1 iter_args(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) { | |
%72 = scf.forall (%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8) in (1, 1, 8, 4, 16, 1, 1) shared_outs(%arg9 = %69) -> (tensor<1x1x8x4x16x2x8xi8>) { | |
%75 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg8) | |
%76 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg7) | |
%77 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg3, %arg0] | |
%extracted_slice = tensor.extract_slice %66[%arg2, %77, %arg4, %arg5, %arg6, %76, %75] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8> | |
%extracted_slice_0 = tensor.extract_slice %arg9[%arg2, %arg3, %arg4, %arg5, %arg6, %76, %75] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8> | |
%78 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x1x1x2x8xi8>) outs(%extracted_slice_0 : tensor<1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x2x8xi8> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %78 into %arg9[%arg2, %arg3, %arg4, %arg5, %arg6, %76, %75] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8> | |
} | |
} {mapping = [#gpu.thread<linear_dim_6>, #gpu.thread<linear_dim_5>, #gpu.thread<linear_dim_4>, #gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]} | |
%73 = scf.forall (%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9) in (1, 1, 4, 2, 4, 16, 1, 1) shared_outs(%arg10 = %70) -> (tensor<1x1x4x2x4x16x2x8xi8>) { | |
%75 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg9) | |
%76 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg8) | |
%77 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg3, %arg0] | |
%extracted_slice = tensor.extract_slice %67[%arg2, %77, %arg4, %arg5, %arg6, %arg7, %76, %75] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8> | |
%extracted_slice_0 = tensor.extract_slice %arg10[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %76, %75] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8> | |
%78 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x1x1x1x2x8xi8>) outs(%extracted_slice_0 : tensor<1x1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x1x2x8xi8> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %78 into %arg10[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %76, %75] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8> | |
} | |
} {mapping = [#gpu.thread<linear_dim_7>, #gpu.thread<linear_dim_6>, #gpu.thread<linear_dim_5>, #gpu.thread<linear_dim_4>, #gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]} | |
%74 = scf.forall (%arg2) in (256) shared_outs(%arg3 = %arg1) -> (tensor<1x1x8x4x2x4x16x4xi32>) { | |
%75 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg2) | |
%76:2 = affine.delinearize_index %75 into (%c4, %c16) : index, index | |
%extracted_slice = tensor.extract_slice %72[0, 0, 0, %76#0, %76#1, 0, 0] [1, 1, 8, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x8x1x1x2x8xi8> | |
%77 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg2) | |
%78:3 = affine.delinearize_index %77 into (%c4, %c4, %c16) : index, index, index | |
%extracted_slice_0 = tensor.extract_slice %73[0, 0, %78#0, 0, %78#1, %78#2, 0, 0] [1, 1, 1, 2, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x2x1x1x2x8xi8> | |
%extracted_slice_1 = tensor.extract_slice %arg3[0, 0, 0, %78#0, 0, %78#1, %78#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32> | |
%79 = iree_gpu.multi_mma %extracted_slice, %extracted_slice_0, %extracted_slice_1 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<1x1x8x1x1x2x8xi8>, tensor<1x1x1x2x1x1x2x8xi8> into tensor<1x1x8x1x2x1x1x4xi32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %79 into %arg3[0, 0, 0, %78#0, 0, %78#1, %78#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32> | |
} | |
} {mapping = [#gpu.thread<linear_dim_0>]} | |
scf.yield %74 : tensor<1x1x8x4x2x4x16x4xi32> | |
} | |
flow.dispatch.tensor.store %71, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
return | |
} | |
// -----// IR Dump After OptimizeTensorInsertExtractSlicesPass (iree-codegen-optimize-tensor-insert-extract-slices) //----- // | |
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} { | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%c32_i64 = arith.constant 32 : i64 | |
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32 | |
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} | |
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} | |
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8> | |
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8> | |
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32> | |
%69 = tensor.empty() : tensor<1x1x8x4x16x2x8xi8> | |
%70 = tensor.empty() : tensor<1x1x4x2x4x16x2x8xi8> | |
%71 = scf.for %arg0 = %c0 to %42 step %c1 iter_args(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) { | |
%72 = scf.forall (%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8) in (1, 1, 8, 4, 16, 1, 1) shared_outs(%arg9 = %69) -> (tensor<1x1x8x4x16x2x8xi8>) { | |
%75 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg8) | |
%76 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg7) | |
%77 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg3, %arg0] | |
%extracted_slice = tensor.extract_slice %66[%arg2, %77, %arg4, %arg5, %arg6, %76, %75] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8> | |
%extracted_slice_0 = tensor.extract_slice %arg9[%arg2, %arg3, %arg4, %arg5, %arg6, %76, %75] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8> | |
%78 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x1x1x2x8xi8>) outs(%extracted_slice_0 : tensor<1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x2x8xi8> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %78 into %arg9[%arg2, %arg3, %arg4, %arg5, %arg6, %76, %75] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8> | |
} | |
} {mapping = [#gpu.thread<linear_dim_6>, #gpu.thread<linear_dim_5>, #gpu.thread<linear_dim_4>, #gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]} | |
%73 = scf.forall (%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9) in (1, 1, 4, 2, 4, 16, 1, 1) shared_outs(%arg10 = %70) -> (tensor<1x1x4x2x4x16x2x8xi8>) { | |
%75 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg9) | |
%76 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg8) | |
%77 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%arg3, %arg0] | |
%extracted_slice = tensor.extract_slice %67[%arg2, %77, %arg4, %arg5, %arg6, %arg7, %76, %75] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8> | |
%extracted_slice_0 = tensor.extract_slice %arg10[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %76, %75] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8> | |
%78 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice : tensor<1x1x1x1x1x1x2x8xi8>) outs(%extracted_slice_0 : tensor<1x1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x1x2x8xi8> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %78 into %arg10[%arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %76, %75] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8> | |
} | |
} {mapping = [#gpu.thread<linear_dim_7>, #gpu.thread<linear_dim_6>, #gpu.thread<linear_dim_5>, #gpu.thread<linear_dim_4>, #gpu.thread<linear_dim_3>, #gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]} | |
%74 = scf.forall (%arg2) in (256) shared_outs(%arg3 = %arg1) -> (tensor<1x1x8x4x2x4x16x4xi32>) { | |
%75 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg2) | |
%76:2 = affine.delinearize_index %75 into (%c4, %c16) : index, index | |
%extracted_slice = tensor.extract_slice %72[0, 0, 0, %76#0, %76#1, 0, 0] [1, 1, 8, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x8x1x1x2x8xi8> | |
%77 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg2) | |
%78:3 = affine.delinearize_index %77 into (%c4, %c4, %c16) : index, index, index | |
%extracted_slice_0 = tensor.extract_slice %73[0, 0, %78#0, 0, %78#1, %78#2, 0, 0] [1, 1, 1, 2, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x2x1x1x2x8xi8> | |
%extracted_slice_1 = tensor.extract_slice %arg3[0, 0, 0, %78#0, 0, %78#1, %78#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32> | |
%79 = iree_gpu.multi_mma %extracted_slice, %extracted_slice_0, %extracted_slice_1 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<1x1x8x1x1x2x8xi8>, tensor<1x1x1x2x1x1x2x8xi8> into tensor<1x1x8x1x2x1x1x4xi32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %79 into %arg3[0, 0, 0, %78#0, 0, %78#1, %78#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32> | |
} | |
} {mapping = [#gpu.thread<linear_dim_0>]} | |
scf.yield %74 : tensor<1x1x8x4x2x4x16x4xi32> | |
} | |
flow.dispatch.tensor.store %71, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
return | |
} | |
// -----// IR Dump After FuseAndHoistParallelLoopsPass (iree-gpu-fuse-and-hoist-parallel-loops) //----- // | |
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} { | |
%c2 = arith.constant 2 : index | |
%c256 = arith.constant 256 : index | |
%c512 = arith.constant 512 : index | |
%c8 = arith.constant 8 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%c32_i64 = arith.constant 32 : i64 | |
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32 | |
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} | |
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} | |
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8> | |
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8> | |
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32> | |
%69 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x8x4x16x2x8xi8> | |
%70 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x4x2x4x16x2x8xi8> | |
%71 = scf.forall (%arg0) in (256) shared_outs(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) { | |
%72 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0) | |
%73:3 = affine.delinearize_index %72 into (%c4, %c4, %c16) : index, index, index | |
%extracted_slice = tensor.extract_slice %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32> | |
%74 = scf.for %arg2 = %c0 to %42 step %c1 iter_args(%arg3 = %extracted_slice) -> (tensor<1x1x8x1x2x1x1x4xi32>) { | |
%75 = iree_gpu.barrier_region ins(%69 : tensor<1x1x8x4x16x2x8xi8>) { | |
^bb0(%arg4: tensor<1x1x8x4x16x2x8xi8>): | |
%80 = scf.for %arg5 = %c0 to %c512 step %c256 iter_args(%arg6 = %arg4) -> (tensor<1x1x8x4x16x2x8xi8>) { | |
%81 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg5, %arg0) | |
%82:7 = affine.delinearize_index %81 into (%c1, %c1, %c8, %c4, %c16, %c1, %c1) : index, index, index, index, index, index, index | |
%83 = affine.apply affine_map<(d0) -> (d0 * 8)>(%82#6) | |
%84 = affine.apply affine_map<(d0) -> (d0 * 2)>(%82#5) | |
%85 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%82#1, %arg2] | |
%extracted_slice_2 = tensor.extract_slice %66[%82#0, %85, %82#2, %82#3, %82#4, %84, %83] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8> | |
%extracted_slice_3 = tensor.extract_slice %arg6[%82#0, %82#1, %82#2, %82#3, %82#4, %84, %83] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8> | |
%86 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<1x1x1x1x1x2x8xi8>) outs(%extracted_slice_3 : tensor<1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x2x8xi8> | |
%inserted_slice = tensor.insert_slice %86 into %arg6[%82#0, %82#1, %82#2, %82#3, %82#4, %84, %83] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8> | |
scf.yield %inserted_slice : tensor<1x1x8x4x16x2x8xi8> | |
} {unroll_loop} | |
iree_gpu.yield %80 : tensor<1x1x8x4x16x2x8xi8> | |
} : tensor<1x1x8x4x16x2x8xi8> | |
%76 = iree_gpu.barrier_region ins(%70 : tensor<1x1x4x2x4x16x2x8xi8>) { | |
^bb0(%arg4: tensor<1x1x4x2x4x16x2x8xi8>): | |
%80 = scf.for %arg5 = %c0 to %c512 step %c256 iter_args(%arg6 = %arg4) -> (tensor<1x1x4x2x4x16x2x8xi8>) { | |
%81 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg5, %arg0) | |
%82:8 = affine.delinearize_index %81 into (%c1, %c1, %c4, %c2, %c4, %c16, %c1, %c1) : index, index, index, index, index, index, index, index | |
%83 = affine.apply affine_map<(d0) -> (d0 * 8)>(%82#7) | |
%84 = affine.apply affine_map<(d0) -> (d0 * 2)>(%82#6) | |
%85 = affine.apply affine_map<()[s0, s1] -> (s0 + s1)>()[%82#1, %arg2] | |
%extracted_slice_2 = tensor.extract_slice %67[%82#0, %85, %82#2, %82#3, %82#4, %82#5, %84, %83] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8> | |
%extracted_slice_3 = tensor.extract_slice %arg6[%82#0, %82#1, %82#2, %82#3, %82#4, %82#5, %84, %83] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8> | |
%86 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<1x1x1x1x1x1x2x8xi8>) outs(%extracted_slice_3 : tensor<1x1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x1x2x8xi8> | |
%inserted_slice = tensor.insert_slice %86 into %arg6[%82#0, %82#1, %82#2, %82#3, %82#4, %82#5, %84, %83] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8> | |
scf.yield %inserted_slice : tensor<1x1x4x2x4x16x2x8xi8> | |
} {unroll_loop} | |
iree_gpu.yield %80 : tensor<1x1x4x2x4x16x2x8xi8> | |
} : tensor<1x1x4x2x4x16x2x8xi8> | |
%77 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0) | |
%78:2 = affine.delinearize_index %77 into (%c4, %c16) : index, index | |
%extracted_slice_0 = tensor.extract_slice %75[0, 0, 0, %78#0, %78#1, 0, 0] [1, 1, 8, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x8x1x1x2x8xi8> | |
%extracted_slice_1 = tensor.extract_slice %76[0, 0, %73#0, 0, %73#1, %73#2, 0, 0] [1, 1, 1, 2, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x2x1x1x2x8xi8> | |
%79 = iree_gpu.multi_mma %extracted_slice_0, %extracted_slice_1, %arg3 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<1x1x8x1x1x2x8xi8>, tensor<1x1x1x2x1x1x2x8xi8> into tensor<1x1x8x1x2x1x1x4xi32> | |
scf.yield %79 : tensor<1x1x8x1x2x1x1x4xi32> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %74 into %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32> | |
} | |
} {mapping = [#gpu.thread<linear_dim_0>]} | |
flow.dispatch.tensor.store %71, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} { | |
%c2 = arith.constant 2 : index | |
%c256 = arith.constant 256 : index | |
%c512 = arith.constant 512 : index | |
%c8 = arith.constant 8 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%c32_i64 = arith.constant 32 : i64 | |
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32 | |
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} | |
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} | |
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8> | |
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8> | |
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32> | |
%69 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x8x4x16x2x8xi8> | |
%70 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x4x2x4x16x2x8xi8> | |
%71 = scf.forall (%arg0) in (256) shared_outs(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) { | |
%72 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0) | |
%73:3 = affine.delinearize_index %72 into (%c4, %c4, %c16) : index, index, index | |
%extracted_slice = tensor.extract_slice %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32> | |
%74 = scf.for %arg2 = %c0 to %42 step %c1 iter_args(%arg3 = %extracted_slice) -> (tensor<1x1x8x1x2x1x1x4xi32>) { | |
%75 = iree_gpu.barrier_region ins(%69 : tensor<1x1x8x4x16x2x8xi8>) { | |
^bb0(%arg4: tensor<1x1x8x4x16x2x8xi8>): | |
%80 = scf.for %arg5 = %c0 to %c512 step %c256 iter_args(%arg6 = %arg4) -> (tensor<1x1x8x4x16x2x8xi8>) { | |
%81 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg5, %arg0) | |
%82:3 = affine.delinearize_index %81 into (%c8, %c4, %c16) : index, index, index | |
%extracted_slice_2 = tensor.extract_slice %66[0, %arg2, %82#0, %82#1, %82#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8> | |
%extracted_slice_3 = tensor.extract_slice %arg6[0, 0, %82#0, %82#1, %82#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8> | |
%83 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<1x1x1x1x1x2x8xi8>) outs(%extracted_slice_3 : tensor<1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x2x8xi8> | |
%inserted_slice = tensor.insert_slice %83 into %arg6[0, 0, %82#0, %82#1, %82#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8> | |
scf.yield %inserted_slice : tensor<1x1x8x4x16x2x8xi8> | |
} {unroll_loop} | |
iree_gpu.yield %80 : tensor<1x1x8x4x16x2x8xi8> | |
} : tensor<1x1x8x4x16x2x8xi8> | |
%76 = iree_gpu.barrier_region ins(%70 : tensor<1x1x4x2x4x16x2x8xi8>) { | |
^bb0(%arg4: tensor<1x1x4x2x4x16x2x8xi8>): | |
%80 = scf.for %arg5 = %c0 to %c512 step %c256 iter_args(%arg6 = %arg4) -> (tensor<1x1x4x2x4x16x2x8xi8>) { | |
%81 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg5, %arg0) | |
%82:4 = affine.delinearize_index %81 into (%c4, %c2, %c4, %c16) : index, index, index, index | |
%extracted_slice_2 = tensor.extract_slice %67[0, %arg2, %82#0, %82#1, %82#2, %82#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8> | |
%extracted_slice_3 = tensor.extract_slice %arg6[0, 0, %82#0, %82#1, %82#2, %82#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8> | |
%83 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<1x1x1x1x1x1x2x8xi8>) outs(%extracted_slice_3 : tensor<1x1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x1x2x8xi8> | |
%inserted_slice = tensor.insert_slice %83 into %arg6[0, 0, %82#0, %82#1, %82#2, %82#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8> | |
scf.yield %inserted_slice : tensor<1x1x4x2x4x16x2x8xi8> | |
} {unroll_loop} | |
iree_gpu.yield %80 : tensor<1x1x4x2x4x16x2x8xi8> | |
} : tensor<1x1x4x2x4x16x2x8xi8> | |
%77 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0) | |
%78:2 = affine.delinearize_index %77 into (%c4, %c16) : index, index | |
%extracted_slice_0 = tensor.extract_slice %75[0, 0, 0, %78#0, %78#1, 0, 0] [1, 1, 8, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x8x1x1x2x8xi8> | |
%extracted_slice_1 = tensor.extract_slice %76[0, 0, %73#0, 0, %73#1, %73#2, 0, 0] [1, 1, 1, 2, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x2x1x1x2x8xi8> | |
%79 = iree_gpu.multi_mma %extracted_slice_0, %extracted_slice_1, %arg3 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<1x1x8x1x1x2x8xi8>, tensor<1x1x1x2x1x1x2x8xi8> into tensor<1x1x8x1x2x1x1x4xi32> | |
scf.yield %79 : tensor<1x1x8x1x2x1x1x4xi32> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %74 into %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32> | |
} | |
} {mapping = [#gpu.thread<linear_dim_0>]} | |
flow.dispatch.tensor.store %71, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} { | |
%c2 = arith.constant 2 : index | |
%c256 = arith.constant 256 : index | |
%c512 = arith.constant 512 : index | |
%c8 = arith.constant 8 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%c32_i64 = arith.constant 32 : i64 | |
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32 | |
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} | |
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} | |
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8> | |
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8> | |
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32> | |
%69 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x8x4x16x2x8xi8> | |
%70 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x4x2x4x16x2x8xi8> | |
%71 = scf.forall (%arg0) in (256) shared_outs(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) { | |
%72 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0) | |
%73:3 = affine.delinearize_index %72 into (%c4, %c4, %c16) : index, index, index | |
%extracted_slice = tensor.extract_slice %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32> | |
%74 = scf.for %arg2 = %c0 to %42 step %c1 iter_args(%arg3 = %extracted_slice) -> (tensor<1x1x8x1x2x1x1x4xi32>) { | |
%75 = iree_gpu.barrier_region ins(%69 : tensor<1x1x8x4x16x2x8xi8>) { | |
^bb0(%arg4: tensor<1x1x8x4x16x2x8xi8>): | |
%80 = scf.for %arg5 = %c0 to %c512 step %c256 iter_args(%arg6 = %arg4) -> (tensor<1x1x8x4x16x2x8xi8>) { | |
%81 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg5, %arg0) | |
%82:3 = affine.delinearize_index %81 into (%c8, %c4, %c16) : index, index, index | |
%extracted_slice_2 = tensor.extract_slice %66[0, %arg2, %82#0, %82#1, %82#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8> | |
%extracted_slice_3 = tensor.extract_slice %arg6[0, 0, %82#0, %82#1, %82#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8> | |
%83 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<1x1x1x1x1x2x8xi8>) outs(%extracted_slice_3 : tensor<1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x2x8xi8> | |
%inserted_slice = tensor.insert_slice %83 into %arg6[0, 0, %82#0, %82#1, %82#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8> | |
scf.yield %inserted_slice : tensor<1x1x8x4x16x2x8xi8> | |
} {unroll_loop} | |
iree_gpu.yield %80 : tensor<1x1x8x4x16x2x8xi8> | |
} : tensor<1x1x8x4x16x2x8xi8> | |
%76 = iree_gpu.barrier_region ins(%70 : tensor<1x1x4x2x4x16x2x8xi8>) { | |
^bb0(%arg4: tensor<1x1x4x2x4x16x2x8xi8>): | |
%80 = scf.for %arg5 = %c0 to %c512 step %c256 iter_args(%arg6 = %arg4) -> (tensor<1x1x4x2x4x16x2x8xi8>) { | |
%81 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg5, %arg0) | |
%82:4 = affine.delinearize_index %81 into (%c4, %c2, %c4, %c16) : index, index, index, index | |
%extracted_slice_2 = tensor.extract_slice %67[0, %arg2, %82#0, %82#1, %82#2, %82#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8> | |
%extracted_slice_3 = tensor.extract_slice %arg6[0, 0, %82#0, %82#1, %82#2, %82#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8> | |
%83 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<1x1x1x1x1x1x2x8xi8>) outs(%extracted_slice_3 : tensor<1x1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x1x2x8xi8> | |
%inserted_slice = tensor.insert_slice %83 into %arg6[0, 0, %82#0, %82#1, %82#2, %82#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8> | |
scf.yield %inserted_slice : tensor<1x1x4x2x4x16x2x8xi8> | |
} {unroll_loop} | |
iree_gpu.yield %80 : tensor<1x1x4x2x4x16x2x8xi8> | |
} : tensor<1x1x4x2x4x16x2x8xi8> | |
%77 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0) | |
%78:2 = affine.delinearize_index %77 into (%c4, %c16) : index, index | |
%extracted_slice_0 = tensor.extract_slice %75[0, 0, 0, %78#0, %78#1, 0, 0] [1, 1, 8, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x8x1x1x2x8xi8> | |
%extracted_slice_1 = tensor.extract_slice %76[0, 0, %73#0, 0, %73#1, %73#2, 0, 0] [1, 1, 1, 2, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x2x1x1x2x8xi8> | |
%79 = iree_gpu.multi_mma %extracted_slice_0, %extracted_slice_1, %arg3 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<1x1x8x1x1x2x8xi8>, tensor<1x1x1x2x1x1x2x8xi8> into tensor<1x1x8x1x2x1x1x4xi32> | |
scf.yield %79 : tensor<1x1x8x1x2x1x1x4xi32> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %74 into %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32> | |
} | |
} {mapping = [#gpu.thread<linear_dim_0>]} | |
flow.dispatch.tensor.store %71, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
return | |
} | |
// -----// IR Dump After LoopInvariantCodeMotion (loop-invariant-code-motion) //----- // | |
// Dispatch function @foo_dispatch_6 as printed after loop-invariant code motion.
// translation_info selects the LLVMGPUTileAndFuse pipeline with workgroup_size = [256, 1, 1]
// and subgroup_size = 64. The body loads two i8 operand tensors and an i32 accumulator tile,
// runs a loop of iree_gpu.multi_mma ops per thread, and stores the accumulator tile back.
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} { | |
// Index / integer constants used by the delinearizations, loop bounds and shifts below.
%c2 = arith.constant 2 : index | |
%c256 = arith.constant 256 : index | |
%c512 = arith.constant 512 : index | |
%c8 = arith.constant 8 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%c32_i64 = arith.constant 32 : i64 | |
// Load the 18 i32 interface constants (ordinals 0..17) declared by the pipeline layout.
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32 | |
// Reassemble nine 64-bit values from consecutive i32 pairs: the even ordinal is zero-extended
// as the low half, the odd ordinal is zero-extended and shifted left by 32 as the high half,
// the two are OR-ed and cast to index.
//   %22 (ordinals 0,1), %27 (2,3), %32 (4,5): byte offsets of the three binding subspans.
//   %37 (6,7), %42 (8,9): dynamic dims of the first i8 operand.
//   %47 (10,11), %52 (12,13): dynamic dims of the second i8 operand.
//   %57 (14,15), %62 (16,17): dynamic dims of the i32 result.
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
// Binding subspans. %63 and %64 are both views of binding(0) (read-only) at different offsets;
// %65 is the read-write i32 result on binding(1).
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} | |
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} | |
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
// Per-workgroup loads: the first operand is sliced at workgroup_id_y, the second at
// workgroup_id_x, and the accumulator tile at (workgroup_id_y, workgroup_id_x).
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8> | |
// NOTE(review): the dim-1 size below is %42 (second dynamic dim of the first operand), not %52
// from this binding's own dims — presumably both denote the shared reduction extent; confirm.
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8> | |
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32> | |
// Staging tensors for one tile of each operand, tagged with the workgroup address space.
%69 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x8x4x16x2x8xi8> | |
%70 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x4x2x4x16x2x8xi8> | |
// 256-way scf.forall mapped to #gpu.thread<linear_dim_0>; %arg1 is the shared accumulator tile.
%71 = scf.forall (%arg0) in (256) shared_outs(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) { | |
// %73: forall index (mod 256) delinearized over (4, 4, 16); selects this iteration's
// accumulator slice and, below, its slice of the second operand.
%72 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0) | |
%73:3 = affine.delinearize_index %72 into (%c4, %c4, %c16) : index, index, index | |
%extracted_slice = tensor.extract_slice %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32> | |
// %75: forall index (mod 64) delinearized over (4, 16); selects the slice of the first operand.
// These two index computations sit outside the scf.for below.
%74 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0) | |
%75:2 = affine.delinearize_index %74 into (%c4, %c16) : index, index | |
// Loop over dim 1 of the loaded operands (0 .. %42, step 1), carrying the accumulator in %arg3.
%76 = scf.for %arg2 = %c0 to %42 step %c1 iter_args(%arg3 = %extracted_slice) -> (tensor<1x1x8x1x2x1x1x4xi32>) { | |
// First barrier_region: fills %69 with the %arg2-th tile of the first operand.
// NOTE(review): presumably iree_gpu.barrier_region synchronizes the workgroup around the
// write to the workgroup tensor — confirm against the IREE GPU dialect docs.
%77 = iree_gpu.barrier_region ins(%69 : tensor<1x1x8x4x16x2x8xi8>) { | |
^bb0(%arg4: tensor<1x1x8x4x16x2x8xi8>): | |
// Two iterations (%arg5 = 0, 256), marked {unroll_loop}. Each copies one 2x8 i8 slice at
// linear index %arg5 + %arg0, delinearized over (8, 4, 16).
%80 = scf.for %arg5 = %c0 to %c512 step %c256 iter_args(%arg6 = %arg4) -> (tensor<1x1x8x4x16x2x8xi8>) { | |
%81 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg5, %arg0) | |
%82:3 = affine.delinearize_index %81 into (%c8, %c4, %c16) : index, index, index | |
%extracted_slice_2 = tensor.extract_slice %66[0, %arg2, %82#0, %82#1, %82#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8> | |
%extracted_slice_3 = tensor.extract_slice %arg6[0, 0, %82#0, %82#1, %82#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8> | |
%83 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<1x1x1x1x1x2x8xi8>) outs(%extracted_slice_3 : tensor<1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x2x8xi8> | |
%inserted_slice = tensor.insert_slice %83 into %arg6[0, 0, %82#0, %82#1, %82#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8> | |
scf.yield %inserted_slice : tensor<1x1x8x4x16x2x8xi8> | |
} {unroll_loop} | |
iree_gpu.yield %80 : tensor<1x1x8x4x16x2x8xi8> | |
} : tensor<1x1x8x4x16x2x8xi8> | |
// Second barrier_region: same copy pattern for the second operand into %70, with the linear
// index delinearized over (4, 2, 4, 16).
%78 = iree_gpu.barrier_region ins(%70 : tensor<1x1x4x2x4x16x2x8xi8>) { | |
^bb0(%arg4: tensor<1x1x4x2x4x16x2x8xi8>): | |
%80 = scf.for %arg5 = %c0 to %c512 step %c256 iter_args(%arg6 = %arg4) -> (tensor<1x1x4x2x4x16x2x8xi8>) { | |
%81 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg5, %arg0) | |
%82:4 = affine.delinearize_index %81 into (%c4, %c2, %c4, %c16) : index, index, index, index | |
%extracted_slice_2 = tensor.extract_slice %67[0, %arg2, %82#0, %82#1, %82#2, %82#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8> | |
%extracted_slice_3 = tensor.extract_slice %arg6[0, 0, %82#0, %82#1, %82#2, %82#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8> | |
%83 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<1x1x1x1x1x1x2x8xi8>) outs(%extracted_slice_3 : tensor<1x1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x1x2x8xi8> | |
%inserted_slice = tensor.insert_slice %83 into %arg6[0, 0, %82#0, %82#1, %82#2, %82#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8> | |
scf.yield %inserted_slice : tensor<1x1x4x2x4x16x2x8xi8> | |
} {unroll_loop} | |
iree_gpu.yield %80 : tensor<1x1x4x2x4x16x2x8xi8> | |
} : tensor<1x1x4x2x4x16x2x8xi8> | |
// Slice the staged tiles for this forall iteration and accumulate into %arg3 with a
// data-tiled multi_mma (intrinsic MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2,
// unroll_n_to_subgroups = 4, unroll_k = 2).
%extracted_slice_0 = tensor.extract_slice %77[0, 0, 0, %75#0, %75#1, 0, 0] [1, 1, 8, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x8x1x1x2x8xi8> | |
%extracted_slice_1 = tensor.extract_slice %78[0, 0, %73#0, 0, %73#1, %73#2, 0, 0] [1, 1, 1, 2, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x2x1x1x2x8xi8> | |
%79 = iree_gpu.multi_mma %extracted_slice_0, %extracted_slice_1, %arg3 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<1x1x8x1x1x2x8xi8>, tensor<1x1x1x2x1x1x2x8xi8> into tensor<1x1x8x1x2x1x1x4xi32> | |
scf.yield %79 : tensor<1x1x8x1x2x1x1x4xi32> | |
} | |
// Write this iteration's final accumulator slice back into the shared output tile.
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %76 into %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32> | |
} | |
} {mapping = [#gpu.thread<linear_dim_0>]} | |
// Store the completed accumulator tile at (workgroup_id_y, workgroup_id_x) of the result.
flow.dispatch.tensor.store %71, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
return | |
} | |
// -----// IR Dump After CombineBarrierRegionsPass (iree-gpu-combine-barrier-regions) //----- // | |
// Dispatch function @foo_dispatch_6 as printed after iree-gpu-combine-barrier-regions.
// translation_info selects the LLVMGPUTileAndFuse pipeline with workgroup_size = [256, 1, 1]
// and subgroup_size = 64. In this version both operand-staging copy loops live inside a single
// two-operand iree_gpu.barrier_region (%77:2) that feeds the multi_mma.
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} { | |
// Index / integer constants used by the delinearizations, loop bounds and shifts below.
%c2 = arith.constant 2 : index | |
%c256 = arith.constant 256 : index | |
%c512 = arith.constant 512 : index | |
%c8 = arith.constant 8 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%c32_i64 = arith.constant 32 : i64 | |
// Load the 18 i32 interface constants (ordinals 0..17) declared by the pipeline layout.
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32 | |
// Reassemble nine 64-bit values from consecutive i32 pairs: the even ordinal is zero-extended
// as the low half, the odd ordinal is zero-extended and shifted left by 32 as the high half,
// the two are OR-ed and cast to index.
//   %22 (ordinals 0,1), %27 (2,3), %32 (4,5): byte offsets of the three binding subspans.
//   %37 (6,7), %42 (8,9): dynamic dims of the first i8 operand.
//   %47 (10,11), %52 (12,13): dynamic dims of the second i8 operand.
//   %57 (14,15), %62 (16,17): dynamic dims of the i32 result.
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
// Binding subspans. %63 and %64 are both views of binding(0) (read-only) at different offsets;
// %65 is the read-write i32 result on binding(1).
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} | |
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} | |
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
// Per-workgroup loads: the first operand is sliced at workgroup_id_y, the second at
// workgroup_id_x, and the accumulator tile at (workgroup_id_y, workgroup_id_x).
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8> | |
// NOTE(review): the dim-1 size below is %42 (second dynamic dim of the first operand), not %52
// from this binding's own dims — presumably both denote the shared reduction extent; confirm.
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8> | |
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32> | |
// Staging tensors for one tile of each operand, tagged with the workgroup address space.
%69 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x8x4x16x2x8xi8> | |
%70 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x4x2x4x16x2x8xi8> | |
// 256-way scf.forall mapped to #gpu.thread<linear_dim_0>; %arg1 is the shared accumulator tile.
%71 = scf.forall (%arg0) in (256) shared_outs(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) { | |
// %73: forall index (mod 256) delinearized over (4, 4, 16); selects this iteration's
// accumulator slice and, below, its slice of the second operand.
%72 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0) | |
%73:3 = affine.delinearize_index %72 into (%c4, %c4, %c16) : index, index, index | |
%extracted_slice = tensor.extract_slice %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32> | |
// %75: forall index (mod 64) delinearized over (4, 16); selects the slice of the first operand.
%74 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0) | |
%75:2 = affine.delinearize_index %74 into (%c4, %c16) : index, index | |
// Loop over dim 1 of the loaded operands (0 .. %42, step 1), carrying the accumulator in %arg3.
%76 = scf.for %arg2 = %c0 to %42 step %c1 iter_args(%arg3 = %extracted_slice) -> (tensor<1x1x8x1x2x1x1x4xi32>) { | |
// Single barrier_region over both staging tensors; yields the filled first-operand tile as
// %77#0 and the filled second-operand tile as %77#1.
// NOTE(review): presumably iree_gpu.barrier_region synchronizes the workgroup around the
// writes to the workgroup tensors — confirm against the IREE GPU dialect docs.
%77:2 = iree_gpu.barrier_region ins(%69, %70 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>) { | |
^bb0(%arg4: tensor<1x1x8x4x16x2x8xi8>, %arg5: tensor<1x1x4x2x4x16x2x8xi8>): | |
// First-operand copy: two iterations (%arg6 = 0, 256), marked {unroll_loop}. Each copies one
// 2x8 i8 slice at linear index %arg6 + %arg0, delinearized over (8, 4, 16).
%79 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg4) -> (tensor<1x1x8x4x16x2x8xi8>) { | |
%81 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0) | |
%82:3 = affine.delinearize_index %81 into (%c8, %c4, %c16) : index, index, index | |
%extracted_slice_2 = tensor.extract_slice %66[0, %arg2, %82#0, %82#1, %82#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8> | |
%extracted_slice_3 = tensor.extract_slice %arg7[0, 0, %82#0, %82#1, %82#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8> | |
%83 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<1x1x1x1x1x2x8xi8>) outs(%extracted_slice_3 : tensor<1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x2x8xi8> | |
%inserted_slice = tensor.insert_slice %83 into %arg7[0, 0, %82#0, %82#1, %82#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8> | |
scf.yield %inserted_slice : tensor<1x1x8x4x16x2x8xi8> | |
} {unroll_loop} | |
// Second-operand copy: same pattern, with the linear index delinearized over (4, 2, 4, 16).
%80 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg5) -> (tensor<1x1x4x2x4x16x2x8xi8>) { | |
%81 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0) | |
%82:4 = affine.delinearize_index %81 into (%c4, %c2, %c4, %c16) : index, index, index, index | |
%extracted_slice_2 = tensor.extract_slice %67[0, %arg2, %82#0, %82#1, %82#2, %82#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8> | |
%extracted_slice_3 = tensor.extract_slice %arg7[0, 0, %82#0, %82#1, %82#2, %82#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8> | |
%83 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<1x1x1x1x1x1x2x8xi8>) outs(%extracted_slice_3 : tensor<1x1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x1x2x8xi8> | |
%inserted_slice = tensor.insert_slice %83 into %arg7[0, 0, %82#0, %82#1, %82#2, %82#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8> | |
scf.yield %inserted_slice : tensor<1x1x4x2x4x16x2x8xi8> | |
} {unroll_loop} | |
iree_gpu.yield %79, %80 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8> | |
} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8> | |
// Slice the staged tiles for this forall iteration and accumulate into %arg3 with a
// data-tiled multi_mma (intrinsic MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2,
// unroll_n_to_subgroups = 4, unroll_k = 2).
%extracted_slice_0 = tensor.extract_slice %77#0[0, 0, 0, %75#0, %75#1, 0, 0] [1, 1, 8, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x8x1x1x2x8xi8> | |
%extracted_slice_1 = tensor.extract_slice %77#1[0, 0, %73#0, 0, %73#1, %73#2, 0, 0] [1, 1, 1, 2, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x2x1x1x2x8xi8> | |
%78 = iree_gpu.multi_mma %extracted_slice_0, %extracted_slice_1, %arg3 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<1x1x8x1x1x2x8xi8>, tensor<1x1x1x2x1x1x2x8xi8> into tensor<1x1x8x1x2x1x1x4xi32> | |
scf.yield %78 : tensor<1x1x8x1x2x1x1x4xi32> | |
} | |
// Write this iteration's final accumulator slice back into the shared output tile.
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %76 into %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32> | |
} | |
} {mapping = [#gpu.thread<linear_dim_0>]} | |
// Store the completed accumulator tile at (workgroup_id_y, workgroup_id_x) of the result.
flow.dispatch.tensor.store %71, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
return | |
} | |
// -----// IR Dump After VectorizeIREEGPUOpsPass (iree-gpu-vectorize-ops) //----- // | |
// @foo_dispatch_6 as printed after VectorizeIREEGPUOpsPass.
// Purpose: an i8 x i8 -> i32 data-tiled multiply-accumulate. Each workgroup
// (workgroup_size = [256, 1, 1], subgroup_size = 64 per translation_info) loads
// one 1x1x8x4x2x4x16x4 i32 tile from the read-write binding, accumulates into
// it over a dynamic trip count, and stores it back to the same location.
// In this dump the iree_gpu.multi_mma operands are vectors, bracketed by
// vector.transfer_read / vector.transfer_write on the per-thread tensor slices.
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} { | |
// Zero padding values for the transfer_reads, followed by the index constants
// used as loop bounds/steps and as delinearization bases.
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%c2 = arith.constant 2 : index | |
%c256 = arith.constant 256 : index | |
%c512 = arith.constant 512 : index | |
%c8 = arith.constant 8 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
// Shift amount used to place the high i32 half when rebuilding 64-bit values.
%c32_i64 = arith.constant 32 : i64 | |
// Eighteen i32 push constants (ordinals 0..17). They are consumed below in
// consecutive (low, high) pairs.
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32 | |
// Every group of five ops below computes zext(low) | (zext(high) << 32) and
// casts the i64 result to index.
// %22 (ordinals 0,1): offset operand of subspan %63.
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
// %27 (ordinals 2,3): offset operand of subspan %64.
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
// %32 (ordinals 4,5): offset operand of subspan %65.
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
// %37 (ordinals 6,7): first dynamic dimension of %63.
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
// %42 (ordinals 8,9): second dynamic dimension of %63; also the upper bound
// of the accumulation loop and the dim-1 size of both operand loads.
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
// %47 (ordinals 10,11): first dynamic dimension of %64.
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
// %52 (ordinals 12,13): second dynamic dimension of %64.
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
// %57 (ordinals 14,15): first dynamic dimension of %65.
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
// %62 (ordinals 16,17): second dynamic dimension of %65.
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
// Buffer views. %63 and %64 are both read-only views of binding(0), at offsets
// %22 and %27 respectively; %65 is the read-write view of binding(1).
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} | |
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} | |
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
// Per-workgroup operand loads: row workgroup_id_y of %63 (LHS operand of the
// multi_mma below) and row workgroup_id_x of %64 (RHS operand), each spanning
// %42 entries along dimension 1.
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8> | |
// NOTE(review): the dim-1 size here is %42 although %64 declares that dimension
// as %52 — presumably both hold the same reduction extent; confirm upstream.
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8> | |
// Accumulator tile at (workgroup_id_y, workgroup_id_x); the store at the end of
// the function writes the result back with the same offsets and sizes.
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32> | |
// Staging tensors tagged with the GPU workgroup address space, one per operand.
%69 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x8x4x16x2x8xi8> | |
%70 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x4x2x4x16x2x8xi8> | |
// 256-way parallel loop mapped to #gpu.thread<linear_dim_0>; %arg1 is the shared
// accumulator tile that each iteration updates a disjoint slice of.
%71 = scf.forall (%arg0) in (256) shared_outs(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) { | |
// Thread id -> (4, 4, 16) coordinates selecting this thread's accumulator slice
// (8x2x4 i32 values) and, later, its RHS slice.
%72 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0) | |
%73:3 = affine.delinearize_index %72 into (%c4, %c4, %c16) : index, index, index | |
%extracted_slice = tensor.extract_slice %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32> | |
// Thread id mod 64 -> (4, 16) coordinates selecting this thread's LHS slice.
%74 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0) | |
%75:2 = affine.delinearize_index %74 into (%c4, %c16) : index, index | |
// Accumulation loop: %arg2 walks dimension 1 of both operands, carrying the
// per-thread accumulator slice in %arg3.
%76 = scf.for %arg2 = %c0 to %42 step %c1 iter_args(%arg3 = %extracted_slice) -> (tensor<1x1x8x1x2x1x1x4xi32>) { | |
// Fill both staging tensors for step %arg2.
// NOTE(review): presumably iree_gpu.barrier_region synchronizes the workgroup
// around these writes; its semantics are not visible in this dump.
%77:2 = iree_gpu.barrier_region ins(%69, %70 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>) { | |
^bb0(%arg4: tensor<1x1x8x4x16x2x8xi8>, %arg5: tensor<1x1x4x2x4x16x2x8xi8>): | |
// LHS staging: %arg6 takes the values 0 and 256, so %arg6 + %arg0 covers
// 0..511 across the 256 threads, i.e. all 8*4*16 slices of 2x8 i8 each.
%83 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg4) -> (tensor<1x1x8x4x16x2x8xi8>) { | |
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0) | |
%86:3 = affine.delinearize_index %85 into (%c8, %c4, %c16) : index, index, index | |
%extracted_slice_2 = tensor.extract_slice %66[0, %arg2, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8> | |
%extracted_slice_3 = tensor.extract_slice %arg7[0, 0, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8> | |
%87 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<1x1x1x1x1x2x8xi8>) outs(%extracted_slice_3 : tensor<1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x2x8xi8> | |
%inserted_slice = tensor.insert_slice %87 into %arg7[0, 0, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8> | |
scf.yield %inserted_slice : tensor<1x1x8x4x16x2x8xi8> | |
} {unroll_loop} | |
// RHS staging: same scheme, with the linear index delinearized over
// (4, 2, 4, 16) = 512 slices of 2x8 i8 each.
%84 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg5) -> (tensor<1x1x4x2x4x16x2x8xi8>) { | |
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0) | |
%86:4 = affine.delinearize_index %85 into (%c4, %c2, %c4, %c16) : index, index, index, index | |
%extracted_slice_2 = tensor.extract_slice %67[0, %arg2, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8> | |
%extracted_slice_3 = tensor.extract_slice %arg7[0, 0, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8> | |
%87 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<1x1x1x1x1x1x2x8xi8>) outs(%extracted_slice_3 : tensor<1x1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x1x2x8xi8> | |
%inserted_slice = tensor.insert_slice %87 into %arg7[0, 0, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8> | |
scf.yield %inserted_slice : tensor<1x1x4x2x4x16x2x8xi8> | |
} {unroll_loop} | |
iree_gpu.yield %83, %84 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8> | |
} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8> | |
// Per-thread operand slices taken from the staged tensors: 8x2x8 i8 values of
// the LHS (indexed by %75) and 2x2x8 i8 values of the RHS (indexed by %73).
%extracted_slice_0 = tensor.extract_slice %77#0[0, 0, 0, %75#0, %75#1, 0, 0] [1, 1, 8, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x8x1x1x2x8xi8> | |
%extracted_slice_1 = tensor.extract_slice %77#1[0, 0, %73#0, 0, %73#1, %73#2, 0, 0] [1, 1, 1, 2, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x2x1x1x2x8xi8> | |
// Read the whole of each slice into a vector (all dimensions in_bounds, so the
// zero padding value is never actually used), run the data-tiled MMA on the
// vectors, and write the result back over the loop-carried accumulator tensor.
%78 = vector.transfer_read %extracted_slice_0[%c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : tensor<1x1x8x1x1x2x8xi8>, vector<1x1x8x1x1x2x8xi8> | |
%79 = vector.transfer_read %extracted_slice_1[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x1x2x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> | |
%80 = vector.transfer_read %arg3[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x8x1x2x1x1x4xi32>, vector<1x1x8x1x2x1x1x4xi32> | |
// Indexing maps are (d0,d2), (d1,d2), (d0,d1) with d2 the reduction iterator;
// the layout attribute selects MFMA_I32_16x16x32_I8 with unroll_m = 8,
// unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2.
%81 = iree_gpu.multi_mma %78, %79, %80 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32> | |
%82 = vector.transfer_write %81, %arg3[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, tensor<1x1x8x1x2x1x1x4xi32> | |
scf.yield %82 : tensor<1x1x8x1x2x1x1x4xi32> | |
} | |
// Publish this thread's final accumulator slice at the same offsets it was
// extracted from.
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %76 into %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32> | |
} | |
} {mapping = [#gpu.thread<linear_dim_0>]} | |
// Write the updated tile back to the read-write binding.
flow.dispatch.tensor.store %71, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
return | |
} | |
// -----// IR Dump After DecomposeConvolutionToLowerDimOpsPass (iree-codegen-decompose-convolution-to-lower-dim-ops) //----- // | |
// @foo_dispatch_6 as printed after DecomposeConvolutionToLowerDimOpsPass.
// The function contains no convolution ops, and its body appears textually
// unchanged from the dump printed immediately before this one.
// Purpose: an i8 x i8 -> i32 data-tiled multiply-accumulate. Each workgroup
// (workgroup_size = [256, 1, 1], subgroup_size = 64 per translation_info) loads
// one 1x1x8x4x2x4x16x4 i32 tile from the read-write binding, accumulates into
// it over a dynamic trip count, and stores it back to the same location.
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} { | |
// Zero padding values for the transfer_reads, followed by the index constants
// used as loop bounds/steps and as delinearization bases.
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%c2 = arith.constant 2 : index | |
%c256 = arith.constant 256 : index | |
%c512 = arith.constant 512 : index | |
%c8 = arith.constant 8 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
// Shift amount used to place the high i32 half when rebuilding 64-bit values.
%c32_i64 = arith.constant 32 : i64 | |
// Eighteen i32 push constants (ordinals 0..17). They are consumed below in
// consecutive (low, high) pairs.
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32 | |
// Every group of five ops below computes zext(low) | (zext(high) << 32) and
// casts the i64 result to index.
// %22 (ordinals 0,1): offset operand of subspan %63.
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
// %27 (ordinals 2,3): offset operand of subspan %64.
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
// %32 (ordinals 4,5): offset operand of subspan %65.
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
// %37 (ordinals 6,7): first dynamic dimension of %63.
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
// %42 (ordinals 8,9): second dynamic dimension of %63; also the upper bound
// of the accumulation loop and the dim-1 size of both operand loads.
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
// %47 (ordinals 10,11): first dynamic dimension of %64.
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
// %52 (ordinals 12,13): second dynamic dimension of %64.
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
// %57 (ordinals 14,15): first dynamic dimension of %65.
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
// %62 (ordinals 16,17): second dynamic dimension of %65.
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
// Buffer views. %63 and %64 are both read-only views of binding(0), at offsets
// %22 and %27 respectively; %65 is the read-write view of binding(1).
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} | |
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} | |
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
// Per-workgroup operand loads: row workgroup_id_y of %63 (LHS operand of the
// multi_mma below) and row workgroup_id_x of %64 (RHS operand), each spanning
// %42 entries along dimension 1.
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8> | |
// NOTE(review): the dim-1 size here is %42 although %64 declares that dimension
// as %52 — presumably both hold the same reduction extent; confirm upstream.
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8> | |
// Accumulator tile at (workgroup_id_y, workgroup_id_x); the store at the end of
// the function writes the result back with the same offsets and sizes.
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32> | |
// Staging tensors tagged with the GPU workgroup address space, one per operand.
%69 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x8x4x16x2x8xi8> | |
%70 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x4x2x4x16x2x8xi8> | |
// 256-way parallel loop mapped to #gpu.thread<linear_dim_0>; %arg1 is the shared
// accumulator tile that each iteration updates a disjoint slice of.
%71 = scf.forall (%arg0) in (256) shared_outs(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) { | |
// Thread id -> (4, 4, 16) coordinates selecting this thread's accumulator slice
// (8x2x4 i32 values) and, later, its RHS slice.
%72 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0) | |
%73:3 = affine.delinearize_index %72 into (%c4, %c4, %c16) : index, index, index | |
%extracted_slice = tensor.extract_slice %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32> | |
// Thread id mod 64 -> (4, 16) coordinates selecting this thread's LHS slice.
%74 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0) | |
%75:2 = affine.delinearize_index %74 into (%c4, %c16) : index, index | |
// Accumulation loop: %arg2 walks dimension 1 of both operands, carrying the
// per-thread accumulator slice in %arg3.
%76 = scf.for %arg2 = %c0 to %42 step %c1 iter_args(%arg3 = %extracted_slice) -> (tensor<1x1x8x1x2x1x1x4xi32>) { | |
// Fill both staging tensors for step %arg2.
// NOTE(review): presumably iree_gpu.barrier_region synchronizes the workgroup
// around these writes; its semantics are not visible in this dump.
%77:2 = iree_gpu.barrier_region ins(%69, %70 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>) { | |
^bb0(%arg4: tensor<1x1x8x4x16x2x8xi8>, %arg5: tensor<1x1x4x2x4x16x2x8xi8>): | |
// LHS staging: %arg6 takes the values 0 and 256, so %arg6 + %arg0 covers
// 0..511 across the 256 threads, i.e. all 8*4*16 slices of 2x8 i8 each.
%83 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg4) -> (tensor<1x1x8x4x16x2x8xi8>) { | |
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0) | |
%86:3 = affine.delinearize_index %85 into (%c8, %c4, %c16) : index, index, index | |
%extracted_slice_2 = tensor.extract_slice %66[0, %arg2, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8> | |
%extracted_slice_3 = tensor.extract_slice %arg7[0, 0, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8> | |
%87 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<1x1x1x1x1x2x8xi8>) outs(%extracted_slice_3 : tensor<1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x2x8xi8> | |
%inserted_slice = tensor.insert_slice %87 into %arg7[0, 0, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8> | |
scf.yield %inserted_slice : tensor<1x1x8x4x16x2x8xi8> | |
} {unroll_loop} | |
// RHS staging: same scheme, with the linear index delinearized over
// (4, 2, 4, 16) = 512 slices of 2x8 i8 each.
%84 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg5) -> (tensor<1x1x4x2x4x16x2x8xi8>) { | |
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0) | |
%86:4 = affine.delinearize_index %85 into (%c4, %c2, %c4, %c16) : index, index, index, index | |
%extracted_slice_2 = tensor.extract_slice %67[0, %arg2, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8> | |
%extracted_slice_3 = tensor.extract_slice %arg7[0, 0, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8> | |
%87 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<1x1x1x1x1x1x2x8xi8>) outs(%extracted_slice_3 : tensor<1x1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x1x2x8xi8> | |
%inserted_slice = tensor.insert_slice %87 into %arg7[0, 0, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8> | |
scf.yield %inserted_slice : tensor<1x1x4x2x4x16x2x8xi8> | |
} {unroll_loop} | |
iree_gpu.yield %83, %84 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8> | |
} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8> | |
// Per-thread operand slices taken from the staged tensors: 8x2x8 i8 values of
// the LHS (indexed by %75) and 2x2x8 i8 values of the RHS (indexed by %73).
%extracted_slice_0 = tensor.extract_slice %77#0[0, 0, 0, %75#0, %75#1, 0, 0] [1, 1, 8, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x8x1x1x2x8xi8> | |
%extracted_slice_1 = tensor.extract_slice %77#1[0, 0, %73#0, 0, %73#1, %73#2, 0, 0] [1, 1, 1, 2, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x2x1x1x2x8xi8> | |
// Read the whole of each slice into a vector (all dimensions in_bounds, so the
// zero padding value is never actually used), run the data-tiled MMA on the
// vectors, and write the result back over the loop-carried accumulator tensor.
%78 = vector.transfer_read %extracted_slice_0[%c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : tensor<1x1x8x1x1x2x8xi8>, vector<1x1x8x1x1x2x8xi8> | |
%79 = vector.transfer_read %extracted_slice_1[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x1x2x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> | |
%80 = vector.transfer_read %arg3[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x8x1x2x1x1x4xi32>, vector<1x1x8x1x2x1x1x4xi32> | |
// Indexing maps are (d0,d2), (d1,d2), (d0,d1) with d2 the reduction iterator;
// the layout attribute selects MFMA_I32_16x16x32_I8 with unroll_m = 8,
// unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2.
%81 = iree_gpu.multi_mma %78, %79, %80 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32> | |
%82 = vector.transfer_write %81, %arg3[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, tensor<1x1x8x1x2x1x1x4xi32> | |
scf.yield %82 : tensor<1x1x8x1x2x1x1x4xi32> | |
} | |
// Publish this thread's final accumulator slice at the same offsets it was
// extracted from.
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %76 into %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32> | |
} | |
} {mapping = [#gpu.thread<linear_dim_0>]} | |
// Write the updated tile back to the read-write binding.
flow.dispatch.tensor.store %71, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
return | |
} | |
// -----// IR Dump After DecomposeIm2colPass (iree-linalg-ext-decompose-im2col) //----- // | |
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%c2 = arith.constant 2 : index | |
%c256 = arith.constant 256 : index | |
%c512 = arith.constant 512 : index | |
%c8 = arith.constant 8 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%c32_i64 = arith.constant 32 : i64 | |
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32 | |
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} | |
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} | |
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8> | |
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8> | |
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32> | |
%69 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x8x4x16x2x8xi8> | |
%70 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x4x2x4x16x2x8xi8> | |
%71 = scf.forall (%arg0) in (256) shared_outs(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) { | |
%72 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0) | |
%73:3 = affine.delinearize_index %72 into (%c4, %c4, %c16) : index, index, index | |
%extracted_slice = tensor.extract_slice %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32> | |
%74 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0) | |
%75:2 = affine.delinearize_index %74 into (%c4, %c16) : index, index | |
%76 = scf.for %arg2 = %c0 to %42 step %c1 iter_args(%arg3 = %extracted_slice) -> (tensor<1x1x8x1x2x1x1x4xi32>) { | |
%77:2 = iree_gpu.barrier_region ins(%69, %70 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>) { | |
^bb0(%arg4: tensor<1x1x8x4x16x2x8xi8>, %arg5: tensor<1x1x4x2x4x16x2x8xi8>): | |
%83 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg4) -> (tensor<1x1x8x4x16x2x8xi8>) { | |
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0) | |
%86:3 = affine.delinearize_index %85 into (%c8, %c4, %c16) : index, index, index | |
%extracted_slice_2 = tensor.extract_slice %66[0, %arg2, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8> | |
%extracted_slice_3 = tensor.extract_slice %arg7[0, 0, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8> | |
%87 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<1x1x1x1x1x2x8xi8>) outs(%extracted_slice_3 : tensor<1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x2x8xi8> | |
%inserted_slice = tensor.insert_slice %87 into %arg7[0, 0, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8> | |
scf.yield %inserted_slice : tensor<1x1x8x4x16x2x8xi8> | |
} {unroll_loop} | |
%84 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg5) -> (tensor<1x1x4x2x4x16x2x8xi8>) { | |
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0) | |
%86:4 = affine.delinearize_index %85 into (%c4, %c2, %c4, %c16) : index, index, index, index | |
%extracted_slice_2 = tensor.extract_slice %67[0, %arg2, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8> | |
%extracted_slice_3 = tensor.extract_slice %arg7[0, 0, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8> | |
%87 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<1x1x1x1x1x1x2x8xi8>) outs(%extracted_slice_3 : tensor<1x1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x1x2x8xi8> | |
%inserted_slice = tensor.insert_slice %87 into %arg7[0, 0, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8> | |
scf.yield %inserted_slice : tensor<1x1x4x2x4x16x2x8xi8> | |
} {unroll_loop} | |
iree_gpu.yield %83, %84 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8> | |
} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8> | |
%extracted_slice_0 = tensor.extract_slice %77#0[0, 0, 0, %75#0, %75#1, 0, 0] [1, 1, 8, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x8x1x1x2x8xi8> | |
%extracted_slice_1 = tensor.extract_slice %77#1[0, 0, %73#0, 0, %73#1, %73#2, 0, 0] [1, 1, 1, 2, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x2x1x1x2x8xi8> | |
%78 = vector.transfer_read %extracted_slice_0[%c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : tensor<1x1x8x1x1x2x8xi8>, vector<1x1x8x1x1x2x8xi8> | |
%79 = vector.transfer_read %extracted_slice_1[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x1x2x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> | |
%80 = vector.transfer_read %arg3[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x8x1x2x1x1x4xi32>, vector<1x1x8x1x2x1x1x4xi32> | |
%81 = iree_gpu.multi_mma %78, %79, %80 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32> | |
%82 = vector.transfer_write %81, %arg3[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, tensor<1x1x8x1x2x1x1x4xi32> | |
scf.yield %82 : tensor<1x1x8x1x2x1x1x4xi32> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %76 into %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32> | |
} | |
} {mapping = [#gpu.thread<linear_dim_0>]} | |
flow.dispatch.tensor.store %71, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
return | |
} | |
// -----// IR Dump After VectorizeIREEVectorExtOpsPass (iree-vector-ext-vectorize-ops) //----- // | |
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%c2 = arith.constant 2 : index | |
%c256 = arith.constant 256 : index | |
%c512 = arith.constant 512 : index | |
%c8 = arith.constant 8 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%c32_i64 = arith.constant 32 : i64 | |
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32 | |
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} | |
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} | |
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8> | |
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8> | |
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32> | |
%69 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x8x4x16x2x8xi8> | |
%70 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x4x2x4x16x2x8xi8> | |
%71 = scf.forall (%arg0) in (256) shared_outs(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) { | |
%72 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0) | |
%73:3 = affine.delinearize_index %72 into (%c4, %c4, %c16) : index, index, index | |
%extracted_slice = tensor.extract_slice %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32> | |
%74 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0) | |
%75:2 = affine.delinearize_index %74 into (%c4, %c16) : index, index | |
%76 = scf.for %arg2 = %c0 to %42 step %c1 iter_args(%arg3 = %extracted_slice) -> (tensor<1x1x8x1x2x1x1x4xi32>) { | |
%77:2 = iree_gpu.barrier_region ins(%69, %70 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>) { | |
^bb0(%arg4: tensor<1x1x8x4x16x2x8xi8>, %arg5: tensor<1x1x4x2x4x16x2x8xi8>): | |
%83 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg4) -> (tensor<1x1x8x4x16x2x8xi8>) { | |
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0) | |
%86:3 = affine.delinearize_index %85 into (%c8, %c4, %c16) : index, index, index | |
%extracted_slice_2 = tensor.extract_slice %66[0, %arg2, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8> | |
%extracted_slice_3 = tensor.extract_slice %arg7[0, 0, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8> | |
%87 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<1x1x1x1x1x2x8xi8>) outs(%extracted_slice_3 : tensor<1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x2x8xi8> | |
%inserted_slice = tensor.insert_slice %87 into %arg7[0, 0, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8> | |
scf.yield %inserted_slice : tensor<1x1x8x4x16x2x8xi8> | |
} {unroll_loop} | |
%84 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg5) -> (tensor<1x1x4x2x4x16x2x8xi8>) { | |
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0) | |
%86:4 = affine.delinearize_index %85 into (%c4, %c2, %c4, %c16) : index, index, index, index | |
%extracted_slice_2 = tensor.extract_slice %67[0, %arg2, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8> | |
%extracted_slice_3 = tensor.extract_slice %arg7[0, 0, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8> | |
%87 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%extracted_slice_2 : tensor<1x1x1x1x1x1x2x8xi8>) outs(%extracted_slice_3 : tensor<1x1x1x1x1x1x2x8xi8>) -> tensor<1x1x1x1x1x1x2x8xi8> | |
%inserted_slice = tensor.insert_slice %87 into %arg7[0, 0, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8> | |
scf.yield %inserted_slice : tensor<1x1x4x2x4x16x2x8xi8> | |
} {unroll_loop} | |
iree_gpu.yield %83, %84 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8> | |
} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8> | |
%extracted_slice_0 = tensor.extract_slice %77#0[0, 0, 0, %75#0, %75#1, 0, 0] [1, 1, 8, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x8x1x1x2x8xi8> | |
%extracted_slice_1 = tensor.extract_slice %77#1[0, 0, %73#0, 0, %73#1, %73#2, 0, 0] [1, 1, 1, 2, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x2x1x1x2x8xi8> | |
%78 = vector.transfer_read %extracted_slice_0[%c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : tensor<1x1x8x1x1x2x8xi8>, vector<1x1x8x1x1x2x8xi8> | |
%79 = vector.transfer_read %extracted_slice_1[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x1x2x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> | |
%80 = vector.transfer_read %arg3[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x8x1x2x1x1x4xi32>, vector<1x1x8x1x2x1x1x4xi32> | |
%81 = iree_gpu.multi_mma %78, %79, %80 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32> | |
%82 = vector.transfer_write %81, %arg3[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, tensor<1x1x8x1x2x1x1x4xi32> | |
scf.yield %82 : tensor<1x1x8x1x2x1x1x4xi32> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %76 into %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32> | |
} | |
} {mapping = [#gpu.thread<linear_dim_0>]} | |
flow.dispatch.tensor.store %71, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
return | |
} | |
// -----// IR Dump After GenericVectorizationPass (iree-codegen-generic-vectorization) //----- // | |
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%c2 = arith.constant 2 : index | |
%c256 = arith.constant 256 : index | |
%c512 = arith.constant 512 : index | |
%c8 = arith.constant 8 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%c32_i64 = arith.constant 32 : i64 | |
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32 | |
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} | |
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} | |
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8> | |
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8> | |
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32> | |
%69 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x8x4x16x2x8xi8> | |
%70 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x4x2x4x16x2x8xi8> | |
%71 = scf.forall (%arg0) in (256) shared_outs(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) { | |
%72 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0) | |
%73:3 = affine.delinearize_index %72 into (%c4, %c4, %c16) : index, index, index | |
%extracted_slice = tensor.extract_slice %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32> | |
%74 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0) | |
%75:2 = affine.delinearize_index %74 into (%c4, %c16) : index, index | |
%76 = scf.for %arg2 = %c0 to %42 step %c1 iter_args(%arg3 = %extracted_slice) -> (tensor<1x1x8x1x2x1x1x4xi32>) { | |
%77:2 = iree_gpu.barrier_region ins(%69, %70 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>) { | |
^bb0(%arg4: tensor<1x1x8x4x16x2x8xi8>, %arg5: tensor<1x1x4x2x4x16x2x8xi8>): | |
%83 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg4) -> (tensor<1x1x8x4x16x2x8xi8>) { | |
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0) | |
%86:3 = affine.delinearize_index %85 into (%c8, %c4, %c16) : index, index, index | |
%extracted_slice_2 = tensor.extract_slice %66[0, %arg2, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8> | |
%inserted_slice = tensor.insert_slice %extracted_slice_2 into %arg7[0, 0, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8> | |
scf.yield %inserted_slice : tensor<1x1x8x4x16x2x8xi8> | |
} {unroll_loop} | |
%84 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg5) -> (tensor<1x1x4x2x4x16x2x8xi8>) { | |
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0) | |
%86:4 = affine.delinearize_index %85 into (%c4, %c2, %c4, %c16) : index, index, index, index | |
%extracted_slice_2 = tensor.extract_slice %67[0, %arg2, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8> | |
%inserted_slice = tensor.insert_slice %extracted_slice_2 into %arg7[0, 0, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8> | |
scf.yield %inserted_slice : tensor<1x1x4x2x4x16x2x8xi8> | |
} {unroll_loop} | |
iree_gpu.yield %83, %84 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8> | |
} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8> | |
%extracted_slice_0 = tensor.extract_slice %77#0[0, 0, 0, %75#0, %75#1, 0, 0] [1, 1, 8, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x8x1x1x2x8xi8> | |
%extracted_slice_1 = tensor.extract_slice %77#1[0, 0, %73#0, 0, %73#1, %73#2, 0, 0] [1, 1, 1, 2, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x2x1x1x2x8xi8> | |
%78 = vector.transfer_read %extracted_slice_0[%c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : tensor<1x1x8x1x1x2x8xi8>, vector<1x1x8x1x1x2x8xi8> | |
%79 = vector.transfer_read %extracted_slice_1[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x1x2x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> | |
%80 = vector.transfer_read %arg3[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x8x1x2x1x1x4xi32>, vector<1x1x8x1x2x1x1x4xi32> | |
%81 = iree_gpu.multi_mma %78, %79, %80 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32> | |
%82 = vector.transfer_write %81, %arg3[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, tensor<1x1x8x1x2x1x1x4xi32> | |
scf.yield %82 : tensor<1x1x8x1x2x1x1x4xi32> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %76 into %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32> | |
} | |
} {mapping = [#gpu.thread<linear_dim_0>]} | |
flow.dispatch.tensor.store %71, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%c2 = arith.constant 2 : index | |
%c256 = arith.constant 256 : index | |
%c512 = arith.constant 512 : index | |
%c8 = arith.constant 8 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%c32_i64 = arith.constant 32 : i64 | |
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32 | |
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} | |
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} | |
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8> | |
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8> | |
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32> | |
%69 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x8x4x16x2x8xi8> | |
%70 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x4x2x4x16x2x8xi8> | |
%71 = scf.forall (%arg0) in (256) shared_outs(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) { | |
%72 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0) | |
%73:3 = affine.delinearize_index %72 into (%c4, %c4, %c16) : index, index, index | |
%extracted_slice = tensor.extract_slice %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32> | |
%74 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0) | |
%75:2 = affine.delinearize_index %74 into (%c4, %c16) : index, index | |
%76 = scf.for %arg2 = %c0 to %42 step %c1 iter_args(%arg3 = %extracted_slice) -> (tensor<1x1x8x1x2x1x1x4xi32>) { | |
%77:2 = iree_gpu.barrier_region ins(%69, %70 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>) { | |
^bb0(%arg4: tensor<1x1x8x4x16x2x8xi8>, %arg5: tensor<1x1x4x2x4x16x2x8xi8>): | |
%83 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg4) -> (tensor<1x1x8x4x16x2x8xi8>) { | |
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0) | |
%86:3 = affine.delinearize_index %85 into (%c8, %c4, %c16) : index, index, index | |
%extracted_slice_2 = tensor.extract_slice %66[0, %arg2, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8> | |
%inserted_slice = tensor.insert_slice %extracted_slice_2 into %arg7[0, 0, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8> | |
scf.yield %inserted_slice : tensor<1x1x8x4x16x2x8xi8> | |
} {unroll_loop} | |
%84 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg5) -> (tensor<1x1x4x2x4x16x2x8xi8>) { | |
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0) | |
%86:4 = affine.delinearize_index %85 into (%c4, %c2, %c4, %c16) : index, index, index, index | |
%extracted_slice_2 = tensor.extract_slice %67[0, %arg2, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8> | |
%inserted_slice = tensor.insert_slice %extracted_slice_2 into %arg7[0, 0, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8> | |
scf.yield %inserted_slice : tensor<1x1x4x2x4x16x2x8xi8> | |
} {unroll_loop} | |
iree_gpu.yield %83, %84 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8> | |
} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8> | |
%extracted_slice_0 = tensor.extract_slice %77#0[0, 0, 0, %75#0, %75#1, 0, 0] [1, 1, 8, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x8x1x1x2x8xi8> | |
%extracted_slice_1 = tensor.extract_slice %77#1[0, 0, %73#0, 0, %73#1, %73#2, 0, 0] [1, 1, 1, 2, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x2x1x1x2x8xi8> | |
%78 = vector.transfer_read %extracted_slice_0[%c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : tensor<1x1x8x1x1x2x8xi8>, vector<1x1x8x1x1x2x8xi8> | |
%79 = vector.transfer_read %extracted_slice_1[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x1x2x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> | |
%80 = vector.transfer_read %arg3[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x8x1x2x1x1x4xi32>, vector<1x1x8x1x2x1x1x4xi32> | |
%81 = iree_gpu.multi_mma %78, %79, %80 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32> | |
%82 = vector.transfer_write %81, %arg3[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, tensor<1x1x8x1x2x1x1x4xi32> | |
scf.yield %82 : tensor<1x1x8x1x2x1x1x4xi32> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %76 into %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32> | |
} | |
} {mapping = [#gpu.thread<linear_dim_0>]} | |
flow.dispatch.tensor.store %71, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
// Dispatch function @foo_dispatch_6. Its translation_info selects the LLVMGPUTileAndFuse pipeline with
// workgroup_size = [256, 1, 1] and subgroup_size = 64.
// Outline of the body:
//   1. load 18 i32 interface constants and combine them pairwise into 9 index values;
//   2. bind two read-only i8 dispatch tensors and one read-write i32 dispatch tensor;
//   3. load this workgroup's tiles, selected by the workgroup ids;
//   4. in a 256-iteration scf.forall mapped to #gpu.thread<linear_dim_0>, loop over dimension 1 of the
//      inputs, stage the inputs into workgroup-address-space tensors, and accumulate with iree_gpu.multi_mma;
//   5. store the accumulated i32 tile back to the read-write dispatch tensor.
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} { | |
// Scalar and index constants: padding values for the transfer_reads, loop bounds/steps, delinearization
// bases, and the shift amount (32) used when combining constant pairs.
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%c2 = arith.constant 2 : index | |
%c256 = arith.constant 256 : index | |
%c512 = arith.constant 512 : index | |
%c8 = arith.constant 8 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%c32_i64 = arith.constant 32 : i64 | |
// Load the 18 i32 interface constants (ordinals 0 through 17).
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32 | |
// Combine each consecutive pair of constants into one 64-bit value, (even ordinal) | ((odd ordinal) << 32),
// both zero-extended, then cast it to index. Results: %22, %27, %32 (used below as binding offsets) and
// %37, %42, %47, %52, %57, %62 (used below as dynamic dimension sizes).
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
// Bind the dispatch tensors. %63 and %64 are both read-only views of binding(0), at offsets %22 and %27;
// %65 is the read-write view of binding(1) at offset %32.
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} | |
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} | |
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
// Per-workgroup loads: %66 is row workgroup_id_y of %63, %67 is row workgroup_id_x of %64, and %68 is the
// single tile of %65 at (workgroup_id_y, workgroup_id_x).
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8> | |
// NOTE(review): dimension 1 of this load is sized with %42, although this tensor's declared dynamic dims
// are {%47, %52}; presumably %42 == %52 at runtime -- confirm.
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8> | |
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32> | |
// Staging tensors for the two i8 operands, tagged with the workgroup address space.
%69 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x8x4x16x2x8xi8> | |
%70 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x4x2x4x16x2x8xi8> | |
// 256 iterations, mapped to #gpu.thread<linear_dim_0> (see the closing attribute); %arg1 is the shared
// i32 accumulator tile initialized from %68.
%71 = scf.forall (%arg0) in (256) shared_outs(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) { | |
// Delinearize the iteration index over (4, 4, 16) to pick this iteration's accumulator slice.
%72 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0) | |
%73:3 = affine.delinearize_index %72 into (%c4, %c4, %c16) : index, index, index | |
%extracted_slice = tensor.extract_slice %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32> | |
// Iteration index modulo 64 (the same value as subgroup_size in translation_info), delinearized over
// (4, 16); used below to pick the slice of the first operand.
%74 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0) | |
%75:2 = affine.delinearize_index %74 into (%c4, %c16) : index, index | |
// Loop %arg2 from 0 to %42 over dimension 1 of both inputs, carrying the accumulator slice as a tensor
// iter_arg (%arg3).
%76 = scf.for %arg2 = %c0 to %42 step %c1 iter_args(%arg3 = %extracted_slice) -> (tensor<1x1x8x1x2x1x1x4xi32>) { | |
// Copy phase inside iree_gpu.barrier_region: two loops marked unroll_loop, each with two iterations
// (%arg6 = 0 and 256). Every iteration copies one 2x8 i8 slice of the current %arg2 tile into the
// corresponding staging tensor, at the position obtained by delinearizing %arg6 + %arg0.
%77:2 = iree_gpu.barrier_region ins(%69, %70 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>) { | |
^bb0(%arg4: tensor<1x1x8x4x16x2x8xi8>, %arg5: tensor<1x1x4x2x4x16x2x8xi8>): | |
// First operand: positions delinearized over (8, 4, 16).
%83 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg4) -> (tensor<1x1x8x4x16x2x8xi8>) { | |
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0) | |
%86:3 = affine.delinearize_index %85 into (%c8, %c4, %c16) : index, index, index | |
%extracted_slice_2 = tensor.extract_slice %66[0, %arg2, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8> | |
%inserted_slice = tensor.insert_slice %extracted_slice_2 into %arg7[0, 0, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8> | |
scf.yield %inserted_slice : tensor<1x1x8x4x16x2x8xi8> | |
} {unroll_loop} | |
// Second operand: positions delinearized over (4, 2, 4, 16).
%84 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg5) -> (tensor<1x1x4x2x4x16x2x8xi8>) { | |
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0) | |
%86:4 = affine.delinearize_index %85 into (%c4, %c2, %c4, %c16) : index, index, index, index | |
%extracted_slice_2 = tensor.extract_slice %67[0, %arg2, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8> | |
%inserted_slice = tensor.insert_slice %extracted_slice_2 into %arg7[0, 0, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8> | |
scf.yield %inserted_slice : tensor<1x1x4x2x4x16x2x8xi8> | |
} {unroll_loop} | |
iree_gpu.yield %83, %84 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8> | |
} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8> | |
// Compute phase: slice this iteration's operands out of the staged tensors, read the operands and the
// accumulator as vectors, run the data-tiled multi_mma (intrinsic MFMA_I32_16x16x32_I8, unroll_m = 8,
// unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2), and write the result back into %arg3.
%extracted_slice_0 = tensor.extract_slice %77#0[0, 0, 0, %75#0, %75#1, 0, 0] [1, 1, 8, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x16x2x8xi8> to tensor<1x1x8x1x1x2x8xi8> | |
%extracted_slice_1 = tensor.extract_slice %77#1[0, 0, %73#0, 0, %73#1, %73#2, 0, 0] [1, 1, 1, 2, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x4x2x4x16x2x8xi8> to tensor<1x1x1x2x1x1x2x8xi8> | |
%78 = vector.transfer_read %extracted_slice_0[%c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : tensor<1x1x8x1x1x2x8xi8>, vector<1x1x8x1x1x2x8xi8> | |
%79 = vector.transfer_read %extracted_slice_1[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x1x2x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> | |
%80 = vector.transfer_read %arg3[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x8x1x2x1x1x4xi32>, vector<1x1x8x1x2x1x1x4xi32> | |
%81 = iree_gpu.multi_mma %78, %79, %80 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32> | |
%82 = vector.transfer_write %81, %arg3[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, tensor<1x1x8x1x2x1x1x4xi32> | |
scf.yield %82 : tensor<1x1x8x1x2x1x1x4xi32> | |
} | |
// Insert the final accumulator slice back into the shared output at the same offsets it was extracted from.
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %76 into %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32> | |
} | |
} {mapping = [#gpu.thread<linear_dim_0>]} | |
// Store the accumulated tile to %65 at (workgroup_id_y, workgroup_id_x).
flow.dispatch.tensor.store %71, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
return | |
} | |
// -----// IR Dump After OptimizeTensorInsertExtractSlicesPass (iree-codegen-optimize-tensor-insert-extract-slices) //----- // | |
// Dispatch function @foo_dispatch_6. Its translation_info selects the LLVMGPUTileAndFuse pipeline with
// workgroup_size = [256, 1, 1] and subgroup_size = 64.
// Outline of the body:
//   1. load 18 i32 interface constants and combine them pairwise into 9 index values;
//   2. bind two read-only i8 dispatch tensors and one read-write i32 dispatch tensor;
//   3. load this workgroup's tiles, selected by the workgroup ids;
//   4. in a 256-iteration scf.forall mapped to #gpu.thread<linear_dim_0>: read the accumulator slice into a
//      vector once, loop over dimension 1 of the inputs carrying that vector as the iter_arg (staging the
//      inputs into workgroup-address-space tensors and accumulating with iree_gpu.multi_mma), then write
//      the vector back to a tensor once after the loop;
//   5. store the accumulated i32 tile back to the read-write dispatch tensor.
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} { | |
// Scalar and index constants: padding values for the transfer_reads, loop bounds/steps, delinearization
// bases, and the shift amount (32) used when combining constant pairs.
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%c2 = arith.constant 2 : index | |
%c256 = arith.constant 256 : index | |
%c512 = arith.constant 512 : index | |
%c8 = arith.constant 8 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%c32_i64 = arith.constant 32 : i64 | |
// Load the 18 i32 interface constants (ordinals 0 through 17).
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32 | |
// Combine each consecutive pair of constants into one 64-bit value, (even ordinal) | ((odd ordinal) << 32),
// both zero-extended, then cast it to index. Results: %22, %27, %32 (used below as binding offsets) and
// %37, %42, %47, %52, %57, %62 (used below as dynamic dimension sizes).
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
// Bind the dispatch tensors. %63 and %64 are both read-only views of binding(0), at offsets %22 and %27;
// %65 is the read-write view of binding(1) at offset %32.
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} | |
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} | |
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
// Per-workgroup loads: %66 is row workgroup_id_y of %63, %67 is row workgroup_id_x of %64, and %68 is the
// single tile of %65 at (workgroup_id_y, workgroup_id_x).
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8> | |
// NOTE(review): dimension 1 of this load is sized with %42, although this tensor's declared dynamic dims
// are {%47, %52}; presumably %42 == %52 at runtime -- confirm.
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8> | |
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32> | |
// Staging tensors for the two i8 operands, tagged with the workgroup address space.
%69 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x8x4x16x2x8xi8> | |
%70 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x4x2x4x16x2x8xi8> | |
// 256 iterations, mapped to #gpu.thread<linear_dim_0> (see the closing attribute); %arg1 is the shared
// i32 accumulator tile initialized from %68.
%71 = scf.forall (%arg0) in (256) shared_outs(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) { | |
// Delinearize the iteration index over (4, 4, 16) to pick this iteration's accumulator slice.
%72 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0) | |
%73:3 = affine.delinearize_index %72 into (%c4, %c4, %c16) : index, index, index | |
// %extracted_slice is only used as the destination of the post-loop vector.transfer_write (%78).
%extracted_slice = tensor.extract_slice %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32> | |
// Iteration index modulo 64 (the same value as subgroup_size in translation_info), delinearized over
// (4, 16); used below to index the first staged operand.
%74 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0) | |
%75:2 = affine.delinearize_index %74 into (%c4, %c16) : index, index | |
// Read the accumulator slice directly from %arg1 as a vector, once, before the loop.
%76 = vector.transfer_read %arg1[%c0, %c0, %c0, %73#0, %c0, %73#1, %73#2, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x8x4x2x4x16x4xi32>, vector<1x1x8x1x2x1x1x4xi32> | |
// Loop %arg2 from 0 to %42 over dimension 1 of both inputs; the accumulator is carried as a vector
// iter_arg (%arg3), so the loop body contains no accumulator transfer_read/transfer_write.
%77 = scf.for %arg2 = %c0 to %42 step %c1 iter_args(%arg3 = %76) -> (vector<1x1x8x1x2x1x1x4xi32>) { | |
// Copy phase inside iree_gpu.barrier_region: two loops marked unroll_loop, each with two iterations
// (%arg6 = 0 and 256). Every iteration copies one 2x8 i8 slice of the current %arg2 tile into the
// corresponding staging tensor, at the position obtained by delinearizing %arg6 + %arg0.
%79:2 = iree_gpu.barrier_region ins(%69, %70 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>) { | |
^bb0(%arg4: tensor<1x1x8x4x16x2x8xi8>, %arg5: tensor<1x1x4x2x4x16x2x8xi8>): | |
// First operand: positions delinearized over (8, 4, 16).
%83 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg4) -> (tensor<1x1x8x4x16x2x8xi8>) { | |
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0) | |
%86:3 = affine.delinearize_index %85 into (%c8, %c4, %c16) : index, index, index | |
%extracted_slice_0 = tensor.extract_slice %66[0, %arg2, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8> | |
%inserted_slice = tensor.insert_slice %extracted_slice_0 into %arg7[0, 0, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8> | |
scf.yield %inserted_slice : tensor<1x1x8x4x16x2x8xi8> | |
} {unroll_loop} | |
// Second operand: positions delinearized over (4, 2, 4, 16).
%84 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg5) -> (tensor<1x1x4x2x4x16x2x8xi8>) { | |
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0) | |
%86:4 = affine.delinearize_index %85 into (%c4, %c2, %c4, %c16) : index, index, index, index | |
%extracted_slice_0 = tensor.extract_slice %67[0, %arg2, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8> | |
%inserted_slice = tensor.insert_slice %extracted_slice_0 into %arg7[0, 0, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8> | |
scf.yield %inserted_slice : tensor<1x1x4x2x4x16x2x8xi8> | |
} {unroll_loop} | |
iree_gpu.yield %83, %84 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8> | |
} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8> | |
// Compute phase: read this iteration's operand vectors straight from the staged tensors (the slice
// offsets appear as transfer_read indices), then run the data-tiled multi_mma (intrinsic
// MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2) with the
// loop-carried vector %arg3 as the accumulator.
%80 = vector.transfer_read %79#0[%c0, %c0, %c0, %75#0, %75#1, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : tensor<1x1x8x4x16x2x8xi8>, vector<1x1x8x1x1x2x8xi8> | |
%81 = vector.transfer_read %79#1[%c0, %c0, %73#0, %c0, %73#1, %73#2, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x4x2x4x16x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> | |
%82 = iree_gpu.multi_mma %80, %81, %arg3 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32> | |
scf.yield %82 : vector<1x1x8x1x2x1x1x4xi32> | |
} | |
// Write the final accumulator vector into the extracted slice, once, after the loop.
%78 = vector.transfer_write %77, %extracted_slice[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, tensor<1x1x8x1x2x1x1x4xi32> | |
// Insert the final accumulator slice back into the shared output at the same offsets it was extracted from.
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %78 into %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32> | |
} | |
} {mapping = [#gpu.thread<linear_dim_0>]} | |
// Store the accumulated tile to %65 at (workgroup_id_y, workgroup_id_x).
flow.dispatch.tensor.store %71, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%c2 = arith.constant 2 : index | |
%c256 = arith.constant 256 : index | |
%c512 = arith.constant 512 : index | |
%c8 = arith.constant 8 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%c32_i64 = arith.constant 32 : i64 | |
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32 | |
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} | |
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} | |
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8> | |
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8> | |
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32> | |
%69 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x8x4x16x2x8xi8> | |
%70 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x4x2x4x16x2x8xi8> | |
%71 = scf.forall (%arg0) in (256) shared_outs(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) { | |
%72 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0) | |
%73:3 = affine.delinearize_index %72 into (%c4, %c4, %c16) : index, index, index | |
%extracted_slice = tensor.extract_slice %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32> | |
%74 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0) | |
%75:2 = affine.delinearize_index %74 into (%c4, %c16) : index, index | |
%76 = vector.transfer_read %arg1[%c0, %c0, %c0, %73#0, %c0, %73#1, %73#2, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x8x4x2x4x16x4xi32>, vector<1x1x8x1x2x1x1x4xi32> | |
%77 = scf.for %arg2 = %c0 to %42 step %c1 iter_args(%arg3 = %76) -> (vector<1x1x8x1x2x1x1x4xi32>) { | |
%79:2 = iree_gpu.barrier_region ins(%69, %70 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>) { | |
^bb0(%arg4: tensor<1x1x8x4x16x2x8xi8>, %arg5: tensor<1x1x4x2x4x16x2x8xi8>): | |
%83 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg4) -> (tensor<1x1x8x4x16x2x8xi8>) { | |
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0) | |
%86:3 = affine.delinearize_index %85 into (%c8, %c4, %c16) : index, index, index | |
%extracted_slice_0 = tensor.extract_slice %66[0, %arg2, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8> | |
%inserted_slice = tensor.insert_slice %extracted_slice_0 into %arg7[0, 0, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8> | |
scf.yield %inserted_slice : tensor<1x1x8x4x16x2x8xi8> | |
} {unroll_loop} | |
%84 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg5) -> (tensor<1x1x4x2x4x16x2x8xi8>) { | |
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0) | |
%86:4 = affine.delinearize_index %85 into (%c4, %c2, %c4, %c16) : index, index, index, index | |
%extracted_slice_0 = tensor.extract_slice %67[0, %arg2, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8> | |
%inserted_slice = tensor.insert_slice %extracted_slice_0 into %arg7[0, 0, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8> | |
scf.yield %inserted_slice : tensor<1x1x4x2x4x16x2x8xi8> | |
} {unroll_loop} | |
iree_gpu.yield %83, %84 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8> | |
} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8> | |
%80 = vector.transfer_read %79#0[%c0, %c0, %c0, %75#0, %75#1, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : tensor<1x1x8x4x16x2x8xi8>, vector<1x1x8x1x1x2x8xi8> | |
%81 = vector.transfer_read %79#1[%c0, %c0, %73#0, %c0, %73#1, %73#2, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x4x2x4x16x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> | |
%82 = iree_gpu.multi_mma %80, %81, %arg3 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32> | |
scf.yield %82 : vector<1x1x8x1x2x1x1x4xi32> | |
} | |
%78 = vector.transfer_write %77, %extracted_slice[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, tensor<1x1x8x1x2x1x1x4xi32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %78 into %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32> | |
} | |
} {mapping = [#gpu.thread<linear_dim_0>]} | |
flow.dispatch.tensor.store %71, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%c2 = arith.constant 2 : index | |
%c256 = arith.constant 256 : index | |
%c512 = arith.constant 512 : index | |
%c8 = arith.constant 8 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%c32_i64 = arith.constant 32 : i64 | |
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32 | |
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} | |
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} | |
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8> | |
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8> | |
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32> | |
%69 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x8x4x16x2x8xi8> | |
%70 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x4x2x4x16x2x8xi8> | |
%71 = scf.forall (%arg0) in (256) shared_outs(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) { | |
%72 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0) | |
%73:3 = affine.delinearize_index %72 into (%c4, %c4, %c16) : index, index, index | |
%extracted_slice = tensor.extract_slice %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32> | |
%74 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0) | |
%75:2 = affine.delinearize_index %74 into (%c4, %c16) : index, index | |
%76 = vector.transfer_read %arg1[%c0, %c0, %c0, %73#0, %c0, %73#1, %73#2, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x8x4x2x4x16x4xi32>, vector<1x1x8x1x2x1x1x4xi32> | |
%77 = scf.for %arg2 = %c0 to %42 step %c1 iter_args(%arg3 = %76) -> (vector<1x1x8x1x2x1x1x4xi32>) { | |
%79:2 = iree_gpu.barrier_region ins(%69, %70 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>) { | |
^bb0(%arg4: tensor<1x1x8x4x16x2x8xi8>, %arg5: tensor<1x1x4x2x4x16x2x8xi8>): | |
%83 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg4) -> (tensor<1x1x8x4x16x2x8xi8>) { | |
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0) | |
%86:3 = affine.delinearize_index %85 into (%c8, %c4, %c16) : index, index, index | |
%extracted_slice_0 = tensor.extract_slice %66[0, %arg2, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8> | |
%inserted_slice = tensor.insert_slice %extracted_slice_0 into %arg7[0, 0, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8> | |
scf.yield %inserted_slice : tensor<1x1x8x4x16x2x8xi8> | |
} {unroll_loop} | |
%84 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg5) -> (tensor<1x1x4x2x4x16x2x8xi8>) { | |
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0) | |
%86:4 = affine.delinearize_index %85 into (%c4, %c2, %c4, %c16) : index, index, index, index | |
%extracted_slice_0 = tensor.extract_slice %67[0, %arg2, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8> | |
%inserted_slice = tensor.insert_slice %extracted_slice_0 into %arg7[0, 0, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8> | |
scf.yield %inserted_slice : tensor<1x1x4x2x4x16x2x8xi8> | |
} {unroll_loop} | |
iree_gpu.yield %83, %84 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8> | |
} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8> | |
%80 = vector.transfer_read %79#0[%c0, %c0, %c0, %75#0, %75#1, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : tensor<1x1x8x4x16x2x8xi8>, vector<1x1x8x1x1x2x8xi8> | |
%81 = vector.transfer_read %79#1[%c0, %c0, %73#0, %c0, %73#1, %73#2, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x4x2x4x16x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> | |
%82 = iree_gpu.multi_mma %80, %81, %arg3 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32> | |
scf.yield %82 : vector<1x1x8x1x2x1x1x4xi32> | |
} | |
%78 = vector.transfer_write %77, %extracted_slice[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, tensor<1x1x8x1x2x1x1x4xi32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %78 into %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32> | |
} | |
} {mapping = [#gpu.thread<linear_dim_0>]} | |
flow.dispatch.tensor.store %71, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
return | |
} | |
// -----// IR Dump After CleanupBufferAllocViewPass (iree-codegen-cleanup-buffer-alloc-view) //----- // | |
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%c2 = arith.constant 2 : index | |
%c256 = arith.constant 256 : index | |
%c512 = arith.constant 512 : index | |
%c8 = arith.constant 8 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%c32_i64 = arith.constant 32 : i64 | |
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32 | |
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} | |
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} | |
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8> | |
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8> | |
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32> | |
%69 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x8x4x16x2x8xi8> | |
%70 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x4x2x4x16x2x8xi8> | |
%71 = scf.forall (%arg0) in (256) shared_outs(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) { | |
%72 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0) | |
%73:3 = affine.delinearize_index %72 into (%c4, %c4, %c16) : index, index, index | |
%extracted_slice = tensor.extract_slice %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32> | |
%74 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0) | |
%75:2 = affine.delinearize_index %74 into (%c4, %c16) : index, index | |
%76 = vector.transfer_read %arg1[%c0, %c0, %c0, %73#0, %c0, %73#1, %73#2, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x8x4x2x4x16x4xi32>, vector<1x1x8x1x2x1x1x4xi32> | |
%77 = scf.for %arg2 = %c0 to %42 step %c1 iter_args(%arg3 = %76) -> (vector<1x1x8x1x2x1x1x4xi32>) { | |
%79:2 = iree_gpu.barrier_region ins(%69, %70 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>) { | |
^bb0(%arg4: tensor<1x1x8x4x16x2x8xi8>, %arg5: tensor<1x1x4x2x4x16x2x8xi8>): | |
%83 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg4) -> (tensor<1x1x8x4x16x2x8xi8>) { | |
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0) | |
%86:3 = affine.delinearize_index %85 into (%c8, %c4, %c16) : index, index, index | |
%extracted_slice_0 = tensor.extract_slice %66[0, %arg2, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8> | |
%inserted_slice = tensor.insert_slice %extracted_slice_0 into %arg7[0, 0, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8> | |
scf.yield %inserted_slice : tensor<1x1x8x4x16x2x8xi8> | |
} {unroll_loop} | |
%84 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg5) -> (tensor<1x1x4x2x4x16x2x8xi8>) { | |
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0) | |
%86:4 = affine.delinearize_index %85 into (%c4, %c2, %c4, %c16) : index, index, index, index | |
%extracted_slice_0 = tensor.extract_slice %67[0, %arg2, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8> | |
%inserted_slice = tensor.insert_slice %extracted_slice_0 into %arg7[0, 0, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8> | |
scf.yield %inserted_slice : tensor<1x1x4x2x4x16x2x8xi8> | |
} {unroll_loop} | |
iree_gpu.yield %83, %84 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8> | |
} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8> | |
%80 = vector.transfer_read %79#0[%c0, %c0, %c0, %75#0, %75#1, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : tensor<1x1x8x4x16x2x8xi8>, vector<1x1x8x1x1x2x8xi8> | |
%81 = vector.transfer_read %79#1[%c0, %c0, %73#0, %c0, %73#1, %73#2, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x4x2x4x16x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> | |
%82 = iree_gpu.multi_mma %80, %81, %arg3 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32> | |
scf.yield %82 : vector<1x1x8x1x2x1x1x4xi32> | |
} | |
%78 = vector.transfer_write %77, %extracted_slice[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, tensor<1x1x8x1x2x1x1x4xi32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %78 into %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32> | |
} | |
} {mapping = [#gpu.thread<linear_dim_0>]} | |
flow.dispatch.tensor.store %71, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
return | |
} | |
// -----// IR Dump After GPUCombineValueBarriersPass (iree-codegen-gpu-combine-value-barriers) //----- // | |
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%c2 = arith.constant 2 : index | |
%c256 = arith.constant 256 : index | |
%c512 = arith.constant 512 : index | |
%c8 = arith.constant 8 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%c32_i64 = arith.constant 32 : i64 | |
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32 | |
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} | |
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} | |
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8> | |
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8> | |
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32> | |
%69 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x8x4x16x2x8xi8> | |
%70 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x4x2x4x16x2x8xi8> | |
%71 = scf.forall (%arg0) in (256) shared_outs(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) { | |
%72 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0) | |
%73:3 = affine.delinearize_index %72 into (%c4, %c4, %c16) : index, index, index | |
%extracted_slice = tensor.extract_slice %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32> | |
%74 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0) | |
%75:2 = affine.delinearize_index %74 into (%c4, %c16) : index, index | |
%76 = vector.transfer_read %arg1[%c0, %c0, %c0, %73#0, %c0, %73#1, %73#2, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x8x4x2x4x16x4xi32>, vector<1x1x8x1x2x1x1x4xi32> | |
%77 = scf.for %arg2 = %c0 to %42 step %c1 iter_args(%arg3 = %76) -> (vector<1x1x8x1x2x1x1x4xi32>) { | |
%79:2 = iree_gpu.barrier_region ins(%69, %70 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>) { | |
^bb0(%arg4: tensor<1x1x8x4x16x2x8xi8>, %arg5: tensor<1x1x4x2x4x16x2x8xi8>): | |
%83 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg4) -> (tensor<1x1x8x4x16x2x8xi8>) { | |
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0) | |
%86:3 = affine.delinearize_index %85 into (%c8, %c4, %c16) : index, index, index | |
%extracted_slice_0 = tensor.extract_slice %66[0, %arg2, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8> | |
%inserted_slice = tensor.insert_slice %extracted_slice_0 into %arg7[0, 0, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8> | |
scf.yield %inserted_slice : tensor<1x1x8x4x16x2x8xi8> | |
} {unroll_loop} | |
%84 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg5) -> (tensor<1x1x4x2x4x16x2x8xi8>) { | |
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0) | |
%86:4 = affine.delinearize_index %85 into (%c4, %c2, %c4, %c16) : index, index, index, index | |
%extracted_slice_0 = tensor.extract_slice %67[0, %arg2, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8> | |
%inserted_slice = tensor.insert_slice %extracted_slice_0 into %arg7[0, 0, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8> | |
scf.yield %inserted_slice : tensor<1x1x4x2x4x16x2x8xi8> | |
} {unroll_loop} | |
iree_gpu.yield %83, %84 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8> | |
} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8> | |
%80 = vector.transfer_read %79#0[%c0, %c0, %c0, %75#0, %75#1, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : tensor<1x1x8x4x16x2x8xi8>, vector<1x1x8x1x1x2x8xi8> | |
%81 = vector.transfer_read %79#1[%c0, %c0, %73#0, %c0, %73#1, %73#2, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x4x2x4x16x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> | |
%82 = iree_gpu.multi_mma %80, %81, %arg3 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32> | |
scf.yield %82 : vector<1x1x8x1x2x1x1x4xi32> | |
} | |
%78 = vector.transfer_write %77, %extracted_slice[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, tensor<1x1x8x1x2x1x1x4xi32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %78 into %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32> | |
} | |
} {mapping = [#gpu.thread<linear_dim_0>]} | |
flow.dispatch.tensor.store %71, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
return | |
} | |
// -----// IR Dump After EliminateEmptyTensorsPass (iree-eliminate-empty-tensors) //----- // | |
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%c2 = arith.constant 2 : index | |
%c256 = arith.constant 256 : index | |
%c512 = arith.constant 512 : index | |
%c8 = arith.constant 8 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%c32_i64 = arith.constant 32 : i64 | |
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32 | |
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} | |
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} | |
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8> | |
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8> | |
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32> | |
%69 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x8x4x16x2x8xi8> | |
%70 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x4x2x4x16x2x8xi8> | |
%71 = scf.forall (%arg0) in (256) shared_outs(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) { | |
%72 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0) | |
%73:3 = affine.delinearize_index %72 into (%c4, %c4, %c16) : index, index, index | |
%extracted_slice = tensor.extract_slice %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32> | |
%74 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0) | |
%75:2 = affine.delinearize_index %74 into (%c4, %c16) : index, index | |
%76 = vector.transfer_read %arg1[%c0, %c0, %c0, %73#0, %c0, %73#1, %73#2, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x8x4x2x4x16x4xi32>, vector<1x1x8x1x2x1x1x4xi32> | |
%77 = scf.for %arg2 = %c0 to %42 step %c1 iter_args(%arg3 = %76) -> (vector<1x1x8x1x2x1x1x4xi32>) { | |
%79:2 = iree_gpu.barrier_region ins(%69, %70 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>) { | |
^bb0(%arg4: tensor<1x1x8x4x16x2x8xi8>, %arg5: tensor<1x1x4x2x4x16x2x8xi8>): | |
%83 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg4) -> (tensor<1x1x8x4x16x2x8xi8>) { | |
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0) | |
%86:3 = affine.delinearize_index %85 into (%c8, %c4, %c16) : index, index, index | |
%extracted_slice_0 = tensor.extract_slice %66[0, %arg2, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8> | |
%inserted_slice = tensor.insert_slice %extracted_slice_0 into %arg7[0, 0, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8> | |
scf.yield %inserted_slice : tensor<1x1x8x4x16x2x8xi8> | |
} {unroll_loop} | |
%84 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg5) -> (tensor<1x1x4x2x4x16x2x8xi8>) { | |
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0) | |
%86:4 = affine.delinearize_index %85 into (%c4, %c2, %c4, %c16) : index, index, index, index | |
%extracted_slice_0 = tensor.extract_slice %67[0, %arg2, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8> | |
%inserted_slice = tensor.insert_slice %extracted_slice_0 into %arg7[0, 0, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8> | |
scf.yield %inserted_slice : tensor<1x1x4x2x4x16x2x8xi8> | |
} {unroll_loop} | |
iree_gpu.yield %83, %84 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8> | |
} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8> | |
%80 = vector.transfer_read %79#0[%c0, %c0, %c0, %75#0, %75#1, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : tensor<1x1x8x4x16x2x8xi8>, vector<1x1x8x1x1x2x8xi8> | |
%81 = vector.transfer_read %79#1[%c0, %c0, %73#0, %c0, %73#1, %73#2, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x4x2x4x16x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> | |
%82 = iree_gpu.multi_mma %80, %81, %arg3 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32> | |
scf.yield %82 : vector<1x1x8x1x2x1x1x4xi32> | |
} | |
%78 = vector.transfer_write %77, %extracted_slice[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, tensor<1x1x8x1x2x1x1x4xi32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %78 into %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32> | |
} | |
} {mapping = [#gpu.thread<linear_dim_0>]} | |
flow.dispatch.tensor.store %71, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
return | |
} | |
// -----// IR Dump After EmptyTensorToAllocTensor (empty-tensor-to-alloc-tensor) //----- // | |
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%c2 = arith.constant 2 : index | |
%c256 = arith.constant 256 : index | |
%c512 = arith.constant 512 : index | |
%c8 = arith.constant 8 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%c32_i64 = arith.constant 32 : i64 | |
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32 | |
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} | |
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} | |
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8> | |
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8> | |
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32> | |
%69 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x8x4x16x2x8xi8> | |
%70 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x4x2x4x16x2x8xi8> | |
%71 = scf.forall (%arg0) in (256) shared_outs(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) { | |
%72 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0) | |
%73:3 = affine.delinearize_index %72 into (%c4, %c4, %c16) : index, index, index | |
%extracted_slice = tensor.extract_slice %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32> | |
%74 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0) | |
%75:2 = affine.delinearize_index %74 into (%c4, %c16) : index, index | |
%76 = vector.transfer_read %arg1[%c0, %c0, %c0, %73#0, %c0, %73#1, %73#2, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x8x4x2x4x16x4xi32>, vector<1x1x8x1x2x1x1x4xi32> | |
%77 = scf.for %arg2 = %c0 to %42 step %c1 iter_args(%arg3 = %76) -> (vector<1x1x8x1x2x1x1x4xi32>) { | |
%79:2 = iree_gpu.barrier_region ins(%69, %70 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>) { | |
^bb0(%arg4: tensor<1x1x8x4x16x2x8xi8>, %arg5: tensor<1x1x4x2x4x16x2x8xi8>): | |
%83 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg4) -> (tensor<1x1x8x4x16x2x8xi8>) { | |
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0) | |
%86:3 = affine.delinearize_index %85 into (%c8, %c4, %c16) : index, index, index | |
%extracted_slice_0 = tensor.extract_slice %66[0, %arg2, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8> | |
%inserted_slice = tensor.insert_slice %extracted_slice_0 into %arg7[0, 0, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8> | |
scf.yield %inserted_slice : tensor<1x1x8x4x16x2x8xi8> | |
} {unroll_loop} | |
%84 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg5) -> (tensor<1x1x4x2x4x16x2x8xi8>) { | |
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0) | |
%86:4 = affine.delinearize_index %85 into (%c4, %c2, %c4, %c16) : index, index, index, index | |
%extracted_slice_0 = tensor.extract_slice %67[0, %arg2, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8> | |
%inserted_slice = tensor.insert_slice %extracted_slice_0 into %arg7[0, 0, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8> | |
scf.yield %inserted_slice : tensor<1x1x4x2x4x16x2x8xi8> | |
} {unroll_loop} | |
iree_gpu.yield %83, %84 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8> | |
} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8> | |
%80 = vector.transfer_read %79#0[%c0, %c0, %c0, %75#0, %75#1, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : tensor<1x1x8x4x16x2x8xi8>, vector<1x1x8x1x1x2x8xi8> | |
%81 = vector.transfer_read %79#1[%c0, %c0, %73#0, %c0, %73#1, %73#2, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x4x2x4x16x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> | |
%82 = iree_gpu.multi_mma %80, %81, %arg3 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32> | |
scf.yield %82 : vector<1x1x8x1x2x1x1x4xi32> | |
} | |
%78 = vector.transfer_write %77, %extracted_slice[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, tensor<1x1x8x1x2x1x1x4xi32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %78 into %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32> | |
} | |
} {mapping = [#gpu.thread<linear_dim_0>]} | |
flow.dispatch.tensor.store %71, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
return | |
} | |
// -----// IR Dump After GPUInferMemorySpacePass (iree-codegen-gpu-infer-memory-space) //----- // | |
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%c2 = arith.constant 2 : index | |
%c256 = arith.constant 256 : index | |
%c512 = arith.constant 512 : index | |
%c8 = arith.constant 8 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%c32_i64 = arith.constant 32 : i64 | |
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32 | |
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} | |
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} | |
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%66 = flow.dispatch.tensor.load %63, offsets = [%workgroup_id_y, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%37, %42} -> tensor<1x?x8x4x16x2x8xi8> | |
%67 = flow.dispatch.tensor.load %64, offsets = [%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %42, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%47, %52} -> tensor<1x?x4x2x4x16x2x8xi8> | |
%68 = flow.dispatch.tensor.load %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} -> tensor<1x1x8x4x2x4x16x4xi32> | |
%69 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x8x4x16x2x8xi8> | |
%70 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<1x1x4x2x4x16x2x8xi8> | |
%71 = scf.forall (%arg0) in (256) shared_outs(%arg1 = %68) -> (tensor<1x1x8x4x2x4x16x4xi32>) { | |
%72 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0) | |
%73:3 = affine.delinearize_index %72 into (%c4, %c4, %c16) : index, index, index | |
%extracted_slice = tensor.extract_slice %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> to tensor<1x1x8x1x2x1x1x4xi32> | |
%74 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0) | |
%75:2 = affine.delinearize_index %74 into (%c4, %c16) : index, index | |
%76 = vector.transfer_read %arg1[%c0, %c0, %c0, %73#0, %c0, %73#1, %73#2, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x8x4x2x4x16x4xi32>, vector<1x1x8x1x2x1x1x4xi32> | |
%77 = scf.for %arg2 = %c0 to %42 step %c1 iter_args(%arg3 = %76) -> (vector<1x1x8x1x2x1x1x4xi32>) { | |
%79:2 = iree_gpu.barrier_region ins(%69, %70 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8>) { | |
^bb0(%arg4: tensor<1x1x8x4x16x2x8xi8>, %arg5: tensor<1x1x4x2x4x16x2x8xi8>): | |
%83 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg4) -> (tensor<1x1x8x4x16x2x8xi8>) { | |
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0) | |
%86:3 = affine.delinearize_index %85 into (%c8, %c4, %c16) : index, index, index | |
%extracted_slice_0 = tensor.extract_slice %66[0, %arg2, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x?x8x4x16x2x8xi8> to tensor<1x1x1x1x1x2x8xi8> | |
%inserted_slice = tensor.insert_slice %extracted_slice_0 into %arg7[0, 0, %86#0, %86#1, %86#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x2x8xi8> into tensor<1x1x8x4x16x2x8xi8> | |
scf.yield %inserted_slice : tensor<1x1x8x4x16x2x8xi8> | |
} {unroll_loop} | |
%84 = scf.for %arg6 = %c0 to %c512 step %c256 iter_args(%arg7 = %arg5) -> (tensor<1x1x4x2x4x16x2x8xi8>) { | |
%85 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg0) | |
%86:4 = affine.delinearize_index %85 into (%c4, %c2, %c4, %c16) : index, index, index, index | |
%extracted_slice_0 = tensor.extract_slice %67[0, %arg2, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x4x2x4x16x2x8xi8> to tensor<1x1x1x1x1x1x2x8xi8> | |
%inserted_slice = tensor.insert_slice %extracted_slice_0 into %arg7[0, 0, %86#0, %86#1, %86#2, %86#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x1x1x1x1x2x8xi8> into tensor<1x1x4x2x4x16x2x8xi8> | |
scf.yield %inserted_slice : tensor<1x1x4x2x4x16x2x8xi8> | |
} {unroll_loop} | |
iree_gpu.yield %83, %84 : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8> | |
} : tensor<1x1x8x4x16x2x8xi8>, tensor<1x1x4x2x4x16x2x8xi8> | |
%80 = vector.transfer_read %79#0[%c0, %c0, %c0, %75#0, %75#1, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : tensor<1x1x8x4x16x2x8xi8>, vector<1x1x8x1x1x2x8xi8> | |
%81 = vector.transfer_read %79#1[%c0, %c0, %73#0, %c0, %73#1, %73#2, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : tensor<1x1x4x2x4x16x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> | |
%82 = iree_gpu.multi_mma %80, %81, %arg3 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32> | |
scf.yield %82 : vector<1x1x8x1x2x1x1x4xi32> | |
} | |
%78 = vector.transfer_write %77, %extracted_slice[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, tensor<1x1x8x1x2x1x1x4xi32> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %78 into %arg1[0, 0, 0, %73#0, 0, %73#1, %73#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x1x2x1x1x4xi32> into tensor<1x1x8x4x2x4x16x4xi32> | |
} | |
} {mapping = [#gpu.thread<linear_dim_0>]} | |
flow.dispatch.tensor.store %71, %65, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0], sizes = [1, 1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x1x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%57, %62} | |
return | |
} | |
// -----// IR Dump After IREEComprehensiveBufferizePass (iree-codegen-iree-comprehensive-bufferize) //----- // | |
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%c2 = arith.constant 2 : index | |
%c256 = arith.constant 256 : index | |
%c512 = arith.constant 512 : index | |
%c8 = arith.constant 8 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%c32_i64 = arith.constant 32 : i64 | |
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32 | |
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%37, %42} | |
memref.assume_alignment %63, 1 : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%47, %52} | |
memref.assume_alignment %64, 1 : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%57, %62} | |
memref.assume_alignment %65, 1 : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%subview = memref.subview %63[%workgroup_id_y, 0, 0, 0, 0, 0, 0] [1, %42, 8, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_0 = memref.subview %64[%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0] [1, %42, 4, 2, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_1 = memref.subview %65[%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 2, 4, 16, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%alloc = memref.alloc() : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>> | |
%alloc_2 = memref.alloc() : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>> | |
scf.forall (%arg0) in (256) { | |
%66 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0) | |
%67:3 = affine.delinearize_index %66 into (%c4, %c4, %c16) : index, index, index | |
%subview_4 = memref.subview %subview_1[0, 0, 0, %67#0, 0, %67#1, %67#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%68 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0) | |
%69:2 = affine.delinearize_index %68 into (%c4, %c16) : index, index | |
%70 = vector.transfer_read %subview_1[%c0, %c0, %c0, %67#0, %c0, %67#1, %67#2, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8x1x2x1x1x4xi32> | |
%71 = scf.for %arg1 = %c0 to %42 step %c1 iter_args(%arg2 = %70) -> (vector<1x1x8x1x2x1x1x4xi32>) { | |
gpu.barrier | |
%72 = scf.for %arg3 = %c0 to %c512 step %c256 iter_args(%arg4 = %alloc) -> (memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>>) { | |
%77 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %arg0) | |
%78:3 = affine.delinearize_index %77 into (%c8, %c4, %c16) : index, index, index | |
%subview_6 = memref.subview %subview[0, %arg1, %78#0, %78#1, %78#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_7 = memref.subview %arg4[0, 0, %78#0, %78#1, %78#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>> | |
memref.copy %subview_6, %subview_7 : memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>> | |
scf.yield %arg4 : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>> | |
} {unroll_loop} | |
%73 = scf.for %arg3 = %c0 to %c512 step %c256 iter_args(%arg4 = %alloc_2) -> (memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>>) { | |
%77 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %arg0) | |
%78:4 = affine.delinearize_index %77 into (%c4, %c2, %c4, %c16) : index, index, index, index | |
%subview_6 = memref.subview %subview_0[0, %arg1, %78#0, %78#1, %78#2, %78#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_7 = memref.subview %arg4[0, 0, %78#0, %78#1, %78#2, %78#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>> | |
memref.copy %subview_6, %subview_7 : memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>> | |
scf.yield %arg4 : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>> | |
} {unroll_loop} | |
gpu.barrier | |
%74 = vector.transfer_read %72[%c0, %c0, %c0, %69#0, %69#1, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x8x1x1x2x8xi8> | |
%75 = vector.transfer_read %73[%c0, %c0, %67#0, %c0, %67#1, %67#2, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x1x2x1x1x2x8xi8> | |
%76 = iree_gpu.multi_mma %74, %75, %arg2 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32> | |
scf.yield %76 : vector<1x1x8x1x2x1x1x4xi32> | |
} | |
vector.transfer_write %71, %subview_4[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_5 = memref.subview %subview_1[0, 0, 0, %67#0, 0, %67#1, %67#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.copy %subview_4, %subview_5 : memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} {mapping = [#gpu.thread<linear_dim_0>]} | |
%subview_3 = memref.subview %65[%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 2, 4, 16, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.copy %subview_1, %subview_3 : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
return | |
} | |
// -----// IR Dump After ResolveShapedTypeResultDims (resolve-shaped-type-result-dims) //----- // | |
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%c2 = arith.constant 2 : index | |
%c256 = arith.constant 256 : index | |
%c512 = arith.constant 512 : index | |
%c8 = arith.constant 8 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%c32_i64 = arith.constant 32 : i64 | |
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32 | |
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%37, %42} | |
memref.assume_alignment %63, 1 : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%47, %52} | |
memref.assume_alignment %64, 1 : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%57, %62} | |
memref.assume_alignment %65, 1 : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%subview = memref.subview %63[%workgroup_id_y, 0, 0, 0, 0, 0, 0] [1, %42, 8, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_0 = memref.subview %64[%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0] [1, %42, 4, 2, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_1 = memref.subview %65[%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 2, 4, 16, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%alloc = memref.alloc() : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>> | |
%alloc_2 = memref.alloc() : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>> | |
scf.forall (%arg0) in (256) { | |
%66 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0) | |
%67:3 = affine.delinearize_index %66 into (%c4, %c4, %c16) : index, index, index | |
%subview_4 = memref.subview %subview_1[0, 0, 0, %67#0, 0, %67#1, %67#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%68 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0) | |
%69:2 = affine.delinearize_index %68 into (%c4, %c16) : index, index | |
%70 = vector.transfer_read %subview_1[%c0, %c0, %c0, %67#0, %c0, %67#1, %67#2, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8x1x2x1x1x4xi32> | |
%71 = scf.for %arg1 = %c0 to %42 step %c1 iter_args(%arg2 = %70) -> (vector<1x1x8x1x2x1x1x4xi32>) { | |
gpu.barrier | |
%72 = scf.for %arg3 = %c0 to %c512 step %c256 iter_args(%arg4 = %alloc) -> (memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>>) { | |
%77 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %arg0) | |
%78:3 = affine.delinearize_index %77 into (%c8, %c4, %c16) : index, index, index | |
%subview_6 = memref.subview %subview[0, %arg1, %78#0, %78#1, %78#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_7 = memref.subview %arg4[0, 0, %78#0, %78#1, %78#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>> | |
memref.copy %subview_6, %subview_7 : memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>> | |
scf.yield %arg4 : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>> | |
} {unroll_loop} | |
%73 = scf.for %arg3 = %c0 to %c512 step %c256 iter_args(%arg4 = %alloc_2) -> (memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>>) { | |
%77 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %arg0) | |
%78:4 = affine.delinearize_index %77 into (%c4, %c2, %c4, %c16) : index, index, index, index | |
%subview_6 = memref.subview %subview_0[0, %arg1, %78#0, %78#1, %78#2, %78#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_7 = memref.subview %arg4[0, 0, %78#0, %78#1, %78#2, %78#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>> | |
memref.copy %subview_6, %subview_7 : memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>> | |
scf.yield %arg4 : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>> | |
} {unroll_loop} | |
gpu.barrier | |
%74 = vector.transfer_read %72[%c0, %c0, %c0, %69#0, %69#1, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x8x1x1x2x8xi8> | |
%75 = vector.transfer_read %73[%c0, %c0, %67#0, %c0, %67#1, %67#2, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x1x2x1x1x2x8xi8> | |
%76 = iree_gpu.multi_mma %74, %75, %arg2 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32> | |
scf.yield %76 : vector<1x1x8x1x2x1x1x4xi32> | |
} | |
vector.transfer_write %71, %subview_4[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_5 = memref.subview %subview_1[0, 0, 0, %67#0, 0, %67#1, %67#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.copy %subview_4, %subview_5 : memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} {mapping = [#gpu.thread<linear_dim_0>]} | |
%subview_3 = memref.subview %65[%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 2, 4, 16, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.copy %subview_1, %subview_3 : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%c2 = arith.constant 2 : index | |
%c256 = arith.constant 256 : index | |
%c512 = arith.constant 512 : index | |
%c8 = arith.constant 8 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%c32_i64 = arith.constant 32 : i64 | |
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32 | |
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%37, %42} | |
memref.assume_alignment %63, 1 : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%47, %52} | |
memref.assume_alignment %64, 1 : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%57, %62} | |
memref.assume_alignment %65, 1 : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%subview = memref.subview %63[%workgroup_id_y, 0, 0, 0, 0, 0, 0] [1, %42, 8, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_0 = memref.subview %64[%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0] [1, %42, 4, 2, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_1 = memref.subview %65[%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 2, 4, 16, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%alloc = memref.alloc() : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>> | |
%alloc_2 = memref.alloc() : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>> | |
scf.forall (%arg0) in (256) { | |
%66 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0) | |
%67:3 = affine.delinearize_index %66 into (%c4, %c4, %c16) : index, index, index | |
%subview_4 = memref.subview %subview_1[0, 0, 0, %67#0, 0, %67#1, %67#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%68 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0) | |
%69:2 = affine.delinearize_index %68 into (%c4, %c16) : index, index | |
%70 = vector.transfer_read %subview_1[%c0, %c0, %c0, %67#0, %c0, %67#1, %67#2, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8x1x2x1x1x4xi32> | |
%71 = scf.for %arg1 = %c0 to %42 step %c1 iter_args(%arg2 = %70) -> (vector<1x1x8x1x2x1x1x4xi32>) { | |
gpu.barrier | |
scf.for %arg3 = %c0 to %c512 step %c256 { | |
%75 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %arg0) | |
%76:3 = affine.delinearize_index %75 into (%c8, %c4, %c16) : index, index, index | |
%subview_6 = memref.subview %subview[0, %arg1, %76#0, %76#1, %76#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_7 = memref.subview %alloc[0, 0, %76#0, %76#1, %76#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>> | |
memref.copy %subview_6, %subview_7 : memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>> | |
} {unroll_loop} | |
scf.for %arg3 = %c0 to %c512 step %c256 { | |
%75 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %arg0) | |
%76:4 = affine.delinearize_index %75 into (%c4, %c2, %c4, %c16) : index, index, index, index | |
%subview_6 = memref.subview %subview_0[0, %arg1, %76#0, %76#1, %76#2, %76#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_7 = memref.subview %alloc_2[0, 0, %76#0, %76#1, %76#2, %76#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>> | |
memref.copy %subview_6, %subview_7 : memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>> | |
} {unroll_loop} | |
gpu.barrier | |
%72 = vector.transfer_read %alloc[%c0, %c0, %c0, %69#0, %69#1, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x8x1x1x2x8xi8> | |
%73 = vector.transfer_read %alloc_2[%c0, %c0, %67#0, %c0, %67#1, %67#2, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x1x2x1x1x2x8xi8> | |
%74 = iree_gpu.multi_mma %72, %73, %arg2 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32> | |
scf.yield %74 : vector<1x1x8x1x2x1x1x4xi32> | |
} | |
vector.transfer_write %71, %subview_4[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_5 = memref.subview %subview_1[0, 0, 0, %67#0, 0, %67#1, %67#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.copy %subview_4, %subview_5 : memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} {mapping = [#gpu.thread<linear_dim_0>]} | |
%subview_3 = memref.subview %65[%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 2, 4, 16, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.copy %subview_1, %subview_3 : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%c2 = arith.constant 2 : index | |
%c256 = arith.constant 256 : index | |
%c512 = arith.constant 512 : index | |
%c8 = arith.constant 8 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%c32_i64 = arith.constant 32 : i64 | |
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32 | |
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%37, %42} | |
memref.assume_alignment %63, 1 : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%47, %52} | |
memref.assume_alignment %64, 1 : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%57, %62} | |
memref.assume_alignment %65, 1 : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%subview = memref.subview %63[%workgroup_id_y, 0, 0, 0, 0, 0, 0] [1, %42, 8, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_0 = memref.subview %64[%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0] [1, %42, 4, 2, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_1 = memref.subview %65[%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 2, 4, 16, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%alloc = memref.alloc() : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>> | |
%alloc_2 = memref.alloc() : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>> | |
scf.forall (%arg0) in (256) { | |
%66 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0) | |
%67:3 = affine.delinearize_index %66 into (%c4, %c4, %c16) : index, index, index | |
%subview_3 = memref.subview %subview_1[0, 0, 0, %67#0, 0, %67#1, %67#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%68 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0) | |
%69:2 = affine.delinearize_index %68 into (%c4, %c16) : index, index | |
%70 = vector.transfer_read %subview_1[%c0, %c0, %c0, %67#0, %c0, %67#1, %67#2, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8x1x2x1x1x4xi32> | |
%71 = scf.for %arg1 = %c0 to %42 step %c1 iter_args(%arg2 = %70) -> (vector<1x1x8x1x2x1x1x4xi32>) { | |
gpu.barrier | |
scf.for %arg3 = %c0 to %c512 step %c256 { | |
%75 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %arg0) | |
%76:3 = affine.delinearize_index %75 into (%c8, %c4, %c16) : index, index, index | |
%subview_4 = memref.subview %subview[0, %arg1, %76#0, %76#1, %76#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_5 = memref.subview %alloc[0, 0, %76#0, %76#1, %76#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>> | |
memref.copy %subview_4, %subview_5 : memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>> | |
} {unroll_loop} | |
scf.for %arg3 = %c0 to %c512 step %c256 { | |
%75 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %arg0) | |
%76:4 = affine.delinearize_index %75 into (%c4, %c2, %c4, %c16) : index, index, index, index | |
%subview_4 = memref.subview %subview_0[0, %arg1, %76#0, %76#1, %76#2, %76#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_5 = memref.subview %alloc_2[0, 0, %76#0, %76#1, %76#2, %76#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>> | |
memref.copy %subview_4, %subview_5 : memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>> | |
} {unroll_loop} | |
gpu.barrier | |
%72 = vector.transfer_read %alloc[%c0, %c0, %c0, %69#0, %69#1, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x8x1x1x2x8xi8> | |
%73 = vector.transfer_read %alloc_2[%c0, %c0, %67#0, %c0, %67#1, %67#2, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x1x2x1x1x2x8xi8> | |
%74 = iree_gpu.multi_mma %72, %73, %arg2 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32> | |
scf.yield %74 : vector<1x1x8x1x2x1x1x4xi32> | |
} | |
vector.transfer_write %71, %subview_3[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.copy %subview_3, %subview_3 : memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} {mapping = [#gpu.thread<linear_dim_0>]} | |
memref.copy %subview_1, %subview_1 : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%c2 = arith.constant 2 : index | |
%c256 = arith.constant 256 : index | |
%c512 = arith.constant 512 : index | |
%c8 = arith.constant 8 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%c32_i64 = arith.constant 32 : i64 | |
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32 | |
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%37, %42} | |
memref.assume_alignment %63, 1 : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%47, %52} | |
memref.assume_alignment %64, 1 : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%57, %62} | |
memref.assume_alignment %65, 1 : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%subview = memref.subview %63[%workgroup_id_y, 0, 0, 0, 0, 0, 0] [1, %42, 8, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_0 = memref.subview %64[%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0] [1, %42, 4, 2, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_1 = memref.subview %65[%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 2, 4, 16, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%alloc = memref.alloc() : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>> | |
%alloc_2 = memref.alloc() : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>> | |
scf.forall (%arg0) in (256) { | |
%66 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0) | |
%67:3 = affine.delinearize_index %66 into (%c4, %c4, %c16) : index, index, index | |
%subview_3 = memref.subview %subview_1[0, 0, 0, %67#0, 0, %67#1, %67#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%68 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0) | |
%69:2 = affine.delinearize_index %68 into (%c4, %c16) : index, index | |
%70 = vector.transfer_read %subview_1[%c0, %c0, %c0, %67#0, %c0, %67#1, %67#2, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8x1x2x1x1x4xi32> | |
%71 = scf.for %arg1 = %c0 to %42 step %c1 iter_args(%arg2 = %70) -> (vector<1x1x8x1x2x1x1x4xi32>) { | |
gpu.barrier | |
scf.for %arg3 = %c0 to %c512 step %c256 { | |
%75 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %arg0) | |
%76:3 = affine.delinearize_index %75 into (%c8, %c4, %c16) : index, index, index | |
%subview_4 = memref.subview %subview[0, %arg1, %76#0, %76#1, %76#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_5 = memref.subview %alloc[0, 0, %76#0, %76#1, %76#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>> | |
memref.copy %subview_4, %subview_5 : memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>> | |
} {unroll_loop} | |
scf.for %arg3 = %c0 to %c512 step %c256 { | |
%75 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %arg0) | |
%76:4 = affine.delinearize_index %75 into (%c4, %c2, %c4, %c16) : index, index, index, index | |
%subview_4 = memref.subview %subview_0[0, %arg1, %76#0, %76#1, %76#2, %76#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_5 = memref.subview %alloc_2[0, 0, %76#0, %76#1, %76#2, %76#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>> | |
memref.copy %subview_4, %subview_5 : memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>> | |
} {unroll_loop} | |
gpu.barrier | |
%72 = vector.transfer_read %alloc[%c0, %c0, %c0, %69#0, %69#1, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x8x1x1x2x8xi8> | |
%73 = vector.transfer_read %alloc_2[%c0, %c0, %67#0, %c0, %67#1, %67#2, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x1x2x1x1x2x8xi8> | |
%74 = iree_gpu.multi_mma %72, %73, %arg2 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32> | |
scf.yield %74 : vector<1x1x8x1x2x1x1x4xi32> | |
} | |
vector.transfer_write %71, %subview_3[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} {mapping = [#gpu.thread<linear_dim_0>]} | |
return | |
} | |
// -----// IR Dump After CleanupBufferAllocViewPass (iree-codegen-cleanup-buffer-alloc-view) //----- // | |
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%c2 = arith.constant 2 : index | |
%c256 = arith.constant 256 : index | |
%c512 = arith.constant 512 : index | |
%c8 = arith.constant 8 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%c32_i64 = arith.constant 32 : i64 | |
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32 | |
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%37, %42} | |
memref.assume_alignment %63, 1 : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%47, %52} | |
memref.assume_alignment %64, 1 : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%57, %62} | |
memref.assume_alignment %65, 1 : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%subview = memref.subview %63[%workgroup_id_y, 0, 0, 0, 0, 0, 0] [1, %42, 8, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_0 = memref.subview %64[%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0] [1, %42, 4, 2, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_1 = memref.subview %65[%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 2, 4, 16, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%alloc = memref.alloc() : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>> | |
%alloc_2 = memref.alloc() : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>> | |
scf.forall (%arg0) in (256) { | |
%66 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0) | |
%67:3 = affine.delinearize_index %66 into (%c4, %c4, %c16) : index, index, index | |
%subview_3 = memref.subview %subview_1[0, 0, 0, %67#0, 0, %67#1, %67#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%68 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0) | |
%69:2 = affine.delinearize_index %68 into (%c4, %c16) : index, index | |
%70 = vector.transfer_read %subview_1[%c0, %c0, %c0, %67#0, %c0, %67#1, %67#2, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8x1x2x1x1x4xi32> | |
%71 = scf.for %arg1 = %c0 to %42 step %c1 iter_args(%arg2 = %70) -> (vector<1x1x8x1x2x1x1x4xi32>) { | |
gpu.barrier | |
scf.for %arg3 = %c0 to %c512 step %c256 { | |
%75 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %arg0) | |
%76:3 = affine.delinearize_index %75 into (%c8, %c4, %c16) : index, index, index | |
%subview_4 = memref.subview %subview[0, %arg1, %76#0, %76#1, %76#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_5 = memref.subview %alloc[0, 0, %76#0, %76#1, %76#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>> | |
memref.copy %subview_4, %subview_5 : memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>> | |
} {unroll_loop} | |
scf.for %arg3 = %c0 to %c512 step %c256 { | |
%75 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %arg0) | |
%76:4 = affine.delinearize_index %75 into (%c4, %c2, %c4, %c16) : index, index, index, index | |
%subview_4 = memref.subview %subview_0[0, %arg1, %76#0, %76#1, %76#2, %76#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_5 = memref.subview %alloc_2[0, 0, %76#0, %76#1, %76#2, %76#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>> | |
memref.copy %subview_4, %subview_5 : memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>> | |
} {unroll_loop} | |
gpu.barrier | |
%72 = vector.transfer_read %alloc[%c0, %c0, %c0, %69#0, %69#1, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x8x1x1x2x8xi8> | |
%73 = vector.transfer_read %alloc_2[%c0, %c0, %67#0, %c0, %67#1, %67#2, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x1x2x1x1x2x8xi8> | |
%74 = iree_gpu.multi_mma %72, %73, %arg2 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32> | |
scf.yield %74 : vector<1x1x8x1x2x1x1x4xi32> | |
} | |
vector.transfer_write %71, %subview_3[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} {mapping = [#gpu.thread<linear_dim_0>]} | |
return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%c2 = arith.constant 2 : index | |
%c256 = arith.constant 256 : index | |
%c512 = arith.constant 512 : index | |
%c8 = arith.constant 8 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%c32_i64 = arith.constant 32 : i64 | |
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32 | |
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%37, %42} | |
memref.assume_alignment %63, 1 : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%47, %52} | |
memref.assume_alignment %64, 1 : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%57, %62} | |
memref.assume_alignment %65, 1 : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%subview = memref.subview %63[%workgroup_id_y, 0, 0, 0, 0, 0, 0] [1, %42, 8, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_0 = memref.subview %64[%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0] [1, %42, 4, 2, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_1 = memref.subview %65[%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 2, 4, 16, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%alloc = memref.alloc() : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>> | |
%alloc_2 = memref.alloc() : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>> | |
scf.forall (%arg0) in (256) { | |
%66 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0) | |
%67:3 = affine.delinearize_index %66 into (%c4, %c4, %c16) : index, index, index | |
%subview_3 = memref.subview %subview_1[0, 0, 0, %67#0, 0, %67#1, %67#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%68 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0) | |
%69:2 = affine.delinearize_index %68 into (%c4, %c16) : index, index | |
%70 = vector.transfer_read %subview_1[%c0, %c0, %c0, %67#0, %c0, %67#1, %67#2, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8x1x2x1x1x4xi32> | |
%71 = scf.for %arg1 = %c0 to %42 step %c1 iter_args(%arg2 = %70) -> (vector<1x1x8x1x2x1x1x4xi32>) { | |
gpu.barrier | |
scf.for %arg3 = %c0 to %c512 step %c256 { | |
%75 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %arg0) | |
%76:3 = affine.delinearize_index %75 into (%c8, %c4, %c16) : index, index, index | |
%subview_4 = memref.subview %subview[0, %arg1, %76#0, %76#1, %76#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_5 = memref.subview %alloc[0, 0, %76#0, %76#1, %76#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>> | |
memref.copy %subview_4, %subview_5 : memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>> | |
} {unroll_loop} | |
scf.for %arg3 = %c0 to %c512 step %c256 { | |
%75 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %arg0) | |
%76:4 = affine.delinearize_index %75 into (%c4, %c2, %c4, %c16) : index, index, index, index | |
%subview_4 = memref.subview %subview_0[0, %arg1, %76#0, %76#1, %76#2, %76#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_5 = memref.subview %alloc_2[0, 0, %76#0, %76#1, %76#2, %76#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>> | |
memref.copy %subview_4, %subview_5 : memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>> | |
} {unroll_loop} | |
gpu.barrier | |
%72 = vector.transfer_read %alloc[%c0, %c0, %c0, %69#0, %69#1, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x8x1x1x2x8xi8> | |
%73 = vector.transfer_read %alloc_2[%c0, %c0, %67#0, %c0, %67#1, %67#2, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x1x2x1x1x2x8xi8> | |
%74 = iree_gpu.multi_mma %72, %73, %arg2 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32> | |
scf.yield %74 : vector<1x1x8x1x2x1x1x4xi32> | |
} | |
vector.transfer_write %71, %subview_3[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} {mapping = [#gpu.thread<linear_dim_0>]} | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
// Dispatch entry point as printed after the CSE pass. The translation_info selects the
// LLVMGPUTileAndFuse pipeline with a 256x1x1 workgroup and a subgroup size of 64.
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%c2 = arith.constant 2 : index | |
%c256 = arith.constant 256 : index | |
%c512 = arith.constant 512 : index | |
%c8 = arith.constant 8 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%c32_i64 = arith.constant 32 : i64 | |
// Load the 18 i32 constants (ordinals 0..17) declared by the pipeline layout.
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32 | |
// Reassemble each consecutive pair of i32 constants into one i64 (low | high << 32) and
// cast it to index, giving nine index values: %22, %27, %32, %37, %42, %47, %52, %57, %62.
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
// Bind the buffers. %63 and %64 are both read-only views of binding(0), at offsets %22 and
// %27; %65 is a view of binding(1) at offset %32. The remaining index values supply the two
// dynamic outer dims of each memref.
// NOTE(review): each subspan declares alignment(64), yet the assume_alignment that follows it
// only asserts 1-byte alignment — presumably because the offsets are dynamic; confirm whether
// this affects how the i8 vector loads are lowered.
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%37, %42} | |
memref.assume_alignment %63, 1 : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%47, %52} | |
memref.assume_alignment %64, 1 : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%57, %62} | |
memref.assume_alignment %65, 1 : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
// Select this workgroup's slices: workgroup id y indexes the outermost dim of %63, id x the
// outermost dim of %64, and (y, x) the two outermost dims of %65.
// NOTE(review): %subview_0 takes its dim-1 extent from %42 (the dim-1 size of %63) rather
// than %52 — presumably both are the same reduction extent; confirm.
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%subview = memref.subview %63[%workgroup_id_y, 0, 0, 0, 0, 0, 0] [1, %42, 8, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_0 = memref.subview %64[%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0] [1, %42, 4, 2, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_1 = memref.subview %65[%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 2, 4, 16, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
// Two staging buffers in the workgroup address space, each holding 8192 i8 elements.
%alloc = memref.alloc() : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>> | |
%alloc_2 = memref.alloc() : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>> | |
// 256 iterations mapped to #gpu.thread<linear_dim_0> (see the mapping attribute on the
// closing brace of this forall).
scf.forall (%arg0) in (256) { | |
// Decompose the iteration index into (4, 4, 16) coordinates used to address the output and
// %alloc_2, and (index mod 64) into (4, 16) coordinates used to address %alloc.
%66 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0) | |
%67:3 = affine.delinearize_index %66 into (%c4, %c4, %c16) : index, index, index | |
%subview_3 = memref.subview %subview_1[0, 0, 0, %67#0, 0, %67#1, %67#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%68 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0) | |
%69:2 = affine.delinearize_index %68 into (%c4, %c16) : index, index | |
// Read the initial accumulator tile from the output buffer.
%70 = vector.transfer_read %subview_1[%c0, %c0, %c0, %67#0, %c0, %67#1, %67#2, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8x1x2x1x1x4xi32> | |
// Main loop: %42 iterations over dim 1 of both inputs, carrying the i32 accumulator in %arg2.
%71 = scf.for %arg1 = %c0 to %42 step %c1 iter_args(%arg2 = %70) -> (vector<1x1x8x1x2x1x1x4xi32>) { | |
// Barrier before the staging buffers are overwritten for this iteration.
gpu.barrier | |
// Stage slice %arg1 of %subview into %alloc: each forall iteration copies a 2x8 i8 tile at
// linear indices %arg0 and %arg0 + 256 (two loop iterations, marked unroll_loop).
scf.for %arg3 = %c0 to %c512 step %c256 { | |
%75 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %arg0) | |
%76:3 = affine.delinearize_index %75 into (%c8, %c4, %c16) : index, index, index | |
%subview_4 = memref.subview %subview[0, %arg1, %76#0, %76#1, %76#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_5 = memref.subview %alloc[0, 0, %76#0, %76#1, %76#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>> | |
memref.copy %subview_4, %subview_5 : memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>> | |
} {unroll_loop} | |
// Same staging scheme for slice %arg1 of %subview_0 into %alloc_2, with the linear index
// decomposed into (4, 2, 4, 16) coordinates.
scf.for %arg3 = %c0 to %c512 step %c256 { | |
%75 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %arg0) | |
%76:4 = affine.delinearize_index %75 into (%c4, %c2, %c4, %c16) : index, index, index, index | |
%subview_4 = memref.subview %subview_0[0, %arg1, %76#0, %76#1, %76#2, %76#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_5 = memref.subview %alloc_2[0, 0, %76#0, %76#1, %76#2, %76#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>> | |
memref.copy %subview_4, %subview_5 : memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>> | |
} {unroll_loop} | |
// Barrier between the copies above and the reads of the staging buffers below.
gpu.barrier | |
// Load the two i8 operand vectors for this iteration from the staging buffers.
%72 = vector.transfer_read %alloc[%c0, %c0, %c0, %69#0, %69#1, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x8x1x1x2x8xi8> | |
%73 = vector.transfer_read %alloc_2[%c0, %c0, %67#0, %c0, %67#1, %67#2, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x1x2x1x1x2x8xi8> | |
// Accumulate into %arg2 using the data-tiled MFMA_I32_16x16x32_I8 layout
// (unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2).
%74 = iree_gpu.multi_mma %72, %73, %arg2 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32> | |
scf.yield %74 : vector<1x1x8x1x2x1x1x4xi32> | |
} | |
// Write the final accumulator back to the output slice selected by %subview_3.
vector.transfer_write %71, %subview_3[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} {mapping = [#gpu.thread<linear_dim_0>]} | |
return | |
} | |
// -----// IR Dump After GPUVerifyDistributionPass (iree-codegen-gpu-verify-distribution) //----- // | |
// Dispatch entry point as printed after GPUVerifyDistributionPass. The translation_info
// selects the LLVMGPUTileAndFuse pipeline with a 256x1x1 workgroup and a subgroup size of 64.
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%c2 = arith.constant 2 : index | |
%c256 = arith.constant 256 : index | |
%c512 = arith.constant 512 : index | |
%c8 = arith.constant 8 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%c32_i64 = arith.constant 32 : i64 | |
// Load the 18 i32 constants (ordinals 0..17) declared by the pipeline layout.
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32 | |
// Reassemble each consecutive pair of i32 constants into one i64 (low | high << 32) and
// cast it to index, giving nine index values: %22, %27, %32, %37, %42, %47, %52, %57, %62.
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
// Bind the buffers. %63 and %64 are both read-only views of binding(0), at offsets %22 and
// %27; %65 is a view of binding(1) at offset %32. The remaining index values supply the two
// dynamic outer dims of each memref.
// NOTE(review): each subspan declares alignment(64), yet the assume_alignment that follows it
// only asserts 1-byte alignment — presumably because the offsets are dynamic; confirm whether
// this affects how the i8 vector loads are lowered.
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%37, %42} | |
memref.assume_alignment %63, 1 : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%47, %52} | |
memref.assume_alignment %64, 1 : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%57, %62} | |
memref.assume_alignment %65, 1 : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
// Select this workgroup's slices: workgroup id y indexes the outermost dim of %63, id x the
// outermost dim of %64, and (y, x) the two outermost dims of %65.
// NOTE(review): %subview_0 takes its dim-1 extent from %42 (the dim-1 size of %63) rather
// than %52 — presumably both are the same reduction extent; confirm.
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%subview = memref.subview %63[%workgroup_id_y, 0, 0, 0, 0, 0, 0] [1, %42, 8, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_0 = memref.subview %64[%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0] [1, %42, 4, 2, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_1 = memref.subview %65[%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 2, 4, 16, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
// Two staging buffers in the workgroup address space, each holding 8192 i8 elements.
%alloc = memref.alloc() : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>> | |
%alloc_2 = memref.alloc() : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>> | |
// 256 iterations mapped to #gpu.thread<linear_dim_0> (see the mapping attribute on the
// closing brace of this forall).
scf.forall (%arg0) in (256) { | |
// Decompose the iteration index into (4, 4, 16) coordinates used to address the output and
// %alloc_2, and (index mod 64) into (4, 16) coordinates used to address %alloc.
%66 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%arg0) | |
%67:3 = affine.delinearize_index %66 into (%c4, %c4, %c16) : index, index, index | |
%subview_3 = memref.subview %subview_1[0, 0, 0, %67#0, 0, %67#1, %67#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%68 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%arg0) | |
%69:2 = affine.delinearize_index %68 into (%c4, %c16) : index, index | |
// Read the initial accumulator tile from the output buffer.
%70 = vector.transfer_read %subview_1[%c0, %c0, %c0, %67#0, %c0, %67#1, %67#2, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8x1x2x1x1x4xi32> | |
// Main loop: %42 iterations over dim 1 of both inputs, carrying the i32 accumulator in %arg2.
%71 = scf.for %arg1 = %c0 to %42 step %c1 iter_args(%arg2 = %70) -> (vector<1x1x8x1x2x1x1x4xi32>) { | |
// Barrier before the staging buffers are overwritten for this iteration.
gpu.barrier | |
// Stage slice %arg1 of %subview into %alloc: each forall iteration copies a 2x8 i8 tile at
// linear indices %arg0 and %arg0 + 256 (two loop iterations, marked unroll_loop).
scf.for %arg3 = %c0 to %c512 step %c256 { | |
%75 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %arg0) | |
%76:3 = affine.delinearize_index %75 into (%c8, %c4, %c16) : index, index, index | |
%subview_4 = memref.subview %subview[0, %arg1, %76#0, %76#1, %76#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_5 = memref.subview %alloc[0, 0, %76#0, %76#1, %76#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>> | |
memref.copy %subview_4, %subview_5 : memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>> | |
} {unroll_loop} | |
// Same staging scheme for slice %arg1 of %subview_0 into %alloc_2, with the linear index
// decomposed into (4, 2, 4, 16) coordinates.
scf.for %arg3 = %c0 to %c512 step %c256 { | |
%75 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %arg0) | |
%76:4 = affine.delinearize_index %75 into (%c4, %c2, %c4, %c16) : index, index, index, index | |
%subview_4 = memref.subview %subview_0[0, %arg1, %76#0, %76#1, %76#2, %76#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_5 = memref.subview %alloc_2[0, 0, %76#0, %76#1, %76#2, %76#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>> | |
memref.copy %subview_4, %subview_5 : memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>> | |
} {unroll_loop} | |
// Barrier between the copies above and the reads of the staging buffers below.
gpu.barrier | |
// Load the two i8 operand vectors for this iteration from the staging buffers.
%72 = vector.transfer_read %alloc[%c0, %c0, %c0, %69#0, %69#1, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x8x1x1x2x8xi8> | |
%73 = vector.transfer_read %alloc_2[%c0, %c0, %67#0, %c0, %67#1, %67#2, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x1x2x1x1x2x8xi8> | |
// Accumulate into %arg2 using the data-tiled MFMA_I32_16x16x32_I8 layout
// (unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2).
%74 = iree_gpu.multi_mma %72, %73, %arg2 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32> | |
scf.yield %74 : vector<1x1x8x1x2x1x1x4xi32> | |
} | |
// Write the final accumulator back to the output slice selected by %subview_3.
vector.transfer_write %71, %subview_3[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} {mapping = [#gpu.thread<linear_dim_0>]} | |
return | |
} | |
// -----// IR Dump After GPUDistributeForallPass (iree-codegen-gpu-distribute-forall) //----- // | |
// @foo_dispatch_6 as printed after iree-codegen-gpu-distribute-forall.
// Outline of what the ops below do:
//   1. load 18 i32 constants and combine them pairwise into 9 index values;
//   2. create three buffer views (two i8 views on binding(0), one i32 view on binding(1));
//   3. slice them by workgroup id and allocate two workgroup-address-space staging buffers;
//   4. per thread: read an i32 accumulator vector, loop %43 times staging i8 data into the
//      workgroup buffers and accumulating with iree_gpu.multi_mma, then write the result back.
// The translation_info attribute records workgroup_size = [256, 1, 1] and subgroup_size = 64.
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} { | |
%thread_id_x = gpu.thread_id x | |
%thread_id_y = gpu.thread_id y | |
%thread_id_z = gpu.thread_id z | |
// Linearized thread id (x + 256*y + 256*z). %0 is not referenced again in this function;
// the loop below recomputes the same expression (plus its induction variable) as %67.
%0 = affine.apply affine_map<()[s0, s1, s2] -> (s0 + s1 * 256 + s2 * 256)>()[%thread_id_x, %thread_id_y, %thread_id_z] | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%c2 = arith.constant 2 : index | |
%c256 = arith.constant 256 : index | |
%c512 = arith.constant 512 : index | |
%c8 = arith.constant 8 : index | |
%c16 = arith.constant 16 : index | |
%c4 = arith.constant 4 : index | |
%c0 = arith.constant 0 : index | |
%c1 = arith.constant 1 : index | |
%c32_i64 = arith.constant 32 : i64 | |
// Load the 18 i32 constants (ordinals 0..17) declared by the pipeline layout.
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32 | |
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32 | |
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32 | |
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32 | |
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32 | |
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32 | |
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32 | |
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32 | |
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32 | |
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32 | |
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32 | |
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32 | |
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32 | |
%18 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32 | |
// Combine each consecutive pair of constants into one index value:
//   zext(lo) | (zext(hi) << 32), then index_castui.
// Results: %23, %28, %33 (used as subspan offsets) and %38, %43, %48, %53, %58, %63
// (used as dynamic sizes of the subspans below).
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.extui %2 : i32 to i64 | |
%21 = arith.shli %20, %c32_i64 : i64 | |
%22 = arith.ori %19, %21 : i64 | |
%23 = arith.index_castui %22 : i64 to index | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.extui %4 : i32 to i64 | |
%26 = arith.shli %25, %c32_i64 : i64 | |
%27 = arith.ori %24, %26 : i64 | |
%28 = arith.index_castui %27 : i64 to index | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.extui %6 : i32 to i64 | |
%31 = arith.shli %30, %c32_i64 : i64 | |
%32 = arith.ori %29, %31 : i64 | |
%33 = arith.index_castui %32 : i64 to index | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.extui %8 : i32 to i64 | |
%36 = arith.shli %35, %c32_i64 : i64 | |
%37 = arith.ori %34, %36 : i64 | |
%38 = arith.index_castui %37 : i64 to index | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.extui %10 : i32 to i64 | |
%41 = arith.shli %40, %c32_i64 : i64 | |
%42 = arith.ori %39, %41 : i64 | |
%43 = arith.index_castui %42 : i64 to index | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.extui %12 : i32 to i64 | |
%46 = arith.shli %45, %c32_i64 : i64 | |
%47 = arith.ori %44, %46 : i64 | |
%48 = arith.index_castui %47 : i64 to index | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.extui %14 : i32 to i64 | |
%51 = arith.shli %50, %c32_i64 : i64 | |
%52 = arith.ori %49, %51 : i64 | |
%53 = arith.index_castui %52 : i64 to index | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.extui %16 : i32 to i64 | |
%56 = arith.shli %55, %c32_i64 : i64 | |
%57 = arith.ori %54, %56 : i64 | |
%58 = arith.index_castui %57 : i64 to index | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.extui %18 : i32 to i64 | |
%61 = arith.shli %60, %c32_i64 : i64 | |
%62 = arith.ori %59, %61 : i64 | |
%63 = arith.index_castui %62 : i64 to index | |
// Buffer views. %64 and %65 are both on binding(0) (flagged ReadOnly) with different offsets
// and different i8 tile shapes; %66 is the i32 view on binding(1).
// NOTE(review): each subspan declares alignment(64) but the following memref.assume_alignment
// uses 1. Whether this affects how the 8xi8 accesses are lowered later is not visible here — confirm.
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%23) flags("ReadOnly|Indirect") : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%38, %43} | |
memref.assume_alignment %64, 1 : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%28) flags("ReadOnly|Indirect") : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%48, %53} | |
memref.assume_alignment %65, 1 : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%66 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%33) flags(Indirect) : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%58, %63} | |
memref.assume_alignment %66, 1 : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
// Per-workgroup slices: %subview takes row workgroup_id_y of %64, %subview_0 takes row
// workgroup_id_x of %65 (both with second-dimension size %43), and %subview_1 takes the
// single [workgroup_id_y, workgroup_id_x] tile of %66.
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%subview = memref.subview %64[%workgroup_id_y, 0, 0, 0, 0, 0, 0] [1, %43, 8, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_0 = memref.subview %65[%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0] [1, %43, 4, 2, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_1 = memref.subview %66[%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 2, 4, 16, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
// Two staging buffers in the workgroup address space, 8192 i8 elements each
// (8*4*16*2*8 and 4*2*4*16*2*8).
%alloc = memref.alloc() : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>> | |
%alloc_2 = memref.alloc() : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>> | |
// Duplicate index constants (256, 0, 256, 256) used only for the bounds of the loop below
// and the delinearize basis.
%c256_3 = arith.constant 256 : index | |
%c0_4 = arith.constant 0 : index | |
%c256_5 = arith.constant 256 : index | |
%c256_6 = arith.constant 256 : index | |
// Loop from 0 to 256 with step 256: exactly one iteration, with %arg0 = 0.
scf.for %arg0 = %c0_4 to %c256_5 step %c256_6 { | |
// %67 is the linear thread id plus %arg0. %68 delinearizes it over the single basis (256).
// %70 = (%68 mod 256) split over (4, 4, 16); %72 (below) = (%68 mod 64) split over (4, 16).
%67 = affine.apply affine_map<(d0)[s0, s1, s2] -> (d0 + s0 + s1 * 256 + s2 * 256)>(%arg0)[%thread_id_x, %thread_id_y, %thread_id_z] | |
%68 = affine.delinearize_index %67 into (%c256_3) : index | |
%69 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%68) | |
%70:3 = affine.delinearize_index %69 into (%c4, %c4, %c16) : index, index, index | |
// Thread-dependent 1x1x8x1x2x1x1x4 slice of the i32 tile; this is the destination of the
// final transfer_write.
%subview_7 = memref.subview %subview_1[0, 0, 0, %70#0, 0, %70#1, %70#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%71 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%68) | |
%72:2 = affine.delinearize_index %71 into (%c4, %c16) : index, index | |
// Initial accumulator: read from %subview_1 at the same offsets used for %subview_7.
%73 = vector.transfer_read %subview_1[%c0, %c0, %c0, %70#0, %c0, %70#1, %70#2, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8x1x2x1x1x4xi32> | |
// Accumulation loop: %arg1 runs from 0 to %43 and carries the accumulator in %arg2.
%74 = scf.for %arg1 = %c0 to %43 step %c1 iter_args(%arg2 = %73) -> (vector<1x1x8x1x2x1x1x4xi32>) { | |
gpu.barrier | |
// Copy into %alloc: two iterations (%arg3 = 0 and 256, tagged unroll_loop). Each iteration
// copies one 2x8 i8 slice of %subview (at index %arg1 in its second dimension) to the same
// (8, 4, 16) coordinates in %alloc.
scf.for %arg3 = %c0 to %c512 step %c256 { | |
%78 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %68) | |
%79:3 = affine.delinearize_index %78 into (%c8, %c4, %c16) : index, index, index | |
%subview_8 = memref.subview %subview[0, %arg1, %79#0, %79#1, %79#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_9 = memref.subview %alloc[0, 0, %79#0, %79#1, %79#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>> | |
memref.copy %subview_8, %subview_9 : memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>> | |
} {unroll_loop} | |
// Copy into %alloc_2: same scheme, with the index split over (4, 2, 4, 16) and the source
// taken from %subview_0.
scf.for %arg3 = %c0 to %c512 step %c256 { | |
%78 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %68) | |
%79:4 = affine.delinearize_index %78 into (%c4, %c2, %c4, %c16) : index, index, index, index | |
%subview_8 = memref.subview %subview_0[0, %arg1, %79#0, %79#1, %79#2, %79#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_9 = memref.subview %alloc_2[0, 0, %79#0, %79#1, %79#2, %79#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>> | |
memref.copy %subview_8, %subview_9 : memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>> | |
} {unroll_loop} | |
gpu.barrier | |
// After the barrier, read this thread's operands back from the staging buffers:
// %75 from %alloc at (%72#0, %72#1) and %76 from %alloc_2 at (%70#0, %70#1, %70#2); the
// innermost 2x8 i8 dimensions are read whole. Then accumulate into %arg2 with multi_mma
// (intrinsic MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4,
// unroll_k = 2).
%75 = vector.transfer_read %alloc[%c0, %c0, %c0, %72#0, %72#1, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x8x1x1x2x8xi8> | |
%76 = vector.transfer_read %alloc_2[%c0, %c0, %70#0, %c0, %70#1, %70#2, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x1x2x1x1x2x8xi8> | |
%77 = iree_gpu.multi_mma %75, %76, %arg2 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32> | |
scf.yield %77 : vector<1x1x8x1x2x1x1x4xi32> | |
} | |
// Store the final accumulator to this thread's slice of the i32 tile.
vector.transfer_write %74, %subview_7[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} | |
return | |
} | |
// -----// IR Dump After VectorizeMemrefCopyPass (iree-codegen-vectorize-memref-copy) //----- // | |
// @foo_dispatch_6 as printed after iree-codegen-vectorize-memref-copy.
// Outline of what the ops below do:
//   1. load 18 i32 constants and combine them pairwise into 9 index values;
//   2. create three buffer views (two i8 views on binding(0), one i32 view on binding(1));
//   3. slice them by workgroup id and allocate two workgroup-address-space staging buffers;
//   4. per thread: read an i32 accumulator vector, loop %42 times staging i8 data into the
//      workgroup buffers and accumulating with iree_gpu.multi_mma, then write the result back.
// In this dump the staging copies are vector.transfer_read / vector.transfer_write pairs on
// 2x8 i8 vectors.
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} { | |
%c32_i64 = arith.constant 32 : i64 | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%c4 = arith.constant 4 : index | |
%c16 = arith.constant 16 : index | |
%c8 = arith.constant 8 : index | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c2 = arith.constant 2 : index | |
%c0_i8 = arith.constant 0 : i8 | |
%c0_i32 = arith.constant 0 : i32 | |
%thread_id_x = gpu.thread_id x | |
%thread_id_y = gpu.thread_id y | |
%thread_id_z = gpu.thread_id z | |
// Load the 18 i32 constants (ordinals 0..17) declared by the pipeline layout.
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32 | |
// Combine each consecutive pair of constants into one index value:
//   zext(lo) | (zext(hi) << 32), then index_castui.
// Results: %22, %27, %32 (used as subspan offsets) and %37, %42, %47, %52, %57, %62
// (used as dynamic sizes of the subspans below).
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
// Buffer views. %63 and %64 are both on binding(0) (flagged ReadOnly) with different offsets
// and different i8 tile shapes; %65 is the i32 view on binding(1).
// NOTE(review): each subspan declares alignment(64) but the following memref.assume_alignment
// uses 1. Whether this affects how the 8xi8 accesses are lowered later is not visible here — confirm.
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%37, %42} | |
memref.assume_alignment %63, 1 : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%47, %52} | |
memref.assume_alignment %64, 1 : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%57, %62} | |
memref.assume_alignment %65, 1 : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
// Per-workgroup slices: %subview takes row workgroup_id_y of %63, %subview_0 takes row
// workgroup_id_x of %64 (both with second-dimension size %42), and %subview_1 takes the
// single [workgroup_id_y, workgroup_id_x] tile of %65.
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%subview = memref.subview %63[%workgroup_id_y, 0, 0, 0, 0, 0, 0] [1, %42, 8, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_0 = memref.subview %64[%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0] [1, %42, 4, 2, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_1 = memref.subview %65[%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 2, 4, 16, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
// Two staging buffers in the workgroup address space, 8192 i8 elements each
// (8*4*16*2*8 and 4*2*4*16*2*8).
%alloc = memref.alloc() : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>> | |
%alloc_2 = memref.alloc() : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>> | |
// Loop from 0 to 256 with step 256: exactly one iteration, with %arg0 = 0.
scf.for %arg0 = %c0 to %c256 step %c256 { | |
// %66 is x + 256*y + 256*z of the thread id plus %arg0. %67 delinearizes it over the single
// basis (256). %69 = (%67 mod 256) split over (4, 4, 16); %71 (below) = (%67 mod 64) split
// over (4, 16).
%66 = affine.apply affine_map<(d0)[s0, s1, s2] -> (d0 + s0 + s1 * 256 + s2 * 256)>(%arg0)[%thread_id_x, %thread_id_y, %thread_id_z] | |
%67 = affine.delinearize_index %66 into (%c256) : index | |
%68 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%67) | |
%69:3 = affine.delinearize_index %68 into (%c4, %c4, %c16) : index, index, index | |
// Thread-dependent 1x1x8x1x2x1x1x4 slice of the i32 tile; this is the destination of the
// final transfer_write.
%subview_3 = memref.subview %subview_1[0, 0, 0, %69#0, 0, %69#1, %69#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%70 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%67) | |
%71:2 = affine.delinearize_index %70 into (%c4, %c16) : index, index | |
// Initial accumulator: read from %subview_1 at the same offsets used for %subview_3.
%72 = vector.transfer_read %subview_1[%c0, %c0, %c0, %69#0, %c0, %69#1, %69#2, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8x1x2x1x1x4xi32> | |
// Accumulation loop: %arg1 runs from 0 to %42 and carries the accumulator in %arg2.
%73 = scf.for %arg1 = %c0 to %42 step %c1 iter_args(%arg2 = %72) -> (vector<1x1x8x1x2x1x1x4xi32>) { | |
gpu.barrier | |
// Stage into %alloc: two iterations (%arg3 = 0 and 256, tagged unroll_loop). Each iteration
// reads one 2x8 i8 slice of %subview (at index %arg1 in its second dimension) as a vector
// and writes it to the same (8, 4, 16) coordinates in %alloc.
scf.for %arg3 = %c0 to %c512 step %c256 { | |
%77 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %67) | |
%78:3 = affine.delinearize_index %77 into (%c8, %c4, %c16) : index, index, index | |
%subview_4 = memref.subview %subview[0, %arg1, %78#0, %78#1, %78#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_5 = memref.subview %alloc[0, 0, %78#0, %78#1, %78#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>> | |
%79 = vector.transfer_read %subview_4[%c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x1x1x1x2x8xi8> | |
vector.transfer_write %79, %subview_5[%c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true]} : vector<1x1x1x1x1x2x8xi8>, memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>> | |
} {unroll_loop} | |
// Stage into %alloc_2: same scheme, with the index split over (4, 2, 4, 16) and the source
// taken from %subview_0.
scf.for %arg3 = %c0 to %c512 step %c256 { | |
%77 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %67) | |
%78:4 = affine.delinearize_index %77 into (%c4, %c2, %c4, %c16) : index, index, index, index | |
%subview_4 = memref.subview %subview_0[0, %arg1, %78#0, %78#1, %78#2, %78#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_5 = memref.subview %alloc_2[0, 0, %78#0, %78#1, %78#2, %78#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>> | |
%79 = vector.transfer_read %subview_4[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x1x1x1x1x2x8xi8> | |
vector.transfer_write %79, %subview_5[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x1x1x1x1x2x8xi8>, memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>> | |
} {unroll_loop} | |
gpu.barrier | |
// After the barrier, read this thread's operands back from the staging buffers:
// %74 from %alloc at (%71#0, %71#1) and %75 from %alloc_2 at (%69#0, %69#1, %69#2); the
// innermost 2x8 i8 dimensions are read whole. Then accumulate into %arg2 with multi_mma
// (intrinsic MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4,
// unroll_k = 2).
%74 = vector.transfer_read %alloc[%c0, %c0, %c0, %71#0, %71#1, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x8x1x1x2x8xi8> | |
%75 = vector.transfer_read %alloc_2[%c0, %c0, %69#0, %c0, %69#1, %69#2, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x1x2x1x1x2x8xi8> | |
%76 = iree_gpu.multi_mma %74, %75, %arg2 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<1x1x8x1x1x2x8xi8>, vector<1x1x1x2x1x1x2x8xi8> into vector<1x1x8x1x2x1x1x4xi32> | |
scf.yield %76 : vector<1x1x8x1x2x1x1x4xi32> | |
} | |
// Store the final accumulator to this thread's slice of the i32 tile.
vector.transfer_write %73, %subview_3[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} | |
return | |
} | |
// -----// IR Dump After UnrollToIntrinsicsPass (iree-gpu-unroll-to-intrinsics) //----- // | |
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} { | |
%c32_i64 = arith.constant 32 : i64 | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%c4 = arith.constant 4 : index | |
%c16 = arith.constant 16 : index | |
%c8 = arith.constant 8 : index | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c2 = arith.constant 2 : index | |
%c0_i8 = arith.constant 0 : i8 | |
%c0_i32 = arith.constant 0 : i32 | |
%thread_id_x = gpu.thread_id x | |
%thread_id_y = gpu.thread_id y | |
%thread_id_z = gpu.thread_id z | |
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32 | |
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%37, %42} | |
memref.assume_alignment %63, 1 : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%47, %52} | |
memref.assume_alignment %64, 1 : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%57, %62} | |
memref.assume_alignment %65, 1 : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%subview = memref.subview %63[%workgroup_id_y, 0, 0, 0, 0, 0, 0] [1, %42, 8, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_0 = memref.subview %64[%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0] [1, %42, 4, 2, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_1 = memref.subview %65[%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 2, 4, 16, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%alloc = memref.alloc() : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>> | |
%alloc_2 = memref.alloc() : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>> | |
scf.for %arg0 = %c0 to %c256 step %c256 { | |
%66 = affine.apply affine_map<(d0)[s0, s1, s2] -> (d0 + s0 + s1 * 256 + s2 * 256)>(%arg0)[%thread_id_x, %thread_id_y, %thread_id_z] | |
%67 = affine.delinearize_index %66 into (%c256) : index | |
%68 = affine.apply affine_map<(d0) -> (d0 mod 256)>(%67) | |
%69:3 = affine.delinearize_index %68 into (%c4, %c4, %c16) : index, index, index | |
%subview_3 = memref.subview %subview_1[0, 0, 0, %69#0, 0, %69#1, %69#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%70 = affine.apply affine_map<(d0) -> (d0 mod 64)>(%67) | |
%71:2 = affine.delinearize_index %70 into (%c4, %c16) : index, index | |
%72 = vector.transfer_read %subview_1[%c0, %c0, %c0, %69#0, %c0, %69#1, %69#2, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8x1x2x1x1x4xi32> | |
%73 = scf.for %arg1 = %c0 to %42 step %c1 iter_args(%arg2 = %72) -> (vector<1x1x8x1x2x1x1x4xi32>) { | |
gpu.barrier | |
scf.for %arg3 = %c0 to %c512 step %c256 { | |
%81 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %67) | |
%82:3 = affine.delinearize_index %81 into (%c8, %c4, %c16) : index, index, index | |
%subview_4 = memref.subview %subview[0, %arg1, %82#0, %82#1, %82#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_5 = memref.subview %alloc[0, 0, %82#0, %82#1, %82#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>> | |
%83 = vector.transfer_read %subview_4[%c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x1x1x1x2x8xi8> | |
vector.transfer_write %83, %subview_5[%c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true]} : vector<1x1x1x1x1x2x8xi8>, memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>> | |
} {unroll_loop} | |
scf.for %arg3 = %c0 to %c512 step %c256 { | |
%81 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg3, %67) | |
%82:4 = affine.delinearize_index %81 into (%c4, %c2, %c4, %c16) : index, index, index, index | |
%subview_4 = memref.subview %subview_0[0, %arg1, %82#0, %82#1, %82#2, %82#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_5 = memref.subview %alloc_2[0, 0, %82#0, %82#1, %82#2, %82#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>> | |
%83 = vector.transfer_read %subview_4[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x1x1x1x1x2x8xi8> | |
vector.transfer_write %83, %subview_5[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x1x1x1x1x2x8xi8>, memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>> | |
} {unroll_loop} | |
gpu.barrier | |
%74 = vector.transfer_read %alloc[%c0, %c0, %c0, %71#0, %71#1, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x8x1x1x2x8xi8> | |
%75 = vector.transfer_read %alloc_2[%c0, %c0, %69#0, %c0, %69#1, %69#2, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x1x2x1x1x2x8xi8> | |
%76 = vector.extract %74[0, 0] : vector<8x1x1x2x8xi8> from vector<1x1x8x1x1x2x8xi8> | |
%77 = vector.extract %75[0, 0] : vector<1x2x1x1x2x8xi8> from vector<1x1x1x2x1x1x2x8xi8> | |
%78 = vector.extract %arg2[0, 0] : vector<8x1x2x1x1x4xi32> from vector<1x1x8x1x2x1x1x4xi32> | |
%79 = iree_gpu.multi_mma %76, %77, %78 {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = [], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<8x1x1x2x8xi8>, vector<1x2x1x1x2x8xi8> into vector<8x1x2x1x1x4xi32> | |
%80 = vector.broadcast %79 : vector<8x1x2x1x1x4xi32> to vector<1x1x8x1x2x1x1x4xi32> | |
scf.yield %80 : vector<1x1x8x1x2x1x1x4xi32> | |
} | |
vector.transfer_write %73, %subview_3[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} | |
return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} { | |
%c32_i64 = arith.constant 32 : i64 | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%c4 = arith.constant 4 : index | |
%c16 = arith.constant 16 : index | |
%c8 = arith.constant 8 : index | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c2 = arith.constant 2 : index | |
%c0_i8 = arith.constant 0 : i8 | |
%c0_i32 = arith.constant 0 : i32 | |
%thread_id_x = gpu.thread_id x | |
%thread_id_y = gpu.thread_id y | |
%thread_id_z = gpu.thread_id z | |
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32 | |
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%37, %42} | |
memref.assume_alignment %63, 1 : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%47, %52} | |
memref.assume_alignment %64, 1 : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%57, %62} | |
memref.assume_alignment %65, 1 : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%subview = memref.subview %63[%workgroup_id_y, 0, 0, 0, 0, 0, 0] [1, %42, 8, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_0 = memref.subview %64[%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0] [1, %42, 4, 2, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_1 = memref.subview %65[%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 2, 4, 16, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%alloc = memref.alloc() : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>> | |
%alloc_2 = memref.alloc() : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>> | |
%66 = affine.apply affine_map<()[s0, s1, s2] -> (s0 + s1 * 256 + s2 * 256)>()[%thread_id_x, %thread_id_y, %thread_id_z] | |
%67 = affine.delinearize_index %66 into (%c256) : index | |
%68 = affine.apply affine_map<()[s0] -> (s0 mod 256)>()[%67] | |
%69:3 = affine.delinearize_index %68 into (%c4, %c4, %c16) : index, index, index | |
%subview_3 = memref.subview %subview_1[0, 0, 0, %69#0, 0, %69#1, %69#2, 0] [1, 1, 8, 1, 2, 1, 1, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%70 = affine.apply affine_map<()[s0] -> (s0 mod 64)>()[%67] | |
%71:2 = affine.delinearize_index %70 into (%c4, %c16) : index, index | |
%72 = vector.transfer_read %subview_1[%c0, %c0, %c0, %69#0, %c0, %69#1, %69#2, %c0], %c0_i32 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x8x1x2x1x1x4xi32> | |
%73 = scf.for %arg0 = %c0 to %42 step %c1 iter_args(%arg1 = %72) -> (vector<1x1x8x1x2x1x1x4xi32>) { | |
gpu.barrier | |
scf.for %arg2 = %c0 to %c512 step %c256 { | |
%81 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%arg2)[%67] | |
%82:3 = affine.delinearize_index %81 into (%c8, %c4, %c16) : index, index, index | |
%subview_4 = memref.subview %subview[0, %arg0, %82#0, %82#1, %82#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_5 = memref.subview %alloc[0, 0, %82#0, %82#1, %82#2, 0, 0] [1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>> | |
%83 = vector.transfer_read %subview_4[%c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : memref<1x1x1x1x1x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x1x1x1x2x8xi8> | |
vector.transfer_write %83, %subview_5[%c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true]} : vector<1x1x1x1x1x2x8xi8>, memref<1x1x1x1x1x2x8xi8, strided<[8192, 8192, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>> | |
} {unroll_loop} | |
scf.for %arg2 = %c0 to %c512 step %c256 { | |
%81 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%arg2)[%67] | |
%82:4 = affine.delinearize_index %81 into (%c4, %c2, %c4, %c16) : index, index, index, index | |
%subview_4 = memref.subview %subview_0[0, %arg0, %82#0, %82#1, %82#2, %82#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_5 = memref.subview %alloc_2[0, 0, %82#0, %82#1, %82#2, %82#3, 0, 0] [1, 1, 1, 1, 1, 1, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>> to memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>> | |
%83 = vector.transfer_read %subview_4[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x1x1x1x1x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x1x1x1x1x1x2x8xi8> | |
vector.transfer_write %83, %subview_5[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x1x1x1x1x2x8xi8>, memref<1x1x1x1x1x1x2x8xi8, strided<[8192, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #gpu.address_space<workgroup>> | |
} {unroll_loop} | |
gpu.barrier | |
%74 = vector.transfer_read %alloc[%c0, %c0, %c0, %71#0, %71#1, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true]} : memref<1x1x8x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x8x1x1x2x8xi8> | |
%75 = vector.transfer_read %alloc_2[%c0, %c0, %69#0, %c0, %69#1, %69#2, %c0, %c0], %c0_i8 {in_bounds = [true, true, true, true, true, true, true, true]} : memref<1x1x4x2x4x16x2x8xi8, #gpu.address_space<workgroup>>, vector<1x1x1x2x1x1x2x8xi8> | |
%76 = vector.extract %74[0, 0] : vector<8x1x1x2x8xi8> from vector<1x1x8x1x1x2x8xi8> | |
%77 = vector.extract %75[0, 0] : vector<1x2x1x1x2x8xi8> from vector<1x1x1x2x1x1x2x8xi8> | |
%78 = vector.extract %arg1[0, 0] : vector<8x1x2x1x1x4xi32> from vector<1x1x8x1x2x1x1x4xi32> | |
%79 = iree_gpu.multi_mma %76, %77, %78 {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>, affine_map<() -> ()>], iterator_types = [], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : vector<8x1x1x2x8xi8>, vector<1x2x1x1x2x8xi8> into vector<8x1x2x1x1x4xi32> | |
%80 = vector.broadcast %79 : vector<8x1x2x1x1x4xi32> to vector<1x1x8x1x2x1x1x4xi32> | |
scf.yield %80 : vector<1x1x8x1x2x1x1x4xi32> | |
} | |
vector.transfer_write %73, %subview_3[%c0, %c0, %c0, %c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true, true, true, true]} : vector<1x1x8x1x2x1x1x4xi32>, memref<1x1x8x1x2x1x1x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
func.func @foo_dispatch_6() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>}>} { | |
%c32_i64 = arith.constant 32 : i64 | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%c4 = arith.constant 4 : index | |
%c16 = arith.constant 16 : index | |
%c8 = arith.constant 8 : index | |
%c512 = arith.constant 512 : index | |
%c256 = arith.constant 256 : index | |
%c2 = arith.constant 2 : index | |
%c0_i8 = arith.constant 0 : i8 | |
%c0_i32 = arith.constant 0 : i32 | |
%thread_id_x = gpu.thread_id x | |
%thread_id_y = gpu.thread_id y | |
%thread_id_z = gpu.thread_id z | |
%0 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 | |
%1 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32 | |
%2 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32 | |
%3 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32 | |
%4 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32 | |
%5 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32 | |
%6 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32 | |
%7 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32 | |
%8 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32 | |
%9 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32 | |
%10 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(10) : i32 | |
%11 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(11) : i32 | |
%12 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(12) : i32 | |
%13 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(13) : i32 | |
%14 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(14) : i32 | |
%15 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(15) : i32 | |
%16 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(16) : i32 | |
%17 = hal.interface.constant.load layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(17) : i32 | |
%18 = arith.extui %0 : i32 to i64 | |
%19 = arith.extui %1 : i32 to i64 | |
%20 = arith.shli %19, %c32_i64 : i64 | |
%21 = arith.ori %18, %20 : i64 | |
%22 = arith.index_castui %21 : i64 to index | |
%23 = arith.extui %2 : i32 to i64 | |
%24 = arith.extui %3 : i32 to i64 | |
%25 = arith.shli %24, %c32_i64 : i64 | |
%26 = arith.ori %23, %25 : i64 | |
%27 = arith.index_castui %26 : i64 to index | |
%28 = arith.extui %4 : i32 to i64 | |
%29 = arith.extui %5 : i32 to i64 | |
%30 = arith.shli %29, %c32_i64 : i64 | |
%31 = arith.ori %28, %30 : i64 | |
%32 = arith.index_castui %31 : i64 to index | |
%33 = arith.extui %6 : i32 to i64 | |
%34 = arith.extui %7 : i32 to i64 | |
%35 = arith.shli %34, %c32_i64 : i64 | |
%36 = arith.ori %33, %35 : i64 | |
%37 = arith.index_castui %36 : i64 to index | |
%38 = arith.extui %8 : i32 to i64 | |
%39 = arith.extui %9 : i32 to i64 | |
%40 = arith.shli %39, %c32_i64 : i64 | |
%41 = arith.ori %38, %40 : i64 | |
%42 = arith.index_castui %41 : i64 to index | |
%43 = arith.extui %10 : i32 to i64 | |
%44 = arith.extui %11 : i32 to i64 | |
%45 = arith.shli %44, %c32_i64 : i64 | |
%46 = arith.ori %43, %45 : i64 | |
%47 = arith.index_castui %46 : i64 to index | |
%48 = arith.extui %12 : i32 to i64 | |
%49 = arith.extui %13 : i32 to i64 | |
%50 = arith.shli %49, %c32_i64 : i64 | |
%51 = arith.ori %48, %50 : i64 | |
%52 = arith.index_castui %51 : i64 to index | |
%53 = arith.extui %14 : i32 to i64 | |
%54 = arith.extui %15 : i32 to i64 | |
%55 = arith.shli %54, %c32_i64 : i64 | |
%56 = arith.ori %53, %55 : i64 | |
%57 = arith.index_castui %56 : i64 to index | |
%58 = arith.extui %16 : i32 to i64 | |
%59 = arith.extui %17 : i32 to i64 | |
%60 = arith.shli %59, %c32_i64 : i64 | |
%61 = arith.ori %58, %60 : i64 | |
%62 = arith.index_castui %61 : i64 to index | |
%63 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%22) flags("ReadOnly|Indirect") : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%37, %42} | |
memref.assume_alignment %63, 1 : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%64 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%27) flags("ReadOnly|Indirect") : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%47, %52} | |
memref.assume_alignment %64, 1 : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%65 = hal.interface.binding.subspan layout(<constants = 18, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%32) flags(Indirect) : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%57, %62} | |
memref.assume_alignment %65, 1 : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%workgroup_id_x = hal.interface.workgroup.id[0] : index | |
%workgroup_id_y = hal.interface.workgroup.id[1] : index | |
%subview = memref.subview %63[%workgroup_id_y, 0, 0, 0, 0, 0, 0] [1, %42, 8, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x8x4x16x2x8xi8, strided<[?, 8192, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_0 = memref.subview %64[%workgroup_id_x, 0, 0, 0, 0, 0, 0, 0] [1, %42, 4, 2, 4, 16, 2, 8] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x?x4x2x4x16x2x8xi8, strided<[?, 8192, 2048, 1024, 256, 16, 8, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_1 = memref.subview %65[%workgroup_id_y, %workgroup_id_x, 0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 2, 4, 16, 4] [1, 1, 1, 1, 1, 1, 1, 1] : memref<?x?x8x4x2x4x16x4xi32, strided<[?, 16384, 2048, 512, 256, 64, 4, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x1x8x4x2x4x |
View raw
(Sorry about that, but we can’t show files that are this big right now.)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment