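This is the full `--mlir-print-ir-after-all` trace of IREE compiling a dynamically shaped i8 x i8 -> i32 linalg.matmul for a gfx942 (MI300-class) GPU through the HIP/ROCm target. As a rough guide to reproducing a trace like this one, here is a minimal sketch using the IREE compiler Python bindings; the target flag spellings are assumptions that have drifted across IREE releases, so verify them against `iree-compile --help` for your build.

# Reproduction sketch (assumptions: the "rocm" backend name and the
# --iree-hip-target flag spelling match your IREE release).
from iree.compiler import compile_str

MATMUL_MLIR = """
func.func @foo(%arg0: tensor<?x?xi8>, %arg1: tensor<?x?xi8>,
               %arg2: tensor<?x?xi32>) -> tensor<?x?xi32> {
  %0 = linalg.matmul ins(%arg0, %arg1 : tensor<?x?xi8>, tensor<?x?xi8>)
                     outs(%arg2 : tensor<?x?xi32>) -> tensor<?x?xi32>
  return %0 : tensor<?x?xi32>
}
"""

vmfb = compile_str(
    MATMUL_MLIR,
    target_backends=["rocm"],         # HIP/ROCm HAL target, as in this trace
    extra_args=[
        "--iree-hip-target=gfx942",   # assumed spelling for the MI300 arch
        "--mlir-print-ir-after-all",  # emits dumps like the ones below (stderr)
    ],
)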
// -----// IR Dump After AutoInputConversionPipelinePass (iree-auto-input-conversion) //----- //
module {
  func.func @foo(%arg0: tensor<?x?xi8>, %arg1: tensor<?x?xi8>, %arg2: tensor<?x?xi32>) -> tensor<?x?xi32> {
    %0 = linalg.matmul ins(%arg0, %arg1 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%arg2 : tensor<?x?xi32>) -> tensor<?x?xi32>
    return %0 : tensor<?x?xi32>
  }
}
// -----// IR Dump After IREEImportPublicPass (iree-import-public) //----- //
module {
  util.func public @foo(%arg0: tensor<?x?xi8>, %arg1: tensor<?x?xi8>, %arg2: tensor<?x?xi32>) -> tensor<?x?xi32> {
    %0 = linalg.matmul ins(%arg0, %arg1 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%arg2 : tensor<?x?xi32>) -> tensor<?x?xi32>
    util.return %0 : tensor<?x?xi32>
  }
}
// -----// IR Dump After ImportMLProgramPass (iree-import-ml-program) //----- //
module {
  util.func public @foo(%arg0: tensor<?x?xi8>, %arg1: tensor<?x?xi8>, %arg2: tensor<?x?xi32>) -> tensor<?x?xi32> {
    %0 = linalg.matmul ins(%arg0, %arg1 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%arg2 : tensor<?x?xi32>) -> tensor<?x?xi32>
    util.return %0 : tensor<?x?xi32>
  }
}
// -----// IR Dump After SanitizeModuleNamesPass (iree-sanitize-module-names) //----- //
module {
  util.func public @foo(%arg0: tensor<?x?xi8>, %arg1: tensor<?x?xi8>, %arg2: tensor<?x?xi32>) -> tensor<?x?xi32> {
    %0 = linalg.matmul ins(%arg0, %arg1 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%arg2 : tensor<?x?xi32>) -> tensor<?x?xi32>
    util.return %0 : tensor<?x?xi32>
  }
}
// -----// IR Dump After ConvertMeshToFlowPass (iree-convert-mesh-to-flow) //----- //
module {
  util.func public @foo(%arg0: tensor<?x?xi8>, %arg1: tensor<?x?xi8>, %arg2: tensor<?x?xi32>) -> tensor<?x?xi32> {
    %0 = linalg.matmul ins(%arg0, %arg1 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%arg2 : tensor<?x?xi32>) -> tensor<?x?xi32>
    util.return %0 : tensor<?x?xi32>
  }
}
// -----// IR Dump After DemoteF64ToF32Pass (iree-input-conversion-demote-f64-to-f32) //----- //
module {
  util.func public @foo(%arg0: tensor<?x?xi8>, %arg1: tensor<?x?xi8>, %arg2: tensor<?x?xi32>) -> tensor<?x?xi32> {
    %0 = linalg.matmul ins(%arg0, %arg1 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%arg2 : tensor<?x?xi32>) -> tensor<?x?xi32>
    util.return %0 : tensor<?x?xi32>
  }
}
// -----// IR Dump After mlir::iree_compiler::IREE::ABI::ConvertStreamableOpsPass (iree-abi-convert-streamable-ops) //----- //
module {
  util.func public @foo(%arg0: tensor<?x?xi8>, %arg1: tensor<?x?xi8>, %arg2: tensor<?x?xi32>) -> tensor<?x?xi32> {
    %0 = linalg.matmul ins(%arg0, %arg1 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%arg2 : tensor<?x?xi32>) -> tensor<?x?xi32>
    util.return %0 : tensor<?x?xi32>
  }
}
// -----// IR Dump After mlir::iree_compiler::IREE::ABI::WrapEntryPointsPass (iree-abi-wrap-entry-points) //----- //
module {
  util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} {
    %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
    %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
    %2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1}
    %3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
    %4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
    %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4}
    %6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
    %7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
    %8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7}
    %9 = util.call @_foo(%2, %5, %8) : (tensor<?x?xi8>, tensor<?x?xi8>, tensor<?x?xi32>) -> tensor<?x?xi32>
    %c0 = arith.constant 0 : index
    %dim = tensor.dim %9, %c0 : tensor<?x?xi32>
    %c1 = arith.constant 1 : index
    %dim_0 = tensor.dim %9, %c1 : tensor<?x?xi32>
    %10 = hal.tensor.export %9 "output0" : tensor<?x?xi32>{%dim, %dim_0} -> !hal.buffer_view
    util.return %10 : !hal.buffer_view
  }
  util.func private @_foo(%arg0: tensor<?x?xi8>, %arg1: tensor<?x?xi8>, %arg2: tensor<?x?xi32>) -> tensor<?x?xi32> {
    %0 = linalg.matmul ins(%arg0, %arg1 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%arg2 : tensor<?x?xi32>) -> tensor<?x?xi32>
    util.return %0 : tensor<?x?xi32>
  }
}
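At this point iree-abi-wrap-entry-points has split the entry point in two: a private @_foo holding the original tensor math, and a public @foo that unwraps the !hal.buffer_view arguments, forwards them, and re-exports the result. That public signature is what a runtime caller sees. A hypothetical invocation through the IREE Python runtime might look like the sketch below; the driver name "hip" and the `load_vm_flatbuffer_file` helper are assumptions to check against your iree-runtime version.

# Hypothetical caller of the wrapped entry point, assuming the module was
# compiled to foo.vmfb and an AMD GPU is visible to the "hip" driver.
import numpy as np
import iree.runtime as ireert

module = ireert.load_vm_flatbuffer_file("foo.vmfb", driver="hip")
a = np.zeros((4, 8), dtype=np.int8)     # %input0: tensor<?x?xi8>
b = np.zeros((8, 5), dtype=np.int8)     # %input1: tensor<?x?xi8>
acc = np.zeros((4, 5), dtype=np.int32)  # %input2: tensor<?x?xi32>
result = module.foo(a, b, acc)          # -> %output0: tensor<?x?xi32>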
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func private @_foo(%arg0: tensor<?x?xi8>, %arg1: tensor<?x?xi8>, %arg2: tensor<?x?xi32>) -> tensor<?x?xi32> {
  %0 = linalg.matmul ins(%arg0, %arg1 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%arg2 : tensor<?x?xi32>) -> tensor<?x?xi32>
  util.return %0 : tensor<?x?xi32>
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} {
  %c1 = arith.constant 1 : index
  %c0 = arith.constant 0 : index
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1}
  %3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
  %4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
  %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4}
  %6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
  %7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
  %8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7}
  %9 = util.call @_foo(%2, %5, %8) : (tensor<?x?xi8>, tensor<?x?xi8>, tensor<?x?xi32>) -> tensor<?x?xi32>
  %dim = tensor.dim %9, %c0 : tensor<?x?xi32>
  %dim_0 = tensor.dim %9, %c1 : tensor<?x?xi32>
  %10 = hal.tensor.export %9 "output0" : tensor<?x?xi32>{%dim, %dim_0} -> !hal.buffer_view
  util.return %10 : !hal.buffer_view
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} {
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1}
  %3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
  %4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
  %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4}
  %6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
  %7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
  %8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7}
  %9 = linalg.matmul ins(%2, %5 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%8 : tensor<?x?xi32>) -> tensor<?x?xi32>
  %10 = hal.tensor.export %9 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view
  util.return %10 : !hal.buffer_view
}
// -----// IR Dump After Inliner (inline) //----- //
module {
  util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} {
    %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
    %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
    %2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1}
    %3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
    %4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
    %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4}
    %6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
    %7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
    %8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7}
    %9 = linalg.matmul ins(%2, %5 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%8 : tensor<?x?xi32>) -> tensor<?x?xi32>
    %10 = hal.tensor.export %9 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view
    util.return %10 : !hal.buffer_view
  }
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} {
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1}
  %3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
  %4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
  %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4}
  %6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
  %7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
  %8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7}
  %9 = linalg.matmul ins(%2, %5 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%8 : tensor<?x?xi32>) -> tensor<?x?xi32>
  %10 = hal.tensor.export %9 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view
  util.return %10 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} {
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1}
  %3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
  %4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
  %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4}
  %6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
  %7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
  %8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7}
  %9 = linalg.matmul ins(%2, %5 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%8 : tensor<?x?xi32>) -> tensor<?x?xi32>
  %10 = hal.tensor.export %9 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view
  util.return %10 : !hal.buffer_view
}
// -----// IR Dump After SymbolDCE (symbol-dce) //----- //
module {
  util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} {
    %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
    %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
    %2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1}
    %3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
    %4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
    %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4}
    %6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
    %7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
    %8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7}
    %9 = linalg.matmul ins(%2, %5 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%8 : tensor<?x?xi32>) -> tensor<?x?xi32>
    %10 = hal.tensor.export %9 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view
    util.return %10 : !hal.buffer_view
  }
}
// -----// IR Dump After AssignLegacyTargetDevicesPass (iree-hal-assign-legacy-target-devices) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>, ukernels = "none"}>
#device_target_hip = #hal.device.target<"hip", {legacy_sync}, [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {hal.device.targets = [#device_target_hip]} {
  util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} {
    %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
    %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
    %2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1}
    %3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
    %4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
    %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4}
    %6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
    %7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
    %8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7}
    %9 = linalg.matmul ins(%2, %5 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%8 : tensor<?x?xi32>) -> tensor<?x?xi32>
    %10 = hal.tensor.export %9 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view
    util.return %10 : !hal.buffer_view
  }
}
// -----// IR Dump After MaterializeTargetDevicesPass (iree-hal-materialize-target-devices) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>, ukernels = "none"}>
#device_target_hip = #hal.device.target<"hip", {legacy_sync}, [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_hip
  util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} {
    %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
    %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
    %2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1}
    %3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
    %4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
    %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4}
    %6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
    %7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
    %8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7}
    %9 = linalg.matmul ins(%2, %5 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%8 : tensor<?x?xi32>) -> tensor<?x?xi32>
    %10 = hal.tensor.export %9 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view
    util.return %10 : !hal.buffer_view
  }
}
// -----// IR Dump After ResolveDevicePromisesPass (iree-hal-resolve-device-promises) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>, ukernels = "none"}>
#device_target_hip = #hal.device.target<"hip", {legacy_sync}, [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_hip
  util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} {
    %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
    %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
    %2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1}
    %3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
    %4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
    %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4}
    %6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
    %7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
    %8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7}
    %9 = linalg.matmul ins(%2, %5 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%8 : tensor<?x?xi32>) -> tensor<?x?xi32>
    %10 = hal.tensor.export %9 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view
    util.return %10 : !hal.buffer_view
  }
}
// -----// IR Dump After ResolveDeviceAliasesPass (iree-hal-resolve-device-aliases) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>, ukernels = "none"}>
#device_target_hip = #hal.device.target<"hip", {legacy_sync}, [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_hip
  util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} {
    %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
    %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
    %2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1}
    %3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
    %4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
    %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4}
    %6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
    %7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
    %8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7}
    %9 = linalg.matmul ins(%2, %5 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%8 : tensor<?x?xi32>) -> tensor<?x?xi32>
    %10 = hal.tensor.export %9 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view
    util.return %10 : !hal.buffer_view
  }
}
// -----// IR Dump After VerifyDevicesPass (iree-hal-verify-devices) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>, ukernels = "none"}>
#device_target_hip = #hal.device.target<"hip", {legacy_sync}, [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_hip
  util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} {
    %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
    %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
    %2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1}
    %3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
    %4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
    %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4}
    %6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
    %7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
    %8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7}
    %9 = linalg.matmul ins(%2, %5 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%8 : tensor<?x?xi32>) -> tensor<?x?xi32>
    %10 = hal.tensor.export %9 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view
    util.return %10 : !hal.buffer_view
  }
}
// -----// IR Dump After LinalgQuantizedConvToConvPass (iree-global-opt-quantized-conv-to-conv) //----- //
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} {
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1}
  %3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
  %4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
  %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4}
  %6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
  %7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
  %8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7}
  %9 = linalg.matmul ins(%2, %5 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%8 : tensor<?x?xi32>) -> tensor<?x?xi32>
  %10 = hal.tensor.export %9 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view
  util.return %10 : !hal.buffer_view
}
// -----// IR Dump After LinalgQuantizedMatmulToMatmulPass (iree-global-opt-quantized-matmul-to-matmul) //----- //
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} {
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1}
  %3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
  %4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
  %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4}
  %6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
  %7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
  %8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7}
  %9 = linalg.matmul ins(%2, %5 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%8 : tensor<?x?xi32>) -> tensor<?x?xi32>
  %10 = hal.tensor.export %9 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view
  util.return %10 : !hal.buffer_view
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} {
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1}
  %3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
  %4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
  %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4}
  %6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
  %7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
  %8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7}
  %9 = linalg.matmul ins(%2, %5 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%8 : tensor<?x?xi32>) -> tensor<?x?xi32>
  %10 = hal.tensor.export %9 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view
  util.return %10 : !hal.buffer_view
}
// -----// IR Dump After RemoveZeroExtentTensorsPass (iree-global-opt-remove-zero-extent-tensors) //----- //
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} {
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1}
  %3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
  %4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
  %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4}
  %6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
  %7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
  %8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7}
  %9 = linalg.matmul ins(%2, %5 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%8 : tensor<?x?xi32>) -> tensor<?x?xi32>
  %10 = hal.tensor.export %9 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view
  util.return %10 : !hal.buffer_view
}
// -----// IR Dump After DetachElementwiseFromNamedOpsPass (iree-global-opt-detach-elementwise-from-named-ops) //----- //
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} {
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1}
  %3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
  %4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
  %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4}
  %6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
  %7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
  %8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7}
  %9 = linalg.matmul ins(%2, %5 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%8 : tensor<?x?xi32>) -> tensor<?x?xi32>
  %10 = hal.tensor.export %9 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view
  util.return %10 : !hal.buffer_view
}
// -----// IR Dump After LinalgNamedOpConversionPass (linalg-named-op-conversion) //----- //
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} {
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1}
  %3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
  %4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
  %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4}
  %6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
  %7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
  %8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7}
  %9 = linalg.matmul ins(%2, %5 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%8 : tensor<?x?xi32>) -> tensor<?x?xi32>
  %10 = hal.tensor.export %9 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view
  util.return %10 : !hal.buffer_view
}
// -----// IR Dump After Convert1X1FilterConv2DToMatmulPass (iree-global-opt-convert-1x1-filter-conv2d-to-matmul) //----- //
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} {
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1}
  %3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
  %4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
  %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4}
  %6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
  %7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
  %8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7}
  %9 = linalg.matmul ins(%2, %5 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%8 : tensor<?x?xi32>) -> tensor<?x?xi32>
  %10 = hal.tensor.export %9 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view
  util.return %10 : !hal.buffer_view
}
// -----// IR Dump After EraseUnusedLinalgOperandsPass (iree-global-opt-erase-unused-linalg-operands) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>, ukernels = "none"}>
#device_target_hip = #hal.device.target<"hip", {legacy_sync}, [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_hip
  util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} {
    %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
    %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
    %2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1}
    %3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
    %4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
    %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4}
    %6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
    %7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
    %8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7}
    %9 = linalg.matmul ins(%2, %5 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%8 : tensor<?x?xi32>) -> tensor<?x?xi32>
    %10 = hal.tensor.export %9 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view
    util.return %10 : !hal.buffer_view
  }
}
// -----// IR Dump After ExpandTensorShapesPass (iree-global-opt-expand-tensor-shapes) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>, ukernels = "none"}>
#device_target_hip = #hal.device.target<"hip", {legacy_sync}, [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_hip
  util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} {
    %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
    %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
    %2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1}
    %3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
    %4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
    %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4}
    %6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
    %7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
    %8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7}
    %9 = linalg.matmul ins(%2, %5 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%8 : tensor<?x?xi32>) -> tensor<?x?xi32>
    %10 = hal.tensor.export %9 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view
    util.return %10 : !hal.buffer_view
  }
}
// -----// IR Dump After ConvertElementwiseToLinalgPass (convert-elementwise-to-linalg) //----- //
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} {
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1}
  %3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
  %4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
  %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4}
  %6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
  %7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
  %8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7}
  %9 = linalg.matmul ins(%2, %5 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%8 : tensor<?x?xi32>) -> tensor<?x?xi32>
  %10 = hal.tensor.export %9 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view
  util.return %10 : !hal.buffer_view
}
// -----// IR Dump After RaiseSpecialOpsPass (iree-global-opt-raise-special-ops) //----- //
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} {
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1}
  %3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
  %4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
  %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4}
  %6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
  %7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
  %8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7}
  %9 = linalg.matmul ins(%2, %5 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%8 : tensor<?x?xi32>) -> tensor<?x?xi32>
  %10 = hal.tensor.export %9 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view
  util.return %10 : !hal.buffer_view
}
// -----// IR Dump After DecomposeConcatPass (iree-global-opt-decompose-concat) //----- //
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} {
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1}
  %3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
  %4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
  %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4}
  %6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
  %7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
  %8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7}
  %9 = linalg.matmul ins(%2, %5 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%8 : tensor<?x?xi32>) -> tensor<?x?xi32>
  %10 = hal.tensor.export %9 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view
  util.return %10 : !hal.buffer_view
}
// -----// IR Dump After GeneralizeLinalgNamedOpsPass (iree-global-opt-generalize-linalg-named-ops) //----- //
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} {
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1}
  %3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
  %4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
  %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4}
  %6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
  %7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
  %8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7}
  %9 = linalg.matmul ins(%2, %5 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%8 : tensor<?x?xi32>) -> tensor<?x?xi32>
  %10 = hal.tensor.export %9 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view
  util.return %10 : !hal.buffer_view
}
// -----// IR Dump After FoldUnitExtentDimsPass (iree-dispatch-creation-fold-unit-extent-dims) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>, ukernels = "none"}>
#device_target_hip = #hal.device.target<"hip", {legacy_sync}, [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_hip
  util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} {
    %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
    %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
    %2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1}
    %3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
    %4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
    %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4}
    %6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
    %7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
    %8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7}
    %9 = linalg.matmul ins(%2, %5 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%8 : tensor<?x?xi32>) -> tensor<?x?xi32>
    %10 = hal.tensor.export %9 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view
    util.return %10 : !hal.buffer_view
  }
}
// -----// IR Dump After DemoteContractionInputsToBF16Pass (iree-global-opt-demote-contraction-inputs-to-bf16) //----- //
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} {
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1}
  %3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
  %4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
  %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4}
  %6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
  %7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
  %8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7}
  %9 = linalg.matmul ins(%2, %5 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%8 : tensor<?x?xi32>) -> tensor<?x?xi32>
  %10 = hal.tensor.export %9 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view
  util.return %10 : !hal.buffer_view
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} {
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1}
  %3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
  %4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
  %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4}
  %6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
  %7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
  %8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7}
  %9 = linalg.matmul ins(%2, %5 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%8 : tensor<?x?xi32>) -> tensor<?x?xi32>
  %10 = hal.tensor.export %9 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view
  util.return %10 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} {
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1}
  %3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
  %4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
  %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4}
  %6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
  %7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
  %8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7}
  %9 = linalg.matmul ins(%2, %5 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%8 : tensor<?x?xi32>) -> tensor<?x?xi32>
  %10 = hal.tensor.export %9 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view
  util.return %10 : !hal.buffer_view
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} {
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1}
  %3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
  %4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
  %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4}
  %6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
  %7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
  %8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7}
  %9 = linalg.matmul ins(%2, %5 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%8 : tensor<?x?xi32>) -> tensor<?x?xi32>
  %10 = hal.tensor.export %9 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view
  util.return %10 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} {
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1}
  %3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
  %4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
  %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4}
  %6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
  %7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
  %8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7}
  %9 = linalg.matmul ins(%2, %5 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%8 : tensor<?x?xi32>) -> tensor<?x?xi32>
  %10 = hal.tensor.export %9 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view
  util.return %10 : !hal.buffer_view
}
// -----// IR Dump After SetEncodingPass (iree-dispatch-creation-set-encoding) //----- //
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} {
  %c1 = arith.constant 1 : index
  %c0 = arith.constant 0 : index
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1}
  %3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
  %4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
  %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4}
  %6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
  %7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
  %8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7}
  %9 = iree_encoding.set_encoding %2 : tensor<?x?xi8> -> tensor<?x?xi8, #iree_encoding.encoding<operand_index = 0 : index, op_type = matmul, element_types = [i8, i8, i32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array<i64: 32, 32, 32>>>
  %10 = iree_encoding.set_encoding %5 : tensor<?x?xi8> -> tensor<?x?xi8, #iree_encoding.encoding<operand_index = 1 : index, op_type = matmul, element_types = [i8, i8, i32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array<i64: 32, 32, 32>>>
  %11 = iree_encoding.set_encoding %8 : tensor<?x?xi32> -> tensor<?x?xi32, #iree_encoding.encoding<operand_index = 2 : index, op_type = matmul, element_types = [i8, i8, i32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array<i64: 32, 32, 32>>>
  %12 = linalg.matmul ins(%9, %10 : tensor<?x?xi8, #iree_encoding.encoding<operand_index = 0 : index, op_type = matmul, element_types = [i8, i8, i32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array<i64: 32, 32, 32>>>, tensor<?x?xi8, #iree_encoding.encoding<operand_index = 1 : index, op_type = matmul, element_types = [i8, i8, i32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array<i64: 32, 32, 32>>>) outs(%11 : tensor<?x?xi32, #iree_encoding.encoding<operand_index = 2 : index, op_type = matmul, element_types = [i8, i8, i32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array<i64: 32, 32, 32>>>) -> tensor<?x?xi32, #iree_encoding.encoding<operand_index = 2 : index, op_type = matmul, element_types = [i8, i8, i32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array<i64: 32, 32, 32>>>
  %dim = tensor.dim %8, %c0 : tensor<?x?xi32>
  %dim_0 = tensor.dim %8, %c1 : tensor<?x?xi32>
  %13 = iree_encoding.unset_encoding %12 : tensor<?x?xi32, #iree_encoding.encoding<operand_index = 2 : index, op_type = matmul, element_types = [i8, i8, i32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array<i64: 32, 32, 32>>> -> tensor<?x?xi32>
  %extracted_slice = tensor.extract_slice %13[0, 0] [%dim, %dim_0] [1, 1] : tensor<?x?xi32> to tensor<?x?xi32>
  %14 = hal.tensor.export %extracted_slice "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view
  util.return %14 : !hal.buffer_view
}
// -----// IR Dump After GPUMaterializeHostEncodingPass (iree-codegen-gpu-materialize-host-encoding) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>, ukernels = "none"}> | |
#map = affine_map<()[s0] -> (s0 ceildiv 128)> | |
#map1 = affine_map<()[s0] -> (s0 ceildiv 64)> | |
#map2 = affine_map<(d0, d1, d2) -> (d0, d2)> | |
#map3 = affine_map<(d0, d1, d2) -> (d1, d2)> | |
#map4 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
#device_target_hip = #hal.device.target<"hip", {legacy_sync}, [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%dim = tensor.dim %2, %c0 : tensor<?x?xi8> | |
%dim_0 = tensor.dim %2, %c1 : tensor<?x?xi8> | |
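    // LHS (M x K): the M tile of 128 is unroll_m (8) times the intrinsic's 16 rows, and | |
    // the K tile of 64 is unroll_k (2) times the intrinsic's 32 i8 elements. | |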
%9 = affine.apply #map()[%dim] | |
%10 = affine.apply #map1()[%dim_0] | |
%11 = tensor.empty(%9, %10) : tensor<?x?x128x64xi8> | |
%pack = tensor.pack %2 padding_value(%c0_i8 : i8) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [128, 64] into %11 : tensor<?x?xi8> -> tensor<?x?x128x64xi8> | |
%expanded = tensor.expand_shape %pack [[0], [1], [2, 3], [4, 5, 6]] output_shape [%9, %10, 8, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x8x16x2x4x8xi8> | |
%12 = tensor.empty(%9, %10) : tensor<?x?x8x4x16x2x8xi8> | |
%transposed = linalg.transpose ins(%expanded : tensor<?x?x8x16x2x4x8xi8>) outs(%12 : tensor<?x?x8x4x16x2x8xi8>) permutation = [0, 1, 2, 5, 3, 4, 6] | |
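    // RHS (K x N): packed N-major (outer_dims_perm = [1, 0]) into 128x64 tiles; the N | |
    // tile of 128 is unroll_n_to_subgroups (4) x unroll_n (2) x the intrinsic's 16 | |
    // columns, and the K tile again matches unroll_k (2) x 32. | |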
%dim_1 = tensor.dim %5, %c0 : tensor<?x?xi8> | |
%dim_2 = tensor.dim %5, %c1 : tensor<?x?xi8> | |
%13 = affine.apply #map()[%dim_2] | |
%14 = affine.apply #map1()[%dim_1] | |
%15 = tensor.empty(%13, %14) : tensor<?x?x128x64xi8> | |
%pack_3 = tensor.pack %5 padding_value(%c0_i8 : i8) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [128, 64] into %15 : tensor<?x?xi8> -> tensor<?x?x128x64xi8> | |
%expanded_4 = tensor.expand_shape %pack_3 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%13, %14, 4, 2, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x4x2x16x2x4x8xi8> | |
%16 = tensor.empty(%13, %14) : tensor<?x?x4x2x4x16x2x8xi8> | |
%transposed_5 = linalg.transpose ins(%expanded_4 : tensor<?x?x4x2x16x2x4x8xi8>) outs(%16 : tensor<?x?x4x2x4x16x2x8xi8>) permutation = [0, 1, 2, 3, 6, 4, 5, 7] | |
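    // Accumulator (M x N): packed into 128x128 tiles and transposed into the i32 | |
    // accumulator register layout (presumably 4 values per lane of the 64-wide | |
    // subgroup for each 16x16 intrinsic tile). | |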
%dim_6 = tensor.dim %8, %c0 : tensor<?x?xi32> | |
%dim_7 = tensor.dim %8, %c1 : tensor<?x?xi32> | |
%17 = affine.apply #map()[%dim_6] | |
%18 = affine.apply #map()[%dim_7] | |
%19 = tensor.empty(%17, %18) : tensor<?x?x128x128xi32> | |
%pack_8 = tensor.pack %8 padding_value(%c0_i32 : i32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [128, 128] into %19 : tensor<?x?xi32> -> tensor<?x?x128x128xi32> | |
%expanded_9 = tensor.expand_shape %pack_8 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%17, %18, 8, 4, 4, 4, 2, 16] : tensor<?x?x128x128xi32> into tensor<?x?x8x4x4x4x2x16xi32> | |
%20 = tensor.empty(%17, %18) : tensor<?x?x8x4x2x4x16x4xi32> | |
%transposed_10 = linalg.transpose ins(%expanded_9 : tensor<?x?x8x4x4x4x2x16xi32>) outs(%20 : tensor<?x?x8x4x2x4x16x4xi32>) permutation = [0, 1, 2, 5, 6, 3, 7, 4] | |
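    // With all three operands in the data-tiled layout, the matmul reduces to a | |
    // multi_mma over the two outer tile dimensions; the swizzled inner dimensions are | |
    // carried opaquely by the #iree_gpu.data_tiled_mma_layout kind. | |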
%21 = iree_gpu.multi_mma %transposed, %transposed_5, %transposed_10 {indexing_maps = [#map2, #map3, #map4], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
%dim_11 = tensor.dim %8, %c0 : tensor<?x?xi32> | |
%dim_12 = tensor.dim %8, %c1 : tensor<?x?xi32> | |
%dim_13 = tensor.dim %21, %c0 : tensor<?x?x8x4x2x4x16x4xi32> | |
%dim_14 = tensor.dim %21, %c1 : tensor<?x?x8x4x2x4x16x4xi32> | |
%22 = tensor.empty(%dim_13, %dim_14) : tensor<?x?x8x4x4x4x2x16xi32> | |
%transposed_15 = linalg.transpose ins(%21 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%22 : tensor<?x?x8x4x4x4x2x16xi32>) permutation = [0, 1, 2, 5, 7, 3, 4, 6] | |
%collapsed = tensor.collapse_shape %transposed_15 [[0], [1], [2, 3, 4], [5, 6, 7]] : tensor<?x?x8x4x4x4x2x16xi32> into tensor<?x?x128x128xi32> | |
%23 = tensor.empty(%dim_11, %dim_12) : tensor<?x?xi32> | |
%unpack = tensor.unpack %collapsed outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [128, 128] into %23 : tensor<?x?x128x128xi32> -> tensor<?x?xi32> | |
%24 = hal.tensor.export %unpack "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %24 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After DecomposePackUnPackOpsPass (iree-codegen-decompose-pack-unpack-ops) //----- // | |
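// tensor.pack decomposes into tensor.pad + tensor.expand_shape + linalg.transpose, and | |
// tensor.unpack into linalg.transpose + tensor.collapse_shape + tensor.extract_slice | |
// plus a final linalg.copy into the exact-size destination. | |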
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c64 = arith.constant 64 : index | |
%c128 = arith.constant 128 : index | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%dim = tensor.dim %2, %c0 : tensor<?x?xi8> | |
%dim_0 = tensor.dim %2, %c1 : tensor<?x?xi8> | |
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%dim] | |
%10 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%dim_0] | |
%11 = tensor.empty(%9, %10) : tensor<?x?x128x64xi8> | |
%dim_1 = tensor.dim %2, %c0 : tensor<?x?xi8> | |
%12 = affine.apply affine_map<()[s0, s1] -> (-s0 + (s1 ceildiv 128) * 128)>()[%dim_1, %dim] | |
%dim_2 = tensor.dim %2, %c1 : tensor<?x?xi8> | |
%13 = affine.apply affine_map<()[s0, s1] -> (-s0 + (s1 ceildiv 64) * 64)>()[%dim_2, %dim_0] | |
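  // Pad M up to the next multiple of 128 and K up to the next multiple of 64, filling | |
  // with the zero i8 value, which is neutral for the integer matmul. | |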
%padded = tensor.pad %2 low[0, 0] high[%12, %13] { | |
^bb0(%arg3: index, %arg4: index): | |
tensor.yield %c0_i8 : i8 | |
} : tensor<?x?xi8> to tensor<?x?xi8> | |
%dim_3 = tensor.dim %2, %c0 : tensor<?x?xi8> | |
%14 = affine.apply affine_map<()[s0, s1, s2] -> (-s0 + s1 + (s2 ceildiv 128) * 128)>()[%dim_1, %dim_3, %dim] | |
%dim_4 = tensor.dim %2, %c1 : tensor<?x?xi8> | |
%15 = affine.apply affine_map<()[s0, s1, s2] -> (-s0 + s1 + (s2 ceildiv 64) * 64)>()[%dim_2, %dim_4, %dim_0] | |
%16 = arith.divui %14, %c128 : index | |
%17 = arith.divui %15, %c64 : index | |
%expanded = tensor.expand_shape %padded [[0, 1], [2, 3]] output_shape [%16, 128, %17, 64] : tensor<?x?xi8> into tensor<?x128x?x64xi8> | |
%transposed = linalg.transpose ins(%expanded : tensor<?x128x?x64xi8>) outs(%11 : tensor<?x?x128x64xi8>) permutation = [0, 2, 1, 3] | |
%expanded_5 = tensor.expand_shape %transposed [[0], [1], [2, 3], [4, 5, 6]] output_shape [%9, %10, 8, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x8x16x2x4x8xi8> | |
%18 = tensor.empty(%9, %10) : tensor<?x?x8x4x16x2x8xi8> | |
%transposed_6 = linalg.transpose ins(%expanded_5 : tensor<?x?x8x16x2x4x8xi8>) outs(%18 : tensor<?x?x8x4x16x2x8xi8>) permutation = [0, 1, 2, 5, 3, 4, 6] | |
%dim_7 = tensor.dim %5, %c0 : tensor<?x?xi8> | |
%dim_8 = tensor.dim %5, %c1 : tensor<?x?xi8> | |
%19 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%dim_8] | |
%20 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%dim_7] | |
%21 = tensor.empty(%19, %20) : tensor<?x?x128x64xi8> | |
%dim_9 = tensor.dim %5, %c1 : tensor<?x?xi8> | |
%22 = affine.apply affine_map<()[s0, s1] -> (-s0 + (s1 ceildiv 128) * 128)>()[%dim_9, %dim_8] | |
%dim_10 = tensor.dim %5, %c0 : tensor<?x?xi8> | |
%23 = affine.apply affine_map<()[s0, s1] -> (-s0 + (s1 ceildiv 64) * 64)>()[%dim_10, %dim_7] | |
%padded_11 = tensor.pad %5 low[0, 0] high[%23, %22] { | |
^bb0(%arg3: index, %arg4: index): | |
tensor.yield %c0_i8 : i8 | |
} : tensor<?x?xi8> to tensor<?x?xi8> | |
%dim_12 = tensor.dim %5, %c0 : tensor<?x?xi8> | |
%24 = affine.apply affine_map<()[s0, s1, s2] -> (-s0 + s1 + (s2 ceildiv 64) * 64)>()[%dim_10, %dim_12, %dim_7] | |
%dim_13 = tensor.dim %5, %c1 : tensor<?x?xi8> | |
%25 = affine.apply affine_map<()[s0, s1, s2] -> (-s0 + s1 + (s2 ceildiv 128) * 128)>()[%dim_9, %dim_13, %dim_8] | |
%26 = arith.divui %24, %c64 : index | |
%27 = arith.divui %25, %c128 : index | |
%expanded_14 = tensor.expand_shape %padded_11 [[0, 1], [2, 3]] output_shape [%26, 64, %27, 128] : tensor<?x?xi8> into tensor<?x64x?x128xi8> | |
%transposed_15 = linalg.transpose ins(%expanded_14 : tensor<?x64x?x128xi8>) outs(%21 : tensor<?x?x128x64xi8>) permutation = [2, 0, 3, 1] | |
%expanded_16 = tensor.expand_shape %transposed_15 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%19, %20, 4, 2, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x4x2x16x2x4x8xi8> | |
%28 = tensor.empty(%19, %20) : tensor<?x?x4x2x4x16x2x8xi8> | |
%transposed_17 = linalg.transpose ins(%expanded_16 : tensor<?x?x4x2x16x2x4x8xi8>) outs(%28 : tensor<?x?x4x2x4x16x2x8xi8>) permutation = [0, 1, 2, 3, 6, 4, 5, 7] | |
%dim_18 = tensor.dim %8, %c0 : tensor<?x?xi32> | |
%dim_19 = tensor.dim %8, %c1 : tensor<?x?xi32> | |
%29 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%dim_18] | |
%30 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%dim_19] | |
%31 = tensor.empty(%29, %30) : tensor<?x?x128x128xi32> | |
%dim_20 = tensor.dim %8, %c0 : tensor<?x?xi32> | |
%32 = affine.apply affine_map<()[s0, s1] -> (-s0 + (s1 ceildiv 128) * 128)>()[%dim_20, %dim_18] | |
%dim_21 = tensor.dim %8, %c1 : tensor<?x?xi32> | |
%33 = affine.apply affine_map<()[s0, s1] -> (-s0 + (s1 ceildiv 128) * 128)>()[%dim_21, %dim_19] | |
%padded_22 = tensor.pad %8 low[0, 0] high[%32, %33] { | |
^bb0(%arg3: index, %arg4: index): | |
tensor.yield %c0_i32 : i32 | |
} : tensor<?x?xi32> to tensor<?x?xi32> | |
%dim_23 = tensor.dim %8, %c0 : tensor<?x?xi32> | |
%34 = affine.apply affine_map<()[s0, s1, s2] -> (-s0 + s1 + (s2 ceildiv 128) * 128)>()[%dim_20, %dim_23, %dim_18] | |
%dim_24 = tensor.dim %8, %c1 : tensor<?x?xi32> | |
%35 = affine.apply affine_map<()[s0, s1, s2] -> (-s0 + s1 + (s2 ceildiv 128) * 128)>()[%dim_21, %dim_24, %dim_19] | |
%36 = arith.divui %34, %c128 : index | |
%37 = arith.divui %35, %c128 : index | |
%expanded_25 = tensor.expand_shape %padded_22 [[0, 1], [2, 3]] output_shape [%36, 128, %37, 128] : tensor<?x?xi32> into tensor<?x128x?x128xi32> | |
%transposed_26 = linalg.transpose ins(%expanded_25 : tensor<?x128x?x128xi32>) outs(%31 : tensor<?x?x128x128xi32>) permutation = [0, 2, 1, 3] | |
%expanded_27 = tensor.expand_shape %transposed_26 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%29, %30, 8, 4, 4, 4, 2, 16] : tensor<?x?x128x128xi32> into tensor<?x?x8x4x4x4x2x16xi32> | |
%38 = tensor.empty(%29, %30) : tensor<?x?x8x4x2x4x16x4xi32> | |
%transposed_28 = linalg.transpose ins(%expanded_27 : tensor<?x?x8x4x4x4x2x16xi32>) outs(%38 : tensor<?x?x8x4x2x4x16x4xi32>) permutation = [0, 1, 2, 5, 6, 3, 7, 4] | |
%39 = iree_gpu.multi_mma %transposed_6, %transposed_17, %transposed_28 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
%dim_29 = tensor.dim %8, %c0 : tensor<?x?xi32> | |
%dim_30 = tensor.dim %8, %c1 : tensor<?x?xi32> | |
%dim_31 = tensor.dim %39, %c0 : tensor<?x?x8x4x2x4x16x4xi32> | |
%dim_32 = tensor.dim %39, %c1 : tensor<?x?x8x4x2x4x16x4xi32> | |
%40 = tensor.empty(%dim_31, %dim_32) : tensor<?x?x8x4x4x4x2x16xi32> | |
%transposed_33 = linalg.transpose ins(%39 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%40 : tensor<?x?x8x4x4x4x2x16xi32>) permutation = [0, 1, 2, 5, 7, 3, 4, 6] | |
%collapsed = tensor.collapse_shape %transposed_33 [[0], [1], [2, 3, 4], [5, 6, 7]] : tensor<?x?x8x4x4x4x2x16xi32> into tensor<?x?x128x128xi32> | |
%41 = tensor.empty(%dim_29, %dim_30) : tensor<?x?xi32> | |
%dim_34 = tensor.dim %39, %c0 : tensor<?x?x8x4x2x4x16x4xi32> | |
%dim_35 = tensor.dim %39, %c1 : tensor<?x?x8x4x2x4x16x4xi32> | |
%42 = tensor.empty(%dim_34, %dim_35) : tensor<?x128x?x128xi32> | |
%transposed_36 = linalg.transpose ins(%collapsed : tensor<?x?x128x128xi32>) outs(%42 : tensor<?x128x?x128xi32>) permutation = [0, 2, 1, 3] | |
%collapsed_37 = tensor.collapse_shape %transposed_36 [[0, 1], [2, 3]] : tensor<?x128x?x128xi32> into tensor<?x?xi32> | |
%extracted_slice = tensor.extract_slice %collapsed_37[0, 0] [%dim_29, %dim_30] [1, 1] : tensor<?x?xi32> to tensor<?x?xi32> | |
%43 = linalg.copy ins(%extracted_slice : tensor<?x?xi32>) outs(%41 : tensor<?x?xi32>) -> tensor<?x?xi32> | |
%44 = hal.tensor.export %43 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %44 : !hal.buffer_view | |
} | |
// -----// IR Dump After MaterializeHomogeneousEncodingsPass (iree-global-opt-materialize-homogeneous-encodings) //----- // | |
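// The encodings were already materialized by the GPU host-encoding pass above, so this | |
// pass leaves the IR unchanged; the dump is simply printed at module scope with named | |
// #map aliases instead of inline affine maps. | |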
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>, ukernels = "none"}> | |
#map = affine_map<()[s0] -> (s0 ceildiv 128)> | |
#map1 = affine_map<()[s0] -> (s0 ceildiv 64)> | |
#map2 = affine_map<()[s0, s1] -> (-s0 + (s1 ceildiv 128) * 128)> | |
#map3 = affine_map<()[s0, s1] -> (-s0 + (s1 ceildiv 64) * 64)> | |
#map4 = affine_map<()[s0, s1, s2] -> (-s0 + s1 + (s2 ceildiv 128) * 128)> | |
#map5 = affine_map<()[s0, s1, s2] -> (-s0 + s1 + (s2 ceildiv 64) * 64)> | |
#map6 = affine_map<(d0, d1, d2) -> (d0, d2)> | |
#map7 = affine_map<(d0, d1, d2) -> (d1, d2)> | |
#map8 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
#device_target_hip = #hal.device.target<"hip", {legacy_sync}, [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c64 = arith.constant 64 : index | |
%c128 = arith.constant 128 : index | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%dim = tensor.dim %2, %c0 : tensor<?x?xi8> | |
%dim_0 = tensor.dim %2, %c1 : tensor<?x?xi8> | |
%9 = affine.apply #map()[%dim] | |
%10 = affine.apply #map1()[%dim_0] | |
%11 = tensor.empty(%9, %10) : tensor<?x?x128x64xi8> | |
%dim_1 = tensor.dim %2, %c0 : tensor<?x?xi8> | |
%12 = affine.apply #map2()[%dim_1, %dim] | |
%dim_2 = tensor.dim %2, %c1 : tensor<?x?xi8> | |
%13 = affine.apply #map3()[%dim_2, %dim_0] | |
%padded = tensor.pad %2 low[0, 0] high[%12, %13] { | |
^bb0(%arg3: index, %arg4: index): | |
tensor.yield %c0_i8 : i8 | |
} : tensor<?x?xi8> to tensor<?x?xi8> | |
%dim_3 = tensor.dim %2, %c0 : tensor<?x?xi8> | |
%14 = affine.apply #map4()[%dim_1, %dim_3, %dim] | |
%dim_4 = tensor.dim %2, %c1 : tensor<?x?xi8> | |
%15 = affine.apply #map5()[%dim_2, %dim_4, %dim_0] | |
%16 = arith.divui %14, %c128 : index | |
%17 = arith.divui %15, %c64 : index | |
%expanded = tensor.expand_shape %padded [[0, 1], [2, 3]] output_shape [%16, 128, %17, 64] : tensor<?x?xi8> into tensor<?x128x?x64xi8> | |
%transposed = linalg.transpose ins(%expanded : tensor<?x128x?x64xi8>) outs(%11 : tensor<?x?x128x64xi8>) permutation = [0, 2, 1, 3] | |
%expanded_5 = tensor.expand_shape %transposed [[0], [1], [2, 3], [4, 5, 6]] output_shape [%9, %10, 8, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x8x16x2x4x8xi8> | |
%18 = tensor.empty(%9, %10) : tensor<?x?x8x4x16x2x8xi8> | |
%transposed_6 = linalg.transpose ins(%expanded_5 : tensor<?x?x8x16x2x4x8xi8>) outs(%18 : tensor<?x?x8x4x16x2x8xi8>) permutation = [0, 1, 2, 5, 3, 4, 6] | |
%dim_7 = tensor.dim %5, %c0 : tensor<?x?xi8> | |
%dim_8 = tensor.dim %5, %c1 : tensor<?x?xi8> | |
%19 = affine.apply #map()[%dim_8] | |
%20 = affine.apply #map1()[%dim_7] | |
%21 = tensor.empty(%19, %20) : tensor<?x?x128x64xi8> | |
%dim_9 = tensor.dim %5, %c1 : tensor<?x?xi8> | |
%22 = affine.apply #map2()[%dim_9, %dim_8] | |
%dim_10 = tensor.dim %5, %c0 : tensor<?x?xi8> | |
%23 = affine.apply #map3()[%dim_10, %dim_7] | |
%padded_11 = tensor.pad %5 low[0, 0] high[%23, %22] { | |
^bb0(%arg3: index, %arg4: index): | |
tensor.yield %c0_i8 : i8 | |
} : tensor<?x?xi8> to tensor<?x?xi8> | |
%dim_12 = tensor.dim %5, %c0 : tensor<?x?xi8> | |
%24 = affine.apply #map5()[%dim_10, %dim_12, %dim_7] | |
%dim_13 = tensor.dim %5, %c1 : tensor<?x?xi8> | |
%25 = affine.apply #map4()[%dim_9, %dim_13, %dim_8] | |
%26 = arith.divui %24, %c64 : index | |
%27 = arith.divui %25, %c128 : index | |
%expanded_14 = tensor.expand_shape %padded_11 [[0, 1], [2, 3]] output_shape [%26, 64, %27, 128] : tensor<?x?xi8> into tensor<?x64x?x128xi8> | |
%transposed_15 = linalg.transpose ins(%expanded_14 : tensor<?x64x?x128xi8>) outs(%21 : tensor<?x?x128x64xi8>) permutation = [2, 0, 3, 1] | |
%expanded_16 = tensor.expand_shape %transposed_15 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%19, %20, 4, 2, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x4x2x16x2x4x8xi8> | |
%28 = tensor.empty(%19, %20) : tensor<?x?x4x2x4x16x2x8xi8> | |
%transposed_17 = linalg.transpose ins(%expanded_16 : tensor<?x?x4x2x16x2x4x8xi8>) outs(%28 : tensor<?x?x4x2x4x16x2x8xi8>) permutation = [0, 1, 2, 3, 6, 4, 5, 7] | |
%dim_18 = tensor.dim %8, %c0 : tensor<?x?xi32> | |
%dim_19 = tensor.dim %8, %c1 : tensor<?x?xi32> | |
%29 = affine.apply #map()[%dim_18] | |
%30 = affine.apply #map()[%dim_19] | |
%31 = tensor.empty(%29, %30) : tensor<?x?x128x128xi32> | |
%dim_20 = tensor.dim %8, %c0 : tensor<?x?xi32> | |
%32 = affine.apply #map2()[%dim_20, %dim_18] | |
%dim_21 = tensor.dim %8, %c1 : tensor<?x?xi32> | |
%33 = affine.apply #map2()[%dim_21, %dim_19] | |
%padded_22 = tensor.pad %8 low[0, 0] high[%32, %33] { | |
^bb0(%arg3: index, %arg4: index): | |
tensor.yield %c0_i32 : i32 | |
} : tensor<?x?xi32> to tensor<?x?xi32> | |
%dim_23 = tensor.dim %8, %c0 : tensor<?x?xi32> | |
%34 = affine.apply #map4()[%dim_20, %dim_23, %dim_18] | |
%dim_24 = tensor.dim %8, %c1 : tensor<?x?xi32> | |
%35 = affine.apply #map4()[%dim_21, %dim_24, %dim_19] | |
%36 = arith.divui %34, %c128 : index | |
%37 = arith.divui %35, %c128 : index | |
%expanded_25 = tensor.expand_shape %padded_22 [[0, 1], [2, 3]] output_shape [%36, 128, %37, 128] : tensor<?x?xi32> into tensor<?x128x?x128xi32> | |
%transposed_26 = linalg.transpose ins(%expanded_25 : tensor<?x128x?x128xi32>) outs(%31 : tensor<?x?x128x128xi32>) permutation = [0, 2, 1, 3] | |
%expanded_27 = tensor.expand_shape %transposed_26 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%29, %30, 8, 4, 4, 4, 2, 16] : tensor<?x?x128x128xi32> into tensor<?x?x8x4x4x4x2x16xi32> | |
%38 = tensor.empty(%29, %30) : tensor<?x?x8x4x2x4x16x4xi32> | |
%transposed_28 = linalg.transpose ins(%expanded_27 : tensor<?x?x8x4x4x4x2x16xi32>) outs(%38 : tensor<?x?x8x4x2x4x16x4xi32>) permutation = [0, 1, 2, 5, 6, 3, 7, 4] | |
%39 = iree_gpu.multi_mma %transposed_6, %transposed_17, %transposed_28 {indexing_maps = [#map6, #map7, #map8], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
%dim_29 = tensor.dim %8, %c0 : tensor<?x?xi32> | |
%dim_30 = tensor.dim %8, %c1 : tensor<?x?xi32> | |
%dim_31 = tensor.dim %39, %c0 : tensor<?x?x8x4x2x4x16x4xi32> | |
%dim_32 = tensor.dim %39, %c1 : tensor<?x?x8x4x2x4x16x4xi32> | |
%40 = tensor.empty(%dim_31, %dim_32) : tensor<?x?x8x4x4x4x2x16xi32> | |
%transposed_33 = linalg.transpose ins(%39 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%40 : tensor<?x?x8x4x4x4x2x16xi32>) permutation = [0, 1, 2, 5, 7, 3, 4, 6] | |
%collapsed = tensor.collapse_shape %transposed_33 [[0], [1], [2, 3, 4], [5, 6, 7]] : tensor<?x?x8x4x4x4x2x16xi32> into tensor<?x?x128x128xi32> | |
%41 = tensor.empty(%dim_29, %dim_30) : tensor<?x?xi32> | |
%dim_34 = tensor.dim %39, %c0 : tensor<?x?x8x4x2x4x16x4xi32> | |
%dim_35 = tensor.dim %39, %c1 : tensor<?x?x8x4x2x4x16x4xi32> | |
%42 = tensor.empty(%dim_34, %dim_35) : tensor<?x128x?x128xi32> | |
%transposed_36 = linalg.transpose ins(%collapsed : tensor<?x?x128x128xi32>) outs(%42 : tensor<?x128x?x128xi32>) permutation = [0, 2, 1, 3] | |
%collapsed_37 = tensor.collapse_shape %transposed_36 [[0, 1], [2, 3]] : tensor<?x128x?x128xi32> into tensor<?x?xi32> | |
%extracted_slice = tensor.extract_slice %collapsed_37[0, 0] [%dim_29, %dim_30] [1, 1] : tensor<?x?xi32> to tensor<?x?xi32> | |
%43 = linalg.copy ins(%extracted_slice : tensor<?x?xi32>) outs(%41 : tensor<?x?xi32>) -> tensor<?x?xi32> | |
%44 = hal.tensor.export %43 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %44 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
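// Canonicalization folds the tensor.dim chains back to the imported dimension values | |
// (%0, %1, %3, %4, %6, %7), drops the now-unused %c0/%c1 index constants, and | |
// simplifies the padding/size maps to single-symbol forms such as | |
// affine_map<()[s0] -> (-s0 + (s0 ceildiv 128) * 128)>. | |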
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>, ukernels = "none"}> | |
#map = affine_map<()[s0] -> (s0 ceildiv 128)> | |
#map1 = affine_map<()[s0] -> (s0 ceildiv 64)> | |
#map2 = affine_map<()[s0] -> (-s0 + (s0 ceildiv 128) * 128)> | |
#map3 = affine_map<()[s0] -> (-s0 + (s0 ceildiv 64) * 64)> | |
#map4 = affine_map<()[s0] -> ((s0 ceildiv 128) * 128)> | |
#map5 = affine_map<()[s0] -> ((s0 ceildiv 64) * 64)> | |
#map6 = affine_map<(d0, d1, d2) -> (d0, d2)> | |
#map7 = affine_map<(d0, d1, d2) -> (d1, d2)> | |
#map8 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
#device_target_hip = #hal.device.target<"hip", {legacy_sync}, [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c64 = arith.constant 64 : index | |
%c128 = arith.constant 128 : index | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%9 = affine.apply #map()[%0] | |
%10 = affine.apply #map1()[%1] | |
%11 = tensor.empty(%9, %10) : tensor<?x?x128x64xi8> | |
%12 = affine.apply #map2()[%0] | |
%13 = affine.apply #map3()[%1] | |
%padded = tensor.pad %2 low[0, 0] high[%12, %13] { | |
^bb0(%arg3: index, %arg4: index): | |
tensor.yield %c0_i8 : i8 | |
} : tensor<?x?xi8> to tensor<?x?xi8> | |
%14 = affine.apply #map4()[%0] | |
%15 = affine.apply #map5()[%1] | |
%16 = arith.divui %14, %c128 : index | |
%17 = arith.divui %15, %c64 : index | |
%expanded = tensor.expand_shape %padded [[0, 1], [2, 3]] output_shape [%16, 128, %17, 64] : tensor<?x?xi8> into tensor<?x128x?x64xi8> | |
%transposed = linalg.transpose ins(%expanded : tensor<?x128x?x64xi8>) outs(%11 : tensor<?x?x128x64xi8>) permutation = [0, 2, 1, 3] | |
%expanded_0 = tensor.expand_shape %transposed [[0], [1], [2, 3], [4, 5, 6]] output_shape [%9, %10, 8, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x8x16x2x4x8xi8> | |
%18 = tensor.empty(%9, %10) : tensor<?x?x8x4x16x2x8xi8> | |
%transposed_1 = linalg.transpose ins(%expanded_0 : tensor<?x?x8x16x2x4x8xi8>) outs(%18 : tensor<?x?x8x4x16x2x8xi8>) permutation = [0, 1, 2, 5, 3, 4, 6] | |
%19 = affine.apply #map()[%4] | |
%20 = affine.apply #map1()[%3] | |
%21 = tensor.empty(%19, %20) : tensor<?x?x128x64xi8> | |
%22 = affine.apply #map2()[%4] | |
%23 = affine.apply #map3()[%3] | |
%padded_2 = tensor.pad %5 low[0, 0] high[%23, %22] { | |
^bb0(%arg3: index, %arg4: index): | |
tensor.yield %c0_i8 : i8 | |
} : tensor<?x?xi8> to tensor<?x?xi8> | |
%24 = affine.apply #map5()[%3] | |
%25 = affine.apply #map4()[%4] | |
%26 = arith.divui %24, %c64 : index | |
%27 = arith.divui %25, %c128 : index | |
%expanded_3 = tensor.expand_shape %padded_2 [[0, 1], [2, 3]] output_shape [%26, 64, %27, 128] : tensor<?x?xi8> into tensor<?x64x?x128xi8> | |
%transposed_4 = linalg.transpose ins(%expanded_3 : tensor<?x64x?x128xi8>) outs(%21 : tensor<?x?x128x64xi8>) permutation = [2, 0, 3, 1] | |
%expanded_5 = tensor.expand_shape %transposed_4 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%19, %20, 4, 2, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x4x2x16x2x4x8xi8> | |
%28 = tensor.empty(%19, %20) : tensor<?x?x4x2x4x16x2x8xi8> | |
%transposed_6 = linalg.transpose ins(%expanded_5 : tensor<?x?x4x2x16x2x4x8xi8>) outs(%28 : tensor<?x?x4x2x4x16x2x8xi8>) permutation = [0, 1, 2, 3, 6, 4, 5, 7] | |
%29 = affine.apply #map()[%6] | |
%30 = affine.apply #map()[%7] | |
%31 = tensor.empty(%29, %30) : tensor<?x?x128x128xi32> | |
%32 = affine.apply #map2()[%6] | |
%33 = affine.apply #map2()[%7] | |
%padded_7 = tensor.pad %8 low[0, 0] high[%32, %33] { | |
^bb0(%arg3: index, %arg4: index): | |
tensor.yield %c0_i32 : i32 | |
} : tensor<?x?xi32> to tensor<?x?xi32> | |
%34 = affine.apply #map4()[%6] | |
%35 = affine.apply #map4()[%7] | |
%36 = arith.divui %34, %c128 : index | |
%37 = arith.divui %35, %c128 : index | |
%expanded_8 = tensor.expand_shape %padded_7 [[0, 1], [2, 3]] output_shape [%36, 128, %37, 128] : tensor<?x?xi32> into tensor<?x128x?x128xi32> | |
%transposed_9 = linalg.transpose ins(%expanded_8 : tensor<?x128x?x128xi32>) outs(%31 : tensor<?x?x128x128xi32>) permutation = [0, 2, 1, 3] | |
%expanded_10 = tensor.expand_shape %transposed_9 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%29, %30, 8, 4, 4, 4, 2, 16] : tensor<?x?x128x128xi32> into tensor<?x?x8x4x4x4x2x16xi32> | |
%38 = tensor.empty(%29, %30) : tensor<?x?x8x4x2x4x16x4xi32> | |
%transposed_11 = linalg.transpose ins(%expanded_10 : tensor<?x?x8x4x4x4x2x16xi32>) outs(%38 : tensor<?x?x8x4x2x4x16x4xi32>) permutation = [0, 1, 2, 5, 6, 3, 7, 4] | |
%39 = iree_gpu.multi_mma %transposed_1, %transposed_6, %transposed_11 {indexing_maps = [#map6, #map7, #map8], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
%40 = tensor.empty(%29, %30) : tensor<?x?x8x4x4x4x2x16xi32> | |
%transposed_12 = linalg.transpose ins(%39 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%40 : tensor<?x?x8x4x4x4x2x16xi32>) permutation = [0, 1, 2, 5, 7, 3, 4, 6] | |
%collapsed = tensor.collapse_shape %transposed_12 [[0], [1], [2, 3, 4], [5, 6, 7]] : tensor<?x?x8x4x4x4x2x16xi32> into tensor<?x?x128x128xi32> | |
%41 = tensor.empty(%6, %7) : tensor<?x?xi32> | |
%42 = tensor.empty(%29, %30) : tensor<?x128x?x128xi32> | |
%transposed_13 = linalg.transpose ins(%collapsed : tensor<?x?x128x128xi32>) outs(%42 : tensor<?x128x?x128xi32>) permutation = [0, 2, 1, 3] | |
%collapsed_14 = tensor.collapse_shape %transposed_13 [[0, 1], [2, 3]] : tensor<?x128x?x128xi32> into tensor<?x?xi32> | |
%extracted_slice = tensor.extract_slice %collapsed_14[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> to tensor<?x?xi32> | |
%43 = linalg.copy ins(%extracted_slice : tensor<?x?xi32>) outs(%41 : tensor<?x?xi32>) -> tensor<?x?xi32> | |
%44 = hal.tensor.export %43 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %44 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
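// CSE makes no further changes here: after canonicalization each repeated affine.apply | |
// already has a single SSA value, so this dump is identical to the previous one. | |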
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>, ukernels = "none"}> | |
#map = affine_map<()[s0] -> (s0 ceildiv 128)> | |
#map1 = affine_map<()[s0] -> (s0 ceildiv 64)> | |
#map2 = affine_map<()[s0] -> (-s0 + (s0 ceildiv 128) * 128)> | |
#map3 = affine_map<()[s0] -> (-s0 + (s0 ceildiv 64) * 64)> | |
#map4 = affine_map<()[s0] -> ((s0 ceildiv 128) * 128)> | |
#map5 = affine_map<()[s0] -> ((s0 ceildiv 64) * 64)> | |
#map6 = affine_map<(d0, d1, d2) -> (d0, d2)> | |
#map7 = affine_map<(d0, d1, d2) -> (d1, d2)> | |
#map8 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
#device_target_hip = #hal.device.target<"hip", {legacy_sync}, [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c64 = arith.constant 64 : index | |
%c128 = arith.constant 128 : index | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%9 = affine.apply #map()[%0] | |
%10 = affine.apply #map1()[%1] | |
%11 = tensor.empty(%9, %10) : tensor<?x?x128x64xi8> | |
%12 = affine.apply #map2()[%0] | |
%13 = affine.apply #map3()[%1] | |
%padded = tensor.pad %2 low[0, 0] high[%12, %13] { | |
^bb0(%arg3: index, %arg4: index): | |
tensor.yield %c0_i8 : i8 | |
} : tensor<?x?xi8> to tensor<?x?xi8> | |
%14 = affine.apply #map4()[%0] | |
%15 = affine.apply #map5()[%1] | |
%16 = arith.divui %14, %c128 : index | |
%17 = arith.divui %15, %c64 : index | |
%expanded = tensor.expand_shape %padded [[0, 1], [2, 3]] output_shape [%16, 128, %17, 64] : tensor<?x?xi8> into tensor<?x128x?x64xi8> | |
%transposed = linalg.transpose ins(%expanded : tensor<?x128x?x64xi8>) outs(%11 : tensor<?x?x128x64xi8>) permutation = [0, 2, 1, 3] | |
%expanded_0 = tensor.expand_shape %transposed [[0], [1], [2, 3], [4, 5, 6]] output_shape [%9, %10, 8, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x8x16x2x4x8xi8> | |
%18 = tensor.empty(%9, %10) : tensor<?x?x8x4x16x2x8xi8> | |
%transposed_1 = linalg.transpose ins(%expanded_0 : tensor<?x?x8x16x2x4x8xi8>) outs(%18 : tensor<?x?x8x4x16x2x8xi8>) permutation = [0, 1, 2, 5, 3, 4, 6] | |
%19 = affine.apply #map()[%4] | |
%20 = affine.apply #map1()[%3] | |
%21 = tensor.empty(%19, %20) : tensor<?x?x128x64xi8> | |
%22 = affine.apply #map2()[%4] | |
%23 = affine.apply #map3()[%3] | |
%padded_2 = tensor.pad %5 low[0, 0] high[%23, %22] { | |
^bb0(%arg3: index, %arg4: index): | |
tensor.yield %c0_i8 : i8 | |
} : tensor<?x?xi8> to tensor<?x?xi8> | |
%24 = affine.apply #map5()[%3] | |
%25 = affine.apply #map4()[%4] | |
%26 = arith.divui %24, %c64 : index | |
%27 = arith.divui %25, %c128 : index | |
%expanded_3 = tensor.expand_shape %padded_2 [[0, 1], [2, 3]] output_shape [%26, 64, %27, 128] : tensor<?x?xi8> into tensor<?x64x?x128xi8> | |
%transposed_4 = linalg.transpose ins(%expanded_3 : tensor<?x64x?x128xi8>) outs(%21 : tensor<?x?x128x64xi8>) permutation = [2, 0, 3, 1] | |
%expanded_5 = tensor.expand_shape %transposed_4 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%19, %20, 4, 2, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x4x2x16x2x4x8xi8> | |
%28 = tensor.empty(%19, %20) : tensor<?x?x4x2x4x16x2x8xi8> | |
%transposed_6 = linalg.transpose ins(%expanded_5 : tensor<?x?x4x2x16x2x4x8xi8>) outs(%28 : tensor<?x?x4x2x4x16x2x8xi8>) permutation = [0, 1, 2, 3, 6, 4, 5, 7] | |
%29 = affine.apply #map()[%6] | |
%30 = affine.apply #map()[%7] | |
%31 = tensor.empty(%29, %30) : tensor<?x?x128x128xi32> | |
%32 = affine.apply #map2()[%6] | |
%33 = affine.apply #map2()[%7] | |
%padded_7 = tensor.pad %8 low[0, 0] high[%32, %33] { | |
^bb0(%arg3: index, %arg4: index): | |
tensor.yield %c0_i32 : i32 | |
} : tensor<?x?xi32> to tensor<?x?xi32> | |
%34 = affine.apply #map4()[%6] | |
%35 = affine.apply #map4()[%7] | |
%36 = arith.divui %34, %c128 : index | |
%37 = arith.divui %35, %c128 : index | |
%expanded_8 = tensor.expand_shape %padded_7 [[0, 1], [2, 3]] output_shape [%36, 128, %37, 128] : tensor<?x?xi32> into tensor<?x128x?x128xi32> | |
%transposed_9 = linalg.transpose ins(%expanded_8 : tensor<?x128x?x128xi32>) outs(%31 : tensor<?x?x128x128xi32>) permutation = [0, 2, 1, 3] | |
%expanded_10 = tensor.expand_shape %transposed_9 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%29, %30, 8, 4, 4, 4, 2, 16] : tensor<?x?x128x128xi32> into tensor<?x?x8x4x4x4x2x16xi32> | |
%38 = tensor.empty(%29, %30) : tensor<?x?x8x4x2x4x16x4xi32> | |
%transposed_11 = linalg.transpose ins(%expanded_10 : tensor<?x?x8x4x4x4x2x16xi32>) outs(%38 : tensor<?x?x8x4x2x4x16x4xi32>) permutation = [0, 1, 2, 5, 6, 3, 7, 4] | |
%39 = iree_gpu.multi_mma %transposed_1, %transposed_6, %transposed_11 {indexing_maps = [#map6, #map7, #map8], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
%40 = tensor.empty(%29, %30) : tensor<?x?x8x4x4x4x2x16xi32> | |
%transposed_12 = linalg.transpose ins(%39 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%40 : tensor<?x?x8x4x4x4x2x16xi32>) permutation = [0, 1, 2, 5, 7, 3, 4, 6] | |
%collapsed = tensor.collapse_shape %transposed_12 [[0], [1], [2, 3, 4], [5, 6, 7]] : tensor<?x?x8x4x4x4x2x16xi32> into tensor<?x?x128x128xi32> | |
%41 = tensor.empty(%6, %7) : tensor<?x?xi32> | |
%42 = tensor.empty(%29, %30) : tensor<?x128x?x128xi32> | |
%transposed_13 = linalg.transpose ins(%collapsed : tensor<?x?x128x128xi32>) outs(%42 : tensor<?x128x?x128xi32>) permutation = [0, 2, 1, 3] | |
%collapsed_14 = tensor.collapse_shape %transposed_13 [[0, 1], [2, 3]] : tensor<?x128x?x128xi32> into tensor<?x?xi32> | |
%extracted_slice = tensor.extract_slice %collapsed_14[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> to tensor<?x?xi32> | |
%43 = linalg.copy ins(%extracted_slice : tensor<?x?xi32>) outs(%41 : tensor<?x?xi32>) -> tensor<?x?xi32> | |
%44 = hal.tensor.export %43 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %44 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After SimplifyPackUnpackPass (iree-global-opt-simplify-pack-unpack) //----- // | |
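// No tensor.pack/unpack ops remain (they were decomposed earlier), so there is nothing | |
// for this pass to simplify and the IR is unchanged. | |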
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>, ukernels = "none"}> | |
#map = affine_map<()[s0] -> (s0 ceildiv 128)> | |
#map1 = affine_map<()[s0] -> (s0 ceildiv 64)> | |
#map2 = affine_map<()[s0] -> (-s0 + (s0 ceildiv 128) * 128)> | |
#map3 = affine_map<()[s0] -> (-s0 + (s0 ceildiv 64) * 64)> | |
#map4 = affine_map<()[s0] -> ((s0 ceildiv 128) * 128)> | |
#map5 = affine_map<()[s0] -> ((s0 ceildiv 64) * 64)> | |
#map6 = affine_map<(d0, d1, d2) -> (d0, d2)> | |
#map7 = affine_map<(d0, d1, d2) -> (d1, d2)> | |
#map8 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
#device_target_hip = #hal.device.target<"hip", {legacy_sync}, [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c64 = arith.constant 64 : index | |
%c128 = arith.constant 128 : index | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%9 = affine.apply #map()[%0] | |
%10 = affine.apply #map1()[%1] | |
%11 = tensor.empty(%9, %10) : tensor<?x?x128x64xi8> | |
%12 = affine.apply #map2()[%0] | |
%13 = affine.apply #map3()[%1] | |
%padded = tensor.pad %2 low[0, 0] high[%12, %13] { | |
^bb0(%arg3: index, %arg4: index): | |
tensor.yield %c0_i8 : i8 | |
} : tensor<?x?xi8> to tensor<?x?xi8> | |
%14 = affine.apply #map4()[%0] | |
%15 = affine.apply #map5()[%1] | |
%16 = arith.divui %14, %c128 : index | |
%17 = arith.divui %15, %c64 : index | |
%expanded = tensor.expand_shape %padded [[0, 1], [2, 3]] output_shape [%16, 128, %17, 64] : tensor<?x?xi8> into tensor<?x128x?x64xi8> | |
%transposed = linalg.transpose ins(%expanded : tensor<?x128x?x64xi8>) outs(%11 : tensor<?x?x128x64xi8>) permutation = [0, 2, 1, 3] | |
%expanded_0 = tensor.expand_shape %transposed [[0], [1], [2, 3], [4, 5, 6]] output_shape [%9, %10, 8, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x8x16x2x4x8xi8> | |
%18 = tensor.empty(%9, %10) : tensor<?x?x8x4x16x2x8xi8> | |
%transposed_1 = linalg.transpose ins(%expanded_0 : tensor<?x?x8x16x2x4x8xi8>) outs(%18 : tensor<?x?x8x4x16x2x8xi8>) permutation = [0, 1, 2, 5, 3, 4, 6] | |
%19 = affine.apply #map()[%4] | |
%20 = affine.apply #map1()[%3] | |
%21 = tensor.empty(%19, %20) : tensor<?x?x128x64xi8> | |
%22 = affine.apply #map2()[%4] | |
%23 = affine.apply #map3()[%3] | |
%padded_2 = tensor.pad %5 low[0, 0] high[%23, %22] { | |
^bb0(%arg3: index, %arg4: index): | |
tensor.yield %c0_i8 : i8 | |
} : tensor<?x?xi8> to tensor<?x?xi8> | |
%24 = affine.apply #map5()[%3] | |
%25 = affine.apply #map4()[%4] | |
%26 = arith.divui %24, %c64 : index | |
%27 = arith.divui %25, %c128 : index | |
%expanded_3 = tensor.expand_shape %padded_2 [[0, 1], [2, 3]] output_shape [%26, 64, %27, 128] : tensor<?x?xi8> into tensor<?x64x?x128xi8> | |
%transposed_4 = linalg.transpose ins(%expanded_3 : tensor<?x64x?x128xi8>) outs(%21 : tensor<?x?x128x64xi8>) permutation = [2, 0, 3, 1] | |
%expanded_5 = tensor.expand_shape %transposed_4 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%19, %20, 4, 2, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x4x2x16x2x4x8xi8> | |
%28 = tensor.empty(%19, %20) : tensor<?x?x4x2x4x16x2x8xi8> | |
%transposed_6 = linalg.transpose ins(%expanded_5 : tensor<?x?x4x2x16x2x4x8xi8>) outs(%28 : tensor<?x?x4x2x4x16x2x8xi8>) permutation = [0, 1, 2, 3, 6, 4, 5, 7] | |
%29 = affine.apply #map()[%6] | |
%30 = affine.apply #map()[%7] | |
%31 = tensor.empty(%29, %30) : tensor<?x?x128x128xi32> | |
%32 = affine.apply #map2()[%6] | |
%33 = affine.apply #map2()[%7] | |
%padded_7 = tensor.pad %8 low[0, 0] high[%32, %33] { | |
^bb0(%arg3: index, %arg4: index): | |
tensor.yield %c0_i32 : i32 | |
} : tensor<?x?xi32> to tensor<?x?xi32> | |
%34 = affine.apply #map4()[%6] | |
%35 = affine.apply #map4()[%7] | |
%36 = arith.divui %34, %c128 : index | |
%37 = arith.divui %35, %c128 : index | |
%expanded_8 = tensor.expand_shape %padded_7 [[0, 1], [2, 3]] output_shape [%36, 128, %37, 128] : tensor<?x?xi32> into tensor<?x128x?x128xi32> | |
%transposed_9 = linalg.transpose ins(%expanded_8 : tensor<?x128x?x128xi32>) outs(%31 : tensor<?x?x128x128xi32>) permutation = [0, 2, 1, 3] | |
%expanded_10 = tensor.expand_shape %transposed_9 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%29, %30, 8, 4, 4, 4, 2, 16] : tensor<?x?x128x128xi32> into tensor<?x?x8x4x4x4x2x16xi32> | |
%38 = tensor.empty(%29, %30) : tensor<?x?x8x4x2x4x16x4xi32> | |
%transposed_11 = linalg.transpose ins(%expanded_10 : tensor<?x?x8x4x4x4x2x16xi32>) outs(%38 : tensor<?x?x8x4x2x4x16x4xi32>) permutation = [0, 1, 2, 5, 6, 3, 7, 4] | |
%39 = iree_gpu.multi_mma %transposed_1, %transposed_6, %transposed_11 {indexing_maps = [#map6, #map7, #map8], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
%40 = tensor.empty(%29, %30) : tensor<?x?x8x4x4x4x2x16xi32> | |
%transposed_12 = linalg.transpose ins(%39 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%40 : tensor<?x?x8x4x4x4x2x16xi32>) permutation = [0, 1, 2, 5, 7, 3, 4, 6] | |
%collapsed = tensor.collapse_shape %transposed_12 [[0], [1], [2, 3, 4], [5, 6, 7]] : tensor<?x?x8x4x4x4x2x16xi32> into tensor<?x?x128x128xi32> | |
%41 = tensor.empty(%6, %7) : tensor<?x?xi32> | |
%42 = tensor.empty(%29, %30) : tensor<?x128x?x128xi32> | |
%transposed_13 = linalg.transpose ins(%collapsed : tensor<?x?x128x128xi32>) outs(%42 : tensor<?x128x?x128xi32>) permutation = [0, 2, 1, 3] | |
%collapsed_14 = tensor.collapse_shape %transposed_13 [[0, 1], [2, 3]] : tensor<?x128x?x128xi32> into tensor<?x?xi32> | |
%extracted_slice = tensor.extract_slice %collapsed_14[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> to tensor<?x?xi32> | |
%43 = linalg.copy ins(%extracted_slice : tensor<?x?xi32>) outs(%41 : tensor<?x?xi32>) -> tensor<?x?xi32> | |
%44 = hal.tensor.export %43 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %44 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After DataLayoutPropagationPass (iree-global-opt-data-layout-propagation) //----- // | |
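// Data-layout propagation also finds nothing to do; the dump is printed at function | |
// scope, which is why the affine maps now appear inline rather than as #map aliases. | |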
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c64 = arith.constant 64 : index | |
%c128 = arith.constant 128 : index | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%0] | |
%10 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%1] | |
%11 = tensor.empty(%9, %10) : tensor<?x?x128x64xi8> | |
%12 = affine.apply affine_map<()[s0] -> (-s0 + (s0 ceildiv 128) * 128)>()[%0] | |
%13 = affine.apply affine_map<()[s0] -> (-s0 + (s0 ceildiv 64) * 64)>()[%1] | |
%padded = tensor.pad %2 low[0, 0] high[%12, %13] { | |
^bb0(%arg3: index, %arg4: index): | |
tensor.yield %c0_i8 : i8 | |
} : tensor<?x?xi8> to tensor<?x?xi8> | |
%14 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%0] | |
%15 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%1] | |
%16 = arith.divui %14, %c128 : index | |
%17 = arith.divui %15, %c64 : index | |
%expanded = tensor.expand_shape %padded [[0, 1], [2, 3]] output_shape [%16, 128, %17, 64] : tensor<?x?xi8> into tensor<?x128x?x64xi8> | |
%transposed = linalg.transpose ins(%expanded : tensor<?x128x?x64xi8>) outs(%11 : tensor<?x?x128x64xi8>) permutation = [0, 2, 1, 3] | |
%expanded_0 = tensor.expand_shape %transposed [[0], [1], [2, 3], [4, 5, 6]] output_shape [%9, %10, 8, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x8x16x2x4x8xi8> | |
%18 = tensor.empty(%9, %10) : tensor<?x?x8x4x16x2x8xi8> | |
%transposed_1 = linalg.transpose ins(%expanded_0 : tensor<?x?x8x16x2x4x8xi8>) outs(%18 : tensor<?x?x8x4x16x2x8xi8>) permutation = [0, 1, 2, 5, 3, 4, 6] | |
%19 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%4] | |
%20 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%3] | |
%21 = tensor.empty(%19, %20) : tensor<?x?x128x64xi8> | |
%22 = affine.apply affine_map<()[s0] -> (-s0 + (s0 ceildiv 128) * 128)>()[%4] | |
%23 = affine.apply affine_map<()[s0] -> (-s0 + (s0 ceildiv 64) * 64)>()[%3] | |
%padded_2 = tensor.pad %5 low[0, 0] high[%23, %22] { | |
^bb0(%arg3: index, %arg4: index): | |
tensor.yield %c0_i8 : i8 | |
} : tensor<?x?xi8> to tensor<?x?xi8> | |
%24 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%3] | |
%25 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%4] | |
%26 = arith.divui %24, %c64 : index | |
%27 = arith.divui %25, %c128 : index | |
%expanded_3 = tensor.expand_shape %padded_2 [[0, 1], [2, 3]] output_shape [%26, 64, %27, 128] : tensor<?x?xi8> into tensor<?x64x?x128xi8> | |
%transposed_4 = linalg.transpose ins(%expanded_3 : tensor<?x64x?x128xi8>) outs(%21 : tensor<?x?x128x64xi8>) permutation = [2, 0, 3, 1] | |
%expanded_5 = tensor.expand_shape %transposed_4 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%19, %20, 4, 2, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x4x2x16x2x4x8xi8> | |
%28 = tensor.empty(%19, %20) : tensor<?x?x4x2x4x16x2x8xi8> | |
%transposed_6 = linalg.transpose ins(%expanded_5 : tensor<?x?x4x2x16x2x4x8xi8>) outs(%28 : tensor<?x?x4x2x4x16x2x8xi8>) permutation = [0, 1, 2, 3, 6, 4, 5, 7] | |
%29 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%6] | |
%30 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%7] | |
%31 = tensor.empty(%29, %30) : tensor<?x?x128x128xi32> | |
%32 = affine.apply affine_map<()[s0] -> (-s0 + (s0 ceildiv 128) * 128)>()[%6] | |
%33 = affine.apply affine_map<()[s0] -> (-s0 + (s0 ceildiv 128) * 128)>()[%7] | |
%padded_7 = tensor.pad %8 low[0, 0] high[%32, %33] { | |
^bb0(%arg3: index, %arg4: index): | |
tensor.yield %c0_i32 : i32 | |
} : tensor<?x?xi32> to tensor<?x?xi32> | |
%34 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%6] | |
%35 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%7] | |
%36 = arith.divui %34, %c128 : index | |
%37 = arith.divui %35, %c128 : index | |
%expanded_8 = tensor.expand_shape %padded_7 [[0, 1], [2, 3]] output_shape [%36, 128, %37, 128] : tensor<?x?xi32> into tensor<?x128x?x128xi32> | |
%transposed_9 = linalg.transpose ins(%expanded_8 : tensor<?x128x?x128xi32>) outs(%31 : tensor<?x?x128x128xi32>) permutation = [0, 2, 1, 3] | |
%expanded_10 = tensor.expand_shape %transposed_9 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%29, %30, 8, 4, 4, 4, 2, 16] : tensor<?x?x128x128xi32> into tensor<?x?x8x4x4x4x2x16xi32> | |
%38 = tensor.empty(%29, %30) : tensor<?x?x8x4x2x4x16x4xi32> | |
%transposed_11 = linalg.transpose ins(%expanded_10 : tensor<?x?x8x4x4x4x2x16xi32>) outs(%38 : tensor<?x?x8x4x2x4x16x4xi32>) permutation = [0, 1, 2, 5, 6, 3, 7, 4] | |
%39 = iree_gpu.multi_mma %transposed_1, %transposed_6, %transposed_11 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
%40 = tensor.empty(%29, %30) : tensor<?x?x8x4x4x4x2x16xi32> | |
%transposed_12 = linalg.transpose ins(%39 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%40 : tensor<?x?x8x4x4x4x2x16xi32>) permutation = [0, 1, 2, 5, 7, 3, 4, 6] | |
%collapsed = tensor.collapse_shape %transposed_12 [[0], [1], [2, 3, 4], [5, 6, 7]] : tensor<?x?x8x4x4x4x2x16xi32> into tensor<?x?x128x128xi32> | |
%41 = tensor.empty(%6, %7) : tensor<?x?xi32> | |
%42 = tensor.empty(%29, %30) : tensor<?x128x?x128xi32> | |
%transposed_13 = linalg.transpose ins(%collapsed : tensor<?x?x128x128xi32>) outs(%42 : tensor<?x128x?x128xi32>) permutation = [0, 2, 1, 3] | |
%collapsed_14 = tensor.collapse_shape %transposed_13 [[0, 1], [2, 3]] : tensor<?x128x?x128xi32> into tensor<?x?xi32> | |
%extracted_slice = tensor.extract_slice %collapsed_14[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> to tensor<?x?xi32> | |
%43 = linalg.copy ins(%extracted_slice : tensor<?x?xi32>) outs(%41 : tensor<?x?xi32>) -> tensor<?x?xi32> | |
%44 = hal.tensor.export %43 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %44 : !hal.buffer_view | |
} | |
// -----// IR Dump After GeneralizeLinalgNamedOpsPass (iree-global-opt-generalize-linalg-named-ops) //----- // | |
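// Each linalg.transpose is generalized into an equivalent linalg.generic whose input | |
// indexing map encodes the permutation, e.g. permutation [0, 2, 1, 3] becomes | |
// affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>. | |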
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c64 = arith.constant 64 : index | |
%c128 = arith.constant 128 : index | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%0] | |
%10 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%1] | |
%11 = tensor.empty(%9, %10) : tensor<?x?x128x64xi8> | |
%12 = affine.apply affine_map<()[s0] -> (-s0 + (s0 ceildiv 128) * 128)>()[%0] | |
%13 = affine.apply affine_map<()[s0] -> (-s0 + (s0 ceildiv 64) * 64)>()[%1] | |
%padded = tensor.pad %2 low[0, 0] high[%12, %13] { | |
^bb0(%arg3: index, %arg4: index): | |
tensor.yield %c0_i8 : i8 | |
} : tensor<?x?xi8> to tensor<?x?xi8> | |
%14 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%0] | |
%15 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%1] | |
%16 = arith.divui %14, %c128 : index | |
%17 = arith.divui %15, %c64 : index | |
%expanded = tensor.expand_shape %padded [[0, 1], [2, 3]] output_shape [%16, 128, %17, 64] : tensor<?x?xi8> into tensor<?x128x?x64xi8> | |
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded : tensor<?x128x?x64xi8>) outs(%11 : tensor<?x?x128x64xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x128x64xi8> | |
%expanded_0 = tensor.expand_shape %18 [[0], [1], [2, 3], [4, 5, 6]] output_shape [%9, %10, 8, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x8x16x2x4x8xi8> | |
%19 = tensor.empty(%9, %10) : tensor<?x?x8x4x16x2x8xi8> | |
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d4, d5, d3, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3, d4, d5, d6)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_0 : tensor<?x?x8x16x2x4x8xi8>) outs(%19 : tensor<?x?x8x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x8x4x16x2x8xi8> | |
%21 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%4] | |
%22 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%3] | |
%23 = tensor.empty(%21, %22) : tensor<?x?x128x64xi8> | |
%24 = affine.apply affine_map<()[s0] -> (-s0 + (s0 ceildiv 128) * 128)>()[%4] | |
%25 = affine.apply affine_map<()[s0] -> (-s0 + (s0 ceildiv 64) * 64)>()[%3] | |
%padded_1 = tensor.pad %5 low[0, 0] high[%25, %24] { | |
^bb0(%arg3: index, %arg4: index): | |
tensor.yield %c0_i8 : i8 | |
} : tensor<?x?xi8> to tensor<?x?xi8> | |
%26 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%3] | |
%27 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%4] | |
%28 = arith.divui %26, %c64 : index | |
%29 = arith.divui %27, %c128 : index | |
%expanded_2 = tensor.expand_shape %padded_1 [[0, 1], [2, 3]] output_shape [%28, 64, %29, 128] : tensor<?x?xi8> into tensor<?x64x?x128xi8> | |
%30 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d1, d3, d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2 : tensor<?x64x?x128xi8>) outs(%23 : tensor<?x?x128x64xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x128x64xi8> | |
%expanded_3 = tensor.expand_shape %30 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%21, %22, 4, 2, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x4x2x16x2x4x8xi8> | |
%31 = tensor.empty(%21, %22) : tensor<?x?x4x2x4x16x2x8xi8> | |
%32 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d5, d6, d4, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_3 : tensor<?x?x4x2x16x2x4x8xi8>) outs(%31 : tensor<?x?x4x2x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x4x2x4x16x2x8xi8> | |
%33 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%6] | |
%34 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%7] | |
%35 = tensor.empty(%33, %34) : tensor<?x?x128x128xi32> | |
%36 = affine.apply affine_map<()[s0] -> (-s0 + (s0 ceildiv 128) * 128)>()[%6] | |
%37 = affine.apply affine_map<()[s0] -> (-s0 + (s0 ceildiv 128) * 128)>()[%7] | |
%padded_4 = tensor.pad %8 low[0, 0] high[%36, %37] { | |
^bb0(%arg3: index, %arg4: index): | |
tensor.yield %c0_i32 : i32 | |
} : tensor<?x?xi32> to tensor<?x?xi32> | |
%38 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%6] | |
%39 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%7] | |
%40 = arith.divui %38, %c128 : index | |
%41 = arith.divui %39, %c128 : index | |
%expanded_5 = tensor.expand_shape %padded_4 [[0, 1], [2, 3]] output_shape [%40, 128, %41, 128] : tensor<?x?xi32> into tensor<?x128x?x128xi32> | |
%42 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_5 : tensor<?x128x?x128xi32>) outs(%35 : tensor<?x?x128x128xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x128x128xi32> | |
%expanded_6 = tensor.expand_shape %42 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%33, %34, 8, 4, 4, 4, 2, 16] : tensor<?x?x128x128xi32> into tensor<?x?x8x4x4x4x2x16xi32> | |
%43 = tensor.empty(%33, %34) : tensor<?x?x8x4x2x4x16x4xi32> | |
%44 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d7, d3, d4, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_6 : tensor<?x?x8x4x4x4x2x16xi32>) outs(%43 : tensor<?x?x8x4x2x4x16x4xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%45 = iree_gpu.multi_mma %20, %32, %44 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
%46 = tensor.empty(%33, %34) : tensor<?x?x8x4x4x4x2x16xi32> | |
%47 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%45 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%46 : tensor<?x?x8x4x4x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x4x4x2x16xi32> | |
%collapsed = tensor.collapse_shape %47 [[0], [1], [2, 3, 4], [5, 6, 7]] : tensor<?x?x8x4x4x4x2x16xi32> into tensor<?x?x128x128xi32> | |
%48 = tensor.empty(%6, %7) : tensor<?x?xi32> | |
%49 = tensor.empty(%33, %34) : tensor<?x128x?x128xi32> | |
%50 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%collapsed : tensor<?x?x128x128xi32>) outs(%49 : tensor<?x128x?x128xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x128x?x128xi32> | |
%collapsed_7 = tensor.collapse_shape %50 [[0, 1], [2, 3]] : tensor<?x128x?x128xi32> into tensor<?x?xi32> | |
%extracted_slice = tensor.extract_slice %collapsed_7[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> to tensor<?x?xi32> | |
%51 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%extracted_slice : tensor<?x?xi32>) outs(%48 : tensor<?x?xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?xi32> | |
%52 = hal.tensor.export %51 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %52 : !hal.buffer_view | |
} | |
// -----// IR Dump After GlobalLoopInvariantCodeMotionPass (iree-global-opt-loop-invariant-code-motion) //----- //
// (unchanged: function body identical to the preceding dump; duplicate elided)
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} {
%c64 = arith.constant 64 : index
%c128 = arith.constant 128 : index
%c0_i32 = arith.constant 0 : i32
%c0_i8 = arith.constant 0 : i8
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7}
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%0]
%10 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%1]
%11 = tensor.empty(%9, %10) : tensor<?x?x128x64xi8>
%12 = affine.apply affine_map<()[s0] -> (-s0 + (s0 ceildiv 128) * 128)>()[%0]
%13 = affine.apply affine_map<()[s0] -> (-s0 + (s0 ceildiv 64) * 64)>()[%1]
%padded = tensor.pad %2 low[0, 0] high[%12, %13] {
^bb0(%arg3: index, %arg4: index):
tensor.yield %c0_i8 : i8
} : tensor<?x?xi8> to tensor<?x?xi8>
%14 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%0]
%15 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%1]
%16 = arith.divui %14, %c128 : index
%17 = arith.divui %15, %c64 : index
%expanded = tensor.expand_shape %padded [[0, 1], [2, 3]] output_shape [%16, 128, %17, 64] : tensor<?x?xi8> into tensor<?x128x?x64xi8>
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded : tensor<?x128x?x64xi8>) outs(%11 : tensor<?x?x128x64xi8>) {
^bb0(%in: i8, %out: i8):
linalg.yield %in : i8
} -> tensor<?x?x128x64xi8>
%expanded_0 = tensor.expand_shape %18 [[0], [1], [2, 3], [4, 5, 6]] output_shape [%9, %10, 8, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x8x16x2x4x8xi8>
%19 = tensor.empty(%9, %10) : tensor<?x?x8x4x16x2x8xi8>
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d4, d5, d3, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3, d4, d5, d6)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_0 : tensor<?x?x8x16x2x4x8xi8>) outs(%19 : tensor<?x?x8x4x16x2x8xi8>) {
^bb0(%in: i8, %out: i8):
linalg.yield %in : i8
} -> tensor<?x?x8x4x16x2x8xi8>
%21 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%4]
%22 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%3]
%23 = tensor.empty(%21, %22) : tensor<?x?x128x64xi8>
%24 = affine.apply affine_map<()[s0] -> (-s0 + (s0 ceildiv 128) * 128)>()[%4]
%25 = affine.apply affine_map<()[s0] -> (-s0 + (s0 ceildiv 64) * 64)>()[%3]
%padded_1 = tensor.pad %5 low[0, 0] high[%25, %24] {
^bb0(%arg3: index, %arg4: index):
tensor.yield %c0_i8 : i8
} : tensor<?x?xi8> to tensor<?x?xi8>
%26 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%3]
%27 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%4]
%28 = arith.divui %26, %c64 : index
%29 = arith.divui %27, %c128 : index
%expanded_2 = tensor.expand_shape %padded_1 [[0, 1], [2, 3]] output_shape [%28, 64, %29, 128] : tensor<?x?xi8> into tensor<?x64x?x128xi8>
%30 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d1, d3, d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2 : tensor<?x64x?x128xi8>) outs(%23 : tensor<?x?x128x64xi8>) {
^bb0(%in: i8, %out: i8):
linalg.yield %in : i8
} -> tensor<?x?x128x64xi8>
%expanded_3 = tensor.expand_shape %30 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%21, %22, 4, 2, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x4x2x16x2x4x8xi8>
%31 = tensor.empty(%21, %22) : tensor<?x?x4x2x4x16x2x8xi8>
%32 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d5, d6, d4, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_3 : tensor<?x?x4x2x16x2x4x8xi8>) outs(%31 : tensor<?x?x4x2x4x16x2x8xi8>) {
^bb0(%in: i8, %out: i8):
linalg.yield %in : i8
} -> tensor<?x?x4x2x4x16x2x8xi8>
%33 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%6]
%34 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%7]
%35 = tensor.empty(%33, %34) : tensor<?x?x128x128xi32>
%36 = affine.apply affine_map<()[s0] -> (-s0 + (s0 ceildiv 128) * 128)>()[%6]
%37 = affine.apply affine_map<()[s0] -> (-s0 + (s0 ceildiv 128) * 128)>()[%7]
%padded_4 = tensor.pad %8 low[0, 0] high[%36, %37] {
^bb0(%arg3: index, %arg4: index):
tensor.yield %c0_i32 : i32
} : tensor<?x?xi32> to tensor<?x?xi32>
%38 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%6]
%39 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%7]
%40 = arith.divui %38, %c128 : index
%41 = arith.divui %39, %c128 : index
%expanded_5 = tensor.expand_shape %padded_4 [[0, 1], [2, 3]] output_shape [%40, 128, %41, 128] : tensor<?x?xi32> into tensor<?x128x?x128xi32>
%42 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_5 : tensor<?x128x?x128xi32>) outs(%35 : tensor<?x?x128x128xi32>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
} -> tensor<?x?x128x128xi32>
%expanded_6 = tensor.expand_shape %42 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%33, %34, 8, 4, 4, 4, 2, 16] : tensor<?x?x128x128xi32> into tensor<?x?x8x4x4x4x2x16xi32>
%43 = tensor.empty(%33, %34) : tensor<?x?x8x4x2x4x16x4xi32>
%44 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d7, d3, d4, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_6 : tensor<?x?x8x4x4x4x2x16xi32>) outs(%43 : tensor<?x?x8x4x2x4x16x4xi32>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
} -> tensor<?x?x8x4x2x4x16x4xi32>
%45 = iree_gpu.multi_mma %20, %32, %44 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32>
%46 = tensor.empty(%33, %34) : tensor<?x?x8x4x4x4x2x16xi32>
%47 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%45 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%46 : tensor<?x?x8x4x4x4x2x16xi32>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
} -> tensor<?x?x8x4x4x4x2x16xi32>
%collapsed = tensor.collapse_shape %47 [[0], [1], [2, 3, 4], [5, 6, 7]] : tensor<?x?x8x4x4x4x2x16xi32> into tensor<?x?x128x128xi32>
%48 = tensor.empty(%33, %34) : tensor<?x128x?x128xi32>
%49 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%collapsed : tensor<?x?x128x128xi32>) outs(%48 : tensor<?x128x?x128xi32>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
} -> tensor<?x128x?x128xi32>
%collapsed_7 = tensor.collapse_shape %49 [[0, 1], [2, 3]] : tensor<?x128x?x128xi32> into tensor<?x?xi32>
%extracted_slice = tensor.extract_slice %collapsed_7[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> to tensor<?x?xi32>
%50 = hal.tensor.export %extracted_slice "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view
util.return %50 : !hal.buffer_view
}
// -----// IR Dump After CSE (cse) //----- //
// (unchanged: function body identical to the preceding dump; duplicate elided)
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
// (unchanged: function body identical to the preceding dump; duplicate elided)
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>, ukernels = "none"}>
#map = affine_map<()[s0] -> (s0 ceildiv 128)>
#map1 = affine_map<()[s0] -> (s0 ceildiv 64)>
#map2 = affine_map<()[s0] -> (-s0 + (s0 ceildiv 128) * 128)>
#map3 = affine_map<()[s0] -> (-s0 + (s0 ceildiv 64) * 64)>
#map4 = affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>
#map5 = affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>
#map6 = affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>
#map7 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#map8 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d4, d5, d3, d6)>
#map9 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3, d4, d5, d6)>
#map10 = affine_map<(d0, d1, d2, d3) -> (d1, d3, d0, d2)>
#map11 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d5, d6, d4, d7)>
#map12 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>
#map13 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d7, d3, d4, d6)>
#map14 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map15 = affine_map<(d0, d1, d2) -> (d1, d2)>
#map16 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map17 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>
#device_target_hip = #hal.device.target<"hip", {legacy_sync}, [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} {
%c64 = arith.constant 64 : index
%c128 = arith.constant 128 : index
%c0_i32 = arith.constant 0 : i32
%c0_i8 = arith.constant 0 : i8
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1}
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4}
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7}
%9 = affine.apply #map()[%0]
%10 = affine.apply #map1()[%1]
%11 = tensor.empty(%9, %10) : tensor<?x?x128x64xi8>
%12 = affine.apply #map2()[%0]
%13 = affine.apply #map3()[%1]
%padded = tensor.pad %2 low[0, 0] high[%12, %13] {
^bb0(%arg3: index, %arg4: index):
tensor.yield %c0_i8 : i8
} : tensor<?x?xi8> to tensor<?x?xi8>
%14 = affine.apply #map4()[%0]
%15 = affine.apply #map5()[%1]
%16 = arith.divui %14, %c128 : index
%17 = arith.divui %15, %c64 : index
%expanded = tensor.expand_shape %padded [[0, 1], [2, 3]] output_shape [%16, 128, %17, 64] : tensor<?x?xi8> into tensor<?x128x?x64xi8>
%18 = linalg.generic {indexing_maps = [#map6, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded : tensor<?x128x?x64xi8>) outs(%11 : tensor<?x?x128x64xi8>) {
^bb0(%in: i8, %out: i8):
linalg.yield %in : i8
} -> tensor<?x?x128x64xi8>
%expanded_0 = tensor.expand_shape %18 [[0], [1], [2, 3], [4, 5, 6]] output_shape [%9, %10, 8, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x8x16x2x4x8xi8>
%19 = tensor.empty(%9, %10) : tensor<?x?x8x4x16x2x8xi8>
%20 = linalg.generic {indexing_maps = [#map8, #map9], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_0 : tensor<?x?x8x16x2x4x8xi8>) outs(%19 : tensor<?x?x8x4x16x2x8xi8>) {
^bb0(%in: i8, %out: i8):
linalg.yield %in : i8
} -> tensor<?x?x8x4x16x2x8xi8>
%21 = affine.apply #map()[%4]
%22 = affine.apply #map1()[%3]
%23 = tensor.empty(%21, %22) : tensor<?x?x128x64xi8>
%24 = affine.apply #map2()[%4]
%25 = affine.apply #map3()[%3]
%padded_1 = tensor.pad %5 low[0, 0] high[%25, %24] {
^bb0(%arg3: index, %arg4: index):
tensor.yield %c0_i8 : i8
} : tensor<?x?xi8> to tensor<?x?xi8>
%26 = affine.apply #map5()[%3]
%27 = affine.apply #map4()[%4]
%28 = arith.divui %26, %c64 : index
%29 = arith.divui %27, %c128 : index
%expanded_2 = tensor.expand_shape %padded_1 [[0, 1], [2, 3]] output_shape [%28, 64, %29, 128] : tensor<?x?xi8> into tensor<?x64x?x128xi8>
%30 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2 : tensor<?x64x?x128xi8>) outs(%23 : tensor<?x?x128x64xi8>) {
^bb0(%in: i8, %out: i8):
linalg.yield %in : i8
} -> tensor<?x?x128x64xi8>
%expanded_3 = tensor.expand_shape %30 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%21, %22, 4, 2, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x4x2x16x2x4x8xi8>
%31 = tensor.empty(%21, %22) : tensor<?x?x4x2x4x16x2x8xi8>
%32 = linalg.generic {indexing_maps = [#map11, #map12], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_3 : tensor<?x?x4x2x16x2x4x8xi8>) outs(%31 : tensor<?x?x4x2x4x16x2x8xi8>) {
^bb0(%in: i8, %out: i8):
linalg.yield %in : i8
} -> tensor<?x?x4x2x4x16x2x8xi8>
%33 = affine.apply #map()[%6]
%34 = affine.apply #map()[%7]
%35 = tensor.empty(%33, %34) : tensor<?x?x128x128xi32>
%36 = affine.apply #map2()[%6]
%37 = affine.apply #map2()[%7]
%padded_4 = tensor.pad %8 low[0, 0] high[%36, %37] {
^bb0(%arg3: index, %arg4: index):
tensor.yield %c0_i32 : i32
} : tensor<?x?xi32> to tensor<?x?xi32>
%38 = affine.apply #map4()[%6]
%39 = affine.apply #map4()[%7]
%40 = arith.divui %38, %c128 : index
%41 = arith.divui %39, %c128 : index
%expanded_5 = tensor.expand_shape %padded_4 [[0, 1], [2, 3]] output_shape [%40, 128, %41, 128] : tensor<?x?xi32> into tensor<?x128x?x128xi32>
%42 = linalg.generic {indexing_maps = [#map6, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_5 : tensor<?x128x?x128xi32>) outs(%35 : tensor<?x?x128x128xi32>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
} -> tensor<?x?x128x128xi32>
%expanded_6 = tensor.expand_shape %42 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%33, %34, 8, 4, 4, 4, 2, 16] : tensor<?x?x128x128xi32> into tensor<?x?x8x4x4x4x2x16xi32>
%43 = tensor.empty(%33, %34) : tensor<?x?x8x4x2x4x16x4xi32>
%44 = linalg.generic {indexing_maps = [#map13, #map12], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_6 : tensor<?x?x8x4x4x4x2x16xi32>) outs(%43 : tensor<?x?x8x4x2x4x16x4xi32>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
} -> tensor<?x?x8x4x2x4x16x4xi32>
%45 = iree_gpu.multi_mma %20, %32, %44 {indexing_maps = [#map14, #map15, #map16], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32>
%46 = tensor.empty(%33, %34) : tensor<?x?x8x4x4x4x2x16xi32>
%47 = linalg.generic {indexing_maps = [#map17, #map12], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%45 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%46 : tensor<?x?x8x4x4x4x2x16xi32>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
} -> tensor<?x?x8x4x4x4x2x16xi32>
%collapsed = tensor.collapse_shape %47 [[0], [1], [2, 3, 4], [5, 6, 7]] : tensor<?x?x8x4x4x4x2x16xi32> into tensor<?x?x128x128xi32>
%48 = tensor.empty(%33, %34) : tensor<?x128x?x128xi32>
%49 = linalg.generic {indexing_maps = [#map6, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%collapsed : tensor<?x?x128x128xi32>) outs(%48 : tensor<?x128x?x128xi32>) {
^bb0(%in: i32, %out: i32):
linalg.yield %in : i32
} -> tensor<?x128x?x128xi32>
%collapsed_7 = tensor.collapse_shape %49 [[0, 1], [2, 3]] : tensor<?x128x?x128xi32> into tensor<?x?xi32>
%extracted_slice = tensor.extract_slice %collapsed_7[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> to tensor<?x?xi32>
%50 = hal.tensor.export %extracted_slice "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view
util.return %50 : !hal.buffer_view
}
}
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>, ukernels = "none"}> | |
#map = affine_map<()[s0] -> (s0 ceildiv 128)> | |
#map1 = affine_map<()[s0] -> (s0 ceildiv 64)> | |
#map2 = affine_map<()[s0] -> (-s0 + (s0 ceildiv 128) * 128)> | |
#map3 = affine_map<()[s0] -> (-s0 + (s0 ceildiv 64) * 64)> | |
#map4 = affine_map<()[s0] -> ((s0 ceildiv 128) * 128)> | |
#map5 = affine_map<()[s0] -> ((s0 ceildiv 64) * 64)> | |
#map6 = affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)> | |
#map7 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map8 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d4, d5, d3, d6)> | |
#map9 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3, d4, d5, d6)> | |
#map10 = affine_map<(d0, d1, d2, d3) -> (d1, d3, d0, d2)> | |
#map11 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d5, d6, d4, d7)> | |
#map12 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)> | |
#map13 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d7, d3, d4, d6)> | |
#map14 = affine_map<(d0, d1, d2) -> (d0, d2)> | |
#map15 = affine_map<(d0, d1, d2) -> (d1, d2)> | |
#map16 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
#map17 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)> | |
#device_target_hip = #hal.device.target<"hip", {legacy_sync}, [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_hip
  util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} {
    %c64 = arith.constant 64 : index
    %c128 = arith.constant 128 : index
    %c0_i32 = arith.constant 0 : i32
    %c0_i8 = arith.constant 0 : i8
    %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
    %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
    %2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1}
    %3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
    %4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
    %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4}
    %6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
    %7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
    %8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7}
    %9 = affine.apply #map()[%0]
    %10 = affine.apply #map1()[%1]
    %11 = tensor.empty(%9, %10) : tensor<?x?x128x64xi8>
    %12 = affine.apply #map2()[%0]
    %13 = affine.apply #map3()[%1]
    %padded = tensor.pad %2 low[0, 0] high[%12, %13] {
    ^bb0(%arg3: index, %arg4: index):
      tensor.yield %c0_i8 : i8
    } : tensor<?x?xi8> to tensor<?x?xi8>
    %14 = affine.apply #map4()[%0]
    %15 = affine.apply #map5()[%1]
    %16 = arith.divui %14, %c128 : index
    %17 = arith.divui %15, %c64 : index
    %expanded = tensor.expand_shape %padded [[0, 1], [2, 3]] output_shape [%16, 128, %17, 64] : tensor<?x?xi8> into tensor<?x128x?x64xi8>
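    // Annotation: %16 and %17 recover the tile counts from the padded sizes;
    // since %14 = (%0 ceildiv 128) * 128, %16 = %14 / 128 equals %9 (and %17
    // likewise equals %10).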
    %18 = linalg.generic {indexing_maps = [#map6, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded : tensor<?x128x?x64xi8>) outs(%11 : tensor<?x?x128x64xi8>) {
    ^bb0(%in: i8, %out: i8):
      linalg.yield %in : i8
    } -> tensor<?x?x128x64xi8>
    %expanded_0 = tensor.expand_shape %18 [[0], [1], [2, 3], [4, 5, 6]] output_shape [%9, %10, 8, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x8x16x2x4x8xi8>
    %19 = tensor.empty(%9, %10) : tensor<?x?x8x4x16x2x8xi8>
    %20 = linalg.generic {indexing_maps = [#map8, #map9], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_0 : tensor<?x?x8x16x2x4x8xi8>) outs(%19 : tensor<?x?x8x4x16x2x8xi8>) {
    ^bb0(%in: i8, %out: i8):
      linalg.yield %in : i8
    } -> tensor<?x?x8x4x16x2x8xi8>
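    // Annotation: this is the LHS packing. %2 (M x K, i8) is zero-padded to
    // multiples of 128 x 64, tiled into block-major 128x64 tiles (%18), and
    // each tile is then expanded and permuted into the 8x4x16x2x8 element
    // order that the data-tiled MFMA op below consumes (%20).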
    %21 = affine.apply #map()[%4]
    %22 = affine.apply #map1()[%3]
    %23 = tensor.empty(%21, %22) : tensor<?x?x128x64xi8>
    %24 = affine.apply #map2()[%4]
    %25 = affine.apply #map3()[%3]
    %padded_1 = tensor.pad %5 low[0, 0] high[%25, %24] {
    ^bb0(%arg3: index, %arg4: index):
      tensor.yield %c0_i8 : i8
    } : tensor<?x?xi8> to tensor<?x?xi8>
    %26 = affine.apply #map5()[%3]
    %27 = affine.apply #map4()[%4]
    %28 = arith.divui %26, %c64 : index
    %29 = arith.divui %27, %c128 : index
    %expanded_2 = tensor.expand_shape %padded_1 [[0, 1], [2, 3]] output_shape [%28, 64, %29, 128] : tensor<?x?xi8> into tensor<?x64x?x128xi8>
    %30 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2 : tensor<?x64x?x128xi8>) outs(%23 : tensor<?x?x128x64xi8>) {
    ^bb0(%in: i8, %out: i8):
      linalg.yield %in : i8
    } -> tensor<?x?x128x64xi8>
    %expanded_3 = tensor.expand_shape %30 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%21, %22, 4, 2, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x4x2x16x2x4x8xi8>
    %31 = tensor.empty(%21, %22) : tensor<?x?x4x2x4x16x2x8xi8>
    %32 = linalg.generic {indexing_maps = [#map11, #map12], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_3 : tensor<?x?x4x2x16x2x4x8xi8>) outs(%31 : tensor<?x?x4x2x4x16x2x8xi8>) {
    ^bb0(%in: i8, %out: i8):
      linalg.yield %in : i8
    } -> tensor<?x?x4x2x4x16x2x8xi8>
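    // Annotation: the mirrored RHS packing. %5 (K x N, i8) is padded to
    // multiples of 64 x 128 and transposed into N-major 128x64 tiles via
    // #map10 (%30), then permuted into the 4x2x4x16x2x8 intrinsic layout
    // (%32).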
    %33 = affine.apply #map()[%6]
    %34 = affine.apply #map()[%7]
    %35 = tensor.empty(%33, %34) : tensor<?x?x128x128xi32>
    %36 = affine.apply #map2()[%6]
    %37 = affine.apply #map2()[%7]
    %padded_4 = tensor.pad %8 low[0, 0] high[%36, %37] {
    ^bb0(%arg3: index, %arg4: index):
      tensor.yield %c0_i32 : i32
    } : tensor<?x?xi32> to tensor<?x?xi32>
    %38 = affine.apply #map4()[%6]
    %39 = affine.apply #map4()[%7]
    %40 = arith.divui %38, %c128 : index
    %41 = arith.divui %39, %c128 : index
    %expanded_5 = tensor.expand_shape %padded_4 [[0, 1], [2, 3]] output_shape [%40, 128, %41, 128] : tensor<?x?xi32> into tensor<?x128x?x128xi32>
    %42 = linalg.generic {indexing_maps = [#map6, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_5 : tensor<?x128x?x128xi32>) outs(%35 : tensor<?x?x128x128xi32>) {
    ^bb0(%in: i32, %out: i32):
      linalg.yield %in : i32
    } -> tensor<?x?x128x128xi32>
    %expanded_6 = tensor.expand_shape %42 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%33, %34, 8, 4, 4, 4, 2, 16] : tensor<?x?x128x128xi32> into tensor<?x?x8x4x4x4x2x16xi32>
    %43 = tensor.empty(%33, %34) : tensor<?x?x8x4x2x4x16x4xi32>
    %44 = linalg.generic {indexing_maps = [#map13, #map12], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_6 : tensor<?x?x8x4x4x4x2x16xi32>) outs(%43 : tensor<?x?x8x4x2x4x16x4xi32>) {
    ^bb0(%in: i32, %out: i32):
      linalg.yield %in : i32
    } -> tensor<?x?x8x4x2x4x16x4xi32>
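    // Annotation: accumulator packing. %8 (M x N, i32) is padded to multiples
    // of 128 x 128, tiled into 128x128 blocks (%42), and permuted into the
    // 8x4x2x4x16x4 accumulator layout of the intrinsic (%44).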
    %45 = iree_gpu.multi_mma %20, %32, %44 {indexing_maps = [#map14, #map15, #map16], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32>
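    // Annotation: the data-tiled matmul itself. #map14/#map15/#map16 are the
    // usual matmul maps over tiles ((m, k), (n, k), (m, n); the RHS is
    // consumed pre-transposed), and the layout attribute fixes the inner tile
    // shapes: with the MFMA_I32_16x16x32_I8 intrinsic, unroll_m = 8 gives an
    // M tile of 8 * 16 = 128, unroll_n = 2 with unroll_n_to_subgroups = 4
    // gives an N tile of 2 * 4 * 16 = 128, and unroll_k = 2 gives a K tile of
    // 2 * 32 = 64, matching the 128x64, 64x128, and 128x128 padded tiles
    // built above.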
    %46 = tensor.empty(%33, %34) : tensor<?x?x8x4x4x4x2x16xi32>
    %47 = linalg.generic {indexing_maps = [#map17, #map12], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%45 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%46 : tensor<?x?x8x4x4x4x2x16xi32>) {
    ^bb0(%in: i32, %out: i32):
      linalg.yield %in : i32
    } -> tensor<?x?x8x4x4x4x2x16xi32>
    %collapsed = tensor.collapse_shape %47 [[0], [1], [2, 3, 4], [5, 6, 7]] : tensor<?x?x8x4x4x4x2x16xi32> into tensor<?x?x128x128xi32>
    %48 = tensor.empty(%33, %34) : tensor<?x128x?x128xi32>
    %49 = linalg.generic {indexing_maps = [#map6, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%collapsed : tensor<?x?x128x128xi32>) outs(%48 : tensor<?x128x?x128xi32>) {
    ^bb0(%in: i32, %out: i32):
      linalg.yield %in : i32
    } -> tensor<?x128x?x128xi32>
    %collapsed_7 = tensor.collapse_shape %49 [[0, 1], [2, 3]] : tensor<?x128x?x128xi32> into tensor<?x?xi32>
    %extracted_slice = tensor.extract_slice %collapsed_7[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> to tensor<?x?xi32>
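    // Annotation: unpacking. The accumulator tiles are permuted back (%47),
    // collapsed into 128x128 blocks, transposed back to row-major tile order
    // (%49), flattened to the padded M x N matrix, and the original %6 x %7
    // result is sliced out before export.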
    %50 = hal.tensor.export %extracted_slice "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view
    util.return %50 : !hal.buffer_view
  }
}
// -----// IR Dump After IPO (iree-util-ipo) //----- //
// (module unchanged: verbatim duplicate of the dump after FoldGlobals above)
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
// (module unchanged: verbatim duplicate of the dump after FoldGlobals above)
// -----// IR Dump After CSE (cse) //----- //
// (module unchanged: verbatim duplicate of the dump after FoldGlobals above)
// -----// IR Dump After HoistIntoGlobals (iree-util-hoist-into-globals) //----- //
// (module unchanged: verbatim duplicate of the dump after FoldGlobals above)
// -----// IR Dump After JitGlobalsPass (iree-consteval-jit-globals) //----- //
// (module unchanged: verbatim duplicate of the dump after FoldGlobals above)
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
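// Annotation: this dump is printed at function rather than module granularity
// (presumably because the pass ran on the function), so the affine_map
// attributes appear inline instead of through the #map aliases; the IR is
// otherwise the same as above.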
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} {
  %c64 = arith.constant 64 : index
  %c128 = arith.constant 128 : index
  %c0_i32 = arith.constant 0 : i32
  %c0_i8 = arith.constant 0 : i8
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1}
  %3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
  %4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
  %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4}
  %6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
  %7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
  %8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7}
  %9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%0]
  %10 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%1]
  %11 = tensor.empty(%9, %10) : tensor<?x?x128x64xi8>
  %12 = affine.apply affine_map<()[s0] -> (-s0 + (s0 ceildiv 128) * 128)>()[%0]
  %13 = affine.apply affine_map<()[s0] -> (-s0 + (s0 ceildiv 64) * 64)>()[%1]
  %padded = tensor.pad %2 low[0, 0] high[%12, %13] {
  ^bb0(%arg3: index, %arg4: index):
    tensor.yield %c0_i8 : i8
  } : tensor<?x?xi8> to tensor<?x?xi8>
  %14 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%0]
  %15 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%1]
  %16 = arith.divui %14, %c128 : index
  %17 = arith.divui %15, %c64 : index
  %expanded = tensor.expand_shape %padded [[0, 1], [2, 3]] output_shape [%16, 128, %17, 64] : tensor<?x?xi8> into tensor<?x128x?x64xi8>
  %18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded : tensor<?x128x?x64xi8>) outs(%11 : tensor<?x?x128x64xi8>) {
  ^bb0(%in: i8, %out: i8):
    linalg.yield %in : i8
  } -> tensor<?x?x128x64xi8>
  %expanded_0 = tensor.expand_shape %18 [[0], [1], [2, 3], [4, 5, 6]] output_shape [%9, %10, 8, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x8x16x2x4x8xi8>
  %19 = tensor.empty(%9, %10) : tensor<?x?x8x4x16x2x8xi8>
  %20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d4, d5, d3, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3, d4, d5, d6)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_0 : tensor<?x?x8x16x2x4x8xi8>) outs(%19 : tensor<?x?x8x4x16x2x8xi8>) {
  ^bb0(%in: i8, %out: i8):
    linalg.yield %in : i8
  } -> tensor<?x?x8x4x16x2x8xi8>
  %21 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%4]
  %22 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%3]
  %23 = tensor.empty(%21, %22) : tensor<?x?x128x64xi8>
  %24 = affine.apply affine_map<()[s0] -> (-s0 + (s0 ceildiv 128) * 128)>()[%4]
  %25 = affine.apply affine_map<()[s0] -> (-s0 + (s0 ceildiv 64) * 64)>()[%3]
  %padded_1 = tensor.pad %5 low[0, 0] high[%25, %24] {
  ^bb0(%arg3: index, %arg4: index):
    tensor.yield %c0_i8 : i8
  } : tensor<?x?xi8> to tensor<?x?xi8>
  %26 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%3]
  %27 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%4]
  %28 = arith.divui %26, %c64 : index
  %29 = arith.divui %27, %c128 : index
  %expanded_2 = tensor.expand_shape %padded_1 [[0, 1], [2, 3]] output_shape [%28, 64, %29, 128] : tensor<?x?xi8> into tensor<?x64x?x128xi8>
  %30 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d1, d3, d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2 : tensor<?x64x?x128xi8>) outs(%23 : tensor<?x?x128x64xi8>) {
  ^bb0(%in: i8, %out: i8):
    linalg.yield %in : i8
  } -> tensor<?x?x128x64xi8>
  %expanded_3 = tensor.expand_shape %30 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%21, %22, 4, 2, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x4x2x16x2x4x8xi8>
  %31 = tensor.empty(%21, %22) : tensor<?x?x4x2x4x16x2x8xi8>
  %32 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d5, d6, d4, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_3 : tensor<?x?x4x2x16x2x4x8xi8>) outs(%31 : tensor<?x?x4x2x4x16x2x8xi8>) {
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x4x2x4x16x2x8xi8> | |
%33 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%6] | |
%34 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%7] | |
%35 = tensor.empty(%33, %34) : tensor<?x?x128x128xi32> | |
%36 = affine.apply affine_map<()[s0] -> (-s0 + (s0 ceildiv 128) * 128)>()[%6] | |
%37 = affine.apply affine_map<()[s0] -> (-s0 + (s0 ceildiv 128) * 128)>()[%7] | |
%padded_4 = tensor.pad %8 low[0, 0] high[%36, %37] { | |
^bb0(%arg3: index, %arg4: index): | |
tensor.yield %c0_i32 : i32 | |
} : tensor<?x?xi32> to tensor<?x?xi32> | |
%38 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%6] | |
%39 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%7] | |
%40 = arith.divui %38, %c128 : index | |
%41 = arith.divui %39, %c128 : index | |
%expanded_5 = tensor.expand_shape %padded_4 [[0, 1], [2, 3]] output_shape [%40, 128, %41, 128] : tensor<?x?xi32> into tensor<?x128x?x128xi32> | |
%42 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_5 : tensor<?x128x?x128xi32>) outs(%35 : tensor<?x?x128x128xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x128x128xi32> | |
%expanded_6 = tensor.expand_shape %42 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%33, %34, 8, 4, 4, 4, 2, 16] : tensor<?x?x128x128xi32> into tensor<?x?x8x4x4x4x2x16xi32> | |
%43 = tensor.empty(%33, %34) : tensor<?x?x8x4x2x4x16x4xi32> | |
%44 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d7, d3, d4, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_6 : tensor<?x?x8x4x4x4x2x16xi32>) outs(%43 : tensor<?x?x8x4x2x4x16x4xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%45 = iree_gpu.multi_mma %20, %32, %44 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
%46 = tensor.empty(%33, %34) : tensor<?x?x8x4x4x4x2x16xi32> | |
%47 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%45 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%46 : tensor<?x?x8x4x4x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x4x4x2x16xi32> | |
%collapsed = tensor.collapse_shape %47 [[0], [1], [2, 3, 4], [5, 6, 7]] : tensor<?x?x8x4x4x4x2x16xi32> into tensor<?x?x128x128xi32> | |
%48 = tensor.empty(%33, %34) : tensor<?x128x?x128xi32> | |
%49 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%collapsed : tensor<?x?x128x128xi32>) outs(%48 : tensor<?x128x?x128xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x128x?x128xi32> | |
%collapsed_7 = tensor.collapse_shape %49 [[0, 1], [2, 3]] : tensor<?x128x?x128xi32> into tensor<?x?xi32> | |
%extracted_slice = tensor.extract_slice %collapsed_7[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> to tensor<?x?xi32> | |
%50 = hal.tensor.export %extracted_slice "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %50 : !hal.buffer_view | |
} | |
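// The pack sequence that recurs in every dump above (tensor.pad to a tile
// multiple, tensor.expand_shape, then a transposing linalg.generic) has a
// compact reference model. A minimal numpy sketch of the LHS path, under the
// shapes shown above (pack_lhs is an ad hoc name, not an IREE API):

import numpy as np

def pack_lhs(a, tile_m=128, tile_k=64):
    m, k = a.shape
    pad_m = (-m) % tile_m  # high padding, cf. (-s0 + (s0 ceildiv 128) * 128)
    pad_k = (-k) % tile_k  # high padding, cf. (-s0 + (s0 ceildiv 64) * 64)
    a = np.pad(a, ((0, pad_m), (0, pad_k)))            # tensor.pad with 0
    a = a.reshape(a.shape[0] // tile_m, tile_m,
                  a.shape[1] // tile_k, tile_k)        # tensor.expand_shape
    return a.transpose(0, 2, 1, 3)  # linalg.generic, map (d0, d2, d1, d3)

a = np.ones((200, 100), dtype=np.int8)
print(pack_lhs(a).shape)  # (2, 2, 128, 64), i.e. tensor<?x?x128x64xi8>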
// -----// IR Dump After CSE (cse) //----- //
// (IR unchanged: the function body is identical to the dump above.)
// -----// IR Dump After RaiseSpecialOpsPass (iree-global-opt-raise-special-ops) //----- //
// (IR unchanged: the function body is identical to the dump above.)
// -----// IR Dump After InjectTensorTracingPass (iree-flow-inject-tensor-tracing) //----- //
// (IR unchanged: the function body is identical to the dump above.)
// -----// IR Dump After TensorPadToTensorInsertSlicePass (iree-dispatch-creation-tensor-pad-to-tensor-insert-slice) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>, ukernels = "none"}> | |
#map = affine_map<()[s0] -> (s0 ceildiv 128)> | |
#map1 = affine_map<()[s0] -> (s0 ceildiv 64)> | |
#map2 = affine_map<()[s0, s1] -> (-s0 + s1 + (s0 ceildiv 128) * 128)> | |
#map3 = affine_map<()[s0, s1] -> (-s0 + s1 + (s0 ceildiv 64) * 64)> | |
#map4 = affine_map<()[s0] -> ((s0 ceildiv 128) * 128)> | |
#map5 = affine_map<()[s0] -> ((s0 ceildiv 64) * 64)> | |
#map6 = affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)> | |
#map7 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map8 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d4, d5, d3, d6)> | |
#map9 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3, d4, d5, d6)> | |
#map10 = affine_map<(d0, d1, d2, d3) -> (d1, d3, d0, d2)> | |
#map11 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d5, d6, d4, d7)> | |
#map12 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)> | |
#map13 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d7, d3, d4, d6)> | |
#map14 = affine_map<(d0, d1, d2) -> (d0, d2)> | |
#map15 = affine_map<(d0, d1, d2) -> (d1, d2)> | |
#map16 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
#map17 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)> | |
#device_target_hip = #hal.device.target<"hip", {legacy_sync}, [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%c1 = arith.constant 1 : index | |
%c0 = arith.constant 0 : index | |
%c64 = arith.constant 64 : index | |
%c128 = arith.constant 128 : index | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%9 = affine.apply #map()[%0] | |
%10 = affine.apply #map1()[%1] | |
%11 = tensor.empty(%9, %10) : tensor<?x?x128x64xi8> | |
%dim = tensor.dim %2, %c0 : tensor<?x?xi8> | |
%12 = affine.apply #map2()[%0, %dim] | |
%dim_0 = tensor.dim %2, %c1 : tensor<?x?xi8> | |
%13 = affine.apply #map3()[%1, %dim_0] | |
%14 = tensor.empty(%12, %13) : tensor<?x?xi8> | |
%15 = linalg.fill ins(%c0_i8 : i8) outs(%14 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%dim_1 = tensor.dim %2, %c0 : tensor<?x?xi8> | |
%dim_2 = tensor.dim %2, %c1 : tensor<?x?xi8> | |
%inserted_slice = tensor.insert_slice %2 into %15[0, 0] [%dim_1, %dim_2] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%16 = affine.apply #map4()[%0] | |
%17 = affine.apply #map5()[%1] | |
%18 = arith.divui %16, %c128 : index | |
%19 = arith.divui %17, %c64 : index | |
%expanded = tensor.expand_shape %inserted_slice [[0, 1], [2, 3]] output_shape [%18, 128, %19, 64] : tensor<?x?xi8> into tensor<?x128x?x64xi8> | |
%20 = linalg.generic {indexing_maps = [#map6, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded : tensor<?x128x?x64xi8>) outs(%11 : tensor<?x?x128x64xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x128x64xi8> | |
%expanded_3 = tensor.expand_shape %20 [[0], [1], [2, 3], [4, 5, 6]] output_shape [%9, %10, 8, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x8x16x2x4x8xi8> | |
%21 = tensor.empty(%9, %10) : tensor<?x?x8x4x16x2x8xi8> | |
%22 = linalg.generic {indexing_maps = [#map8, #map9], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_3 : tensor<?x?x8x16x2x4x8xi8>) outs(%21 : tensor<?x?x8x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x8x4x16x2x8xi8> | |
%23 = affine.apply #map()[%4] | |
%24 = affine.apply #map1()[%3] | |
%25 = tensor.empty(%23, %24) : tensor<?x?x128x64xi8> | |
%dim_4 = tensor.dim %5, %c0 : tensor<?x?xi8> | |
%26 = affine.apply #map3()[%3, %dim_4] | |
%dim_5 = tensor.dim %5, %c1 : tensor<?x?xi8> | |
%27 = affine.apply #map2()[%4, %dim_5] | |
%28 = tensor.empty(%26, %27) : tensor<?x?xi8> | |
%29 = linalg.fill ins(%c0_i8 : i8) outs(%28 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%dim_6 = tensor.dim %5, %c0 : tensor<?x?xi8> | |
%dim_7 = tensor.dim %5, %c1 : tensor<?x?xi8> | |
%inserted_slice_8 = tensor.insert_slice %5 into %29[0, 0] [%dim_6, %dim_7] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%30 = affine.apply #map5()[%3] | |
%31 = affine.apply #map4()[%4] | |
%32 = arith.divui %30, %c64 : index | |
%33 = arith.divui %31, %c128 : index | |
%expanded_9 = tensor.expand_shape %inserted_slice_8 [[0, 1], [2, 3]] output_shape [%32, 64, %33, 128] : tensor<?x?xi8> into tensor<?x64x?x128xi8> | |
%34 = linalg.generic {indexing_maps = [#map10, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_9 : tensor<?x64x?x128xi8>) outs(%25 : tensor<?x?x128x64xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x128x64xi8> | |
%expanded_10 = tensor.expand_shape %34 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%23, %24, 4, 2, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x4x2x16x2x4x8xi8> | |
%35 = tensor.empty(%23, %24) : tensor<?x?x4x2x4x16x2x8xi8> | |
%36 = linalg.generic {indexing_maps = [#map11, #map12], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_10 : tensor<?x?x4x2x16x2x4x8xi8>) outs(%35 : tensor<?x?x4x2x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x4x2x4x16x2x8xi8> | |
%37 = affine.apply #map()[%6] | |
%38 = affine.apply #map()[%7] | |
%39 = tensor.empty(%37, %38) : tensor<?x?x128x128xi32> | |
%dim_11 = tensor.dim %8, %c0 : tensor<?x?xi32> | |
%40 = affine.apply #map2()[%6, %dim_11] | |
%dim_12 = tensor.dim %8, %c1 : tensor<?x?xi32> | |
%41 = affine.apply #map2()[%7, %dim_12] | |
%42 = tensor.empty(%40, %41) : tensor<?x?xi32> | |
%43 = linalg.fill ins(%c0_i32 : i32) outs(%42 : tensor<?x?xi32>) -> tensor<?x?xi32> | |
%dim_13 = tensor.dim %8, %c0 : tensor<?x?xi32> | |
%dim_14 = tensor.dim %8, %c1 : tensor<?x?xi32> | |
%inserted_slice_15 = tensor.insert_slice %8 into %43[0, 0] [%dim_13, %dim_14] [1, 1] : tensor<?x?xi32> into tensor<?x?xi32> | |
%44 = affine.apply #map4()[%6] | |
%45 = affine.apply #map4()[%7] | |
%46 = arith.divui %44, %c128 : index | |
%47 = arith.divui %45, %c128 : index | |
%expanded_16 = tensor.expand_shape %inserted_slice_15 [[0, 1], [2, 3]] output_shape [%46, 128, %47, 128] : tensor<?x?xi32> into tensor<?x128x?x128xi32> | |
%48 = linalg.generic {indexing_maps = [#map6, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_16 : tensor<?x128x?x128xi32>) outs(%39 : tensor<?x?x128x128xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x128x128xi32> | |
%expanded_17 = tensor.expand_shape %48 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%37, %38, 8, 4, 4, 4, 2, 16] : tensor<?x?x128x128xi32> into tensor<?x?x8x4x4x4x2x16xi32> | |
%49 = tensor.empty(%37, %38) : tensor<?x?x8x4x2x4x16x4xi32> | |
%50 = linalg.generic {indexing_maps = [#map13, #map12], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_17 : tensor<?x?x8x4x4x4x2x16xi32>) outs(%49 : tensor<?x?x8x4x2x4x16x4xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%51 = iree_gpu.multi_mma %22, %36, %50 {indexing_maps = [#map14, #map15, #map16], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
%52 = tensor.empty(%37, %38) : tensor<?x?x8x4x4x4x2x16xi32> | |
%53 = linalg.generic {indexing_maps = [#map17, #map12], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%51 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%52 : tensor<?x?x8x4x4x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x4x4x2x16xi32> | |
%collapsed = tensor.collapse_shape %53 [[0], [1], [2, 3, 4], [5, 6, 7]] : tensor<?x?x8x4x4x4x2x16xi32> into tensor<?x?x128x128xi32> | |
%54 = tensor.empty(%37, %38) : tensor<?x128x?x128xi32> | |
%55 = linalg.generic {indexing_maps = [#map6, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%collapsed : tensor<?x?x128x128xi32>) outs(%54 : tensor<?x128x?x128xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x128x?x128xi32> | |
%collapsed_18 = tensor.collapse_shape %55 [[0, 1], [2, 3]] : tensor<?x128x?x128xi32> into tensor<?x?xi32> | |
%extracted_slice = tensor.extract_slice %collapsed_18[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> to tensor<?x?xi32> | |
%56 = hal.tensor.export %extracted_slice "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %56 : !hal.buffer_view | |
} | |
} | |
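// TensorPadToTensorInsertSlicePass rewrote each tensor.pad above into a
// linalg.fill of the padding value plus a tensor.insert_slice at [0, 0].
// A numpy sketch of why the two forms agree (illustrative only; the helper
// names are mine):

import numpy as np

def pad_high(x, high0, high1, pad_value=0):
    # Reference semantics of tensor.pad low = [0, 0], high = [high0, high1].
    return np.pad(x, ((0, high0), (0, high1)), constant_values=pad_value)

def fill_and_insert(x, high0, high1, pad_value=0):
    # The rewritten form: linalg.fill, then tensor.insert_slice of x.
    d0, d1 = x.shape
    out = np.full((d0 + high0, d1 + high1), pad_value, x.dtype)  # linalg.fill
    out[0:d0, 0:d1] = x                            # tensor.insert_slice
    return out

x = np.arange(6, dtype=np.int8).reshape(2, 3)
assert np.array_equal(pad_high(x, 2, 1), fill_and_insert(x, 2, 1))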
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%c64 = arith.constant 64 : index | |
%c128 = arith.constant 128 : index | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%0] | |
%10 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%1] | |
%11 = tensor.empty(%9, %10) : tensor<?x?x128x64xi8> | |
%12 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%0] | |
%13 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%1] | |
%14 = tensor.empty(%12, %13) : tensor<?x?xi8> | |
%15 = linalg.fill ins(%c0_i8 : i8) outs(%14 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice = tensor.insert_slice %2 into %15[0, 0] [%0, %1] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%16 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%0] | |
%17 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%1] | |
%18 = arith.divui %16, %c128 : index | |
%19 = arith.divui %17, %c64 : index | |
%expanded = tensor.expand_shape %inserted_slice [[0, 1], [2, 3]] output_shape [%18, 128, %19, 64] : tensor<?x?xi8> into tensor<?x128x?x64xi8> | |
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded : tensor<?x128x?x64xi8>) outs(%11 : tensor<?x?x128x64xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x128x64xi8> | |
%expanded_0 = tensor.expand_shape %20 [[0], [1], [2, 3], [4, 5, 6]] output_shape [%9, %10, 8, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x8x16x2x4x8xi8> | |
%21 = tensor.empty(%9, %10) : tensor<?x?x8x4x16x2x8xi8> | |
%22 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d4, d5, d3, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3, d4, d5, d6)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_0 : tensor<?x?x8x16x2x4x8xi8>) outs(%21 : tensor<?x?x8x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x8x4x16x2x8xi8> | |
%23 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%4] | |
%24 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%3] | |
%25 = tensor.empty(%23, %24) : tensor<?x?x128x64xi8> | |
%26 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%3] | |
%27 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%4] | |
%28 = tensor.empty(%26, %27) : tensor<?x?xi8> | |
%29 = linalg.fill ins(%c0_i8 : i8) outs(%28 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice_1 = tensor.insert_slice %5 into %29[0, 0] [%3, %4] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%30 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%3] | |
%31 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%4] | |
%32 = arith.divui %30, %c64 : index | |
%33 = arith.divui %31, %c128 : index | |
%expanded_2 = tensor.expand_shape %inserted_slice_1 [[0, 1], [2, 3]] output_shape [%32, 64, %33, 128] : tensor<?x?xi8> into tensor<?x64x?x128xi8> | |
%34 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d1, d3, d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2 : tensor<?x64x?x128xi8>) outs(%25 : tensor<?x?x128x64xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x128x64xi8> | |
%expanded_3 = tensor.expand_shape %34 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%23, %24, 4, 2, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x4x2x16x2x4x8xi8> | |
%35 = tensor.empty(%23, %24) : tensor<?x?x4x2x4x16x2x8xi8> | |
%36 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d5, d6, d4, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_3 : tensor<?x?x4x2x16x2x4x8xi8>) outs(%35 : tensor<?x?x4x2x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x4x2x4x16x2x8xi8> | |
%37 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%6] | |
%38 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%7] | |
%39 = tensor.empty(%37, %38) : tensor<?x?x128x128xi32> | |
%40 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%6] | |
%41 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%7] | |
%42 = tensor.empty(%40, %41) : tensor<?x?xi32> | |
%43 = linalg.fill ins(%c0_i32 : i32) outs(%42 : tensor<?x?xi32>) -> tensor<?x?xi32> | |
%inserted_slice_4 = tensor.insert_slice %8 into %43[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> into tensor<?x?xi32> | |
%44 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%6] | |
%45 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%7] | |
%46 = arith.divui %44, %c128 : index | |
%47 = arith.divui %45, %c128 : index | |
%expanded_5 = tensor.expand_shape %inserted_slice_4 [[0, 1], [2, 3]] output_shape [%46, 128, %47, 128] : tensor<?x?xi32> into tensor<?x128x?x128xi32> | |
%48 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_5 : tensor<?x128x?x128xi32>) outs(%39 : tensor<?x?x128x128xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x128x128xi32> | |
%expanded_6 = tensor.expand_shape %48 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%37, %38, 8, 4, 4, 4, 2, 16] : tensor<?x?x128x128xi32> into tensor<?x?x8x4x4x4x2x16xi32> | |
%49 = tensor.empty(%37, %38) : tensor<?x?x8x4x2x4x16x4xi32> | |
%50 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d7, d3, d4, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_6 : tensor<?x?x8x4x4x4x2x16xi32>) outs(%49 : tensor<?x?x8x4x2x4x16x4xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%51 = iree_gpu.multi_mma %22, %36, %50 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
%52 = tensor.empty(%37, %38) : tensor<?x?x8x4x4x4x2x16xi32> | |
%53 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%51 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%52 : tensor<?x?x8x4x4x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x4x4x2x16xi32> | |
%collapsed = tensor.collapse_shape %53 [[0], [1], [2, 3, 4], [5, 6, 7]] : tensor<?x?x8x4x4x4x2x16xi32> into tensor<?x?x128x128xi32> | |
%54 = tensor.empty(%37, %38) : tensor<?x128x?x128xi32> | |
%55 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%collapsed : tensor<?x?x128x128xi32>) outs(%54 : tensor<?x128x?x128xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x128x?x128xi32> | |
%collapsed_7 = tensor.collapse_shape %55 [[0, 1], [2, 3]] : tensor<?x128x?x128xi32> into tensor<?x?xi32> | |
%extracted_slice = tensor.extract_slice %collapsed_7[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> to tensor<?x?xi32> | |
%56 = hal.tensor.export %extracted_slice "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %56 : !hal.buffer_view | |
} | |
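// Relative to the previous dump, canonicalization folded the tensor.dim ops
// on the imported tensors into the hal.buffer_view.dim results, which turned
// the two-symbol maps like (-s0 + s1 + (s0 ceildiv 128) * 128) back into the
// one-symbol padded-size form ((s0 ceildiv 128) * 128). That left pairs of
// identical affine.apply ops (e.g. %12/%16 and %13/%17 above), which the CSE
// dump below eliminates, feeding arith.divui directly from %12 and %13. The
// arithmetic itself, as a standalone sketch (the example size is an
// assumption, not taken from the dump):

def ceildiv(a, b):
    return -(-a // b)

d = 1000                                 # some dynamic dimension size
padded = ceildiv(d, 128) * 128           # affine_map ((s0 ceildiv 128) * 128)
assert padded % 128 == 0 and 0 <= padded - d < 128
assert padded // 128 == ceildiv(d, 128)  # the tile count arith.divui recovers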
// -----// IR Dump After CSE (cse) //----- //
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%c64 = arith.constant 64 : index | |
%c128 = arith.constant 128 : index | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%0] | |
%10 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%1] | |
%11 = tensor.empty(%9, %10) : tensor<?x?x128x64xi8> | |
%12 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%0] | |
%13 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%1] | |
%14 = tensor.empty(%12, %13) : tensor<?x?xi8> | |
%15 = linalg.fill ins(%c0_i8 : i8) outs(%14 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice = tensor.insert_slice %2 into %15[0, 0] [%0, %1] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%16 = arith.divui %12, %c128 : index | |
%17 = arith.divui %13, %c64 : index | |
%expanded = tensor.expand_shape %inserted_slice [[0, 1], [2, 3]] output_shape [%16, 128, %17, 64] : tensor<?x?xi8> into tensor<?x128x?x64xi8> | |
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded : tensor<?x128x?x64xi8>) outs(%11 : tensor<?x?x128x64xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x128x64xi8> | |
%expanded_0 = tensor.expand_shape %18 [[0], [1], [2, 3], [4, 5, 6]] output_shape [%9, %10, 8, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x8x16x2x4x8xi8> | |
%19 = tensor.empty(%9, %10) : tensor<?x?x8x4x16x2x8xi8> | |
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d4, d5, d3, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3, d4, d5, d6)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_0 : tensor<?x?x8x16x2x4x8xi8>) outs(%19 : tensor<?x?x8x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x8x4x16x2x8xi8> | |
%21 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%4] | |
%22 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%3] | |
%23 = tensor.empty(%21, %22) : tensor<?x?x128x64xi8> | |
%24 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%3] | |
%25 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%4] | |
%26 = tensor.empty(%24, %25) : tensor<?x?xi8> | |
%27 = linalg.fill ins(%c0_i8 : i8) outs(%26 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice_1 = tensor.insert_slice %5 into %27[0, 0] [%3, %4] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%28 = arith.divui %24, %c64 : index | |
%29 = arith.divui %25, %c128 : index | |
%expanded_2 = tensor.expand_shape %inserted_slice_1 [[0, 1], [2, 3]] output_shape [%28, 64, %29, 128] : tensor<?x?xi8> into tensor<?x64x?x128xi8> | |
%30 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d1, d3, d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2 : tensor<?x64x?x128xi8>) outs(%23 : tensor<?x?x128x64xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x128x64xi8> | |
%expanded_3 = tensor.expand_shape %30 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%21, %22, 4, 2, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x4x2x16x2x4x8xi8> | |
%31 = tensor.empty(%21, %22) : tensor<?x?x4x2x4x16x2x8xi8> | |
%32 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d5, d6, d4, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_3 : tensor<?x?x4x2x16x2x4x8xi8>) outs(%31 : tensor<?x?x4x2x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x4x2x4x16x2x8xi8> | |
%33 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%6] | |
%34 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%7] | |
%35 = tensor.empty(%33, %34) : tensor<?x?x128x128xi32> | |
%36 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%6] | |
%37 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%7] | |
%38 = tensor.empty(%36, %37) : tensor<?x?xi32> | |
%39 = linalg.fill ins(%c0_i32 : i32) outs(%38 : tensor<?x?xi32>) -> tensor<?x?xi32> | |
%inserted_slice_4 = tensor.insert_slice %8 into %39[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> into tensor<?x?xi32> | |
%40 = arith.divui %36, %c128 : index | |
%41 = arith.divui %37, %c128 : index | |
%expanded_5 = tensor.expand_shape %inserted_slice_4 [[0, 1], [2, 3]] output_shape [%40, 128, %41, 128] : tensor<?x?xi32> into tensor<?x128x?x128xi32> | |
%42 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_5 : tensor<?x128x?x128xi32>) outs(%35 : tensor<?x?x128x128xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x128x128xi32> | |
%expanded_6 = tensor.expand_shape %42 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%33, %34, 8, 4, 4, 4, 2, 16] : tensor<?x?x128x128xi32> into tensor<?x?x8x4x4x4x2x16xi32> | |
%43 = tensor.empty(%33, %34) : tensor<?x?x8x4x2x4x16x4xi32> | |
%44 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d7, d3, d4, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_6 : tensor<?x?x8x4x4x4x2x16xi32>) outs(%43 : tensor<?x?x8x4x2x4x16x4xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%45 = iree_gpu.multi_mma %20, %32, %44 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
%46 = tensor.empty(%33, %34) : tensor<?x?x8x4x4x4x2x16xi32> | |
%47 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%45 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%46 : tensor<?x?x8x4x4x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x4x4x2x16xi32> | |
%collapsed = tensor.collapse_shape %47 [[0], [1], [2, 3, 4], [5, 6, 7]] : tensor<?x?x8x4x4x4x2x16xi32> into tensor<?x?x128x128xi32> | |
%48 = tensor.empty(%33, %34) : tensor<?x128x?x128xi32> | |
%49 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%collapsed : tensor<?x?x128x128xi32>) outs(%48 : tensor<?x128x?x128xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x128x?x128xi32> | |
%collapsed_7 = tensor.collapse_shape %49 [[0, 1], [2, 3]] : tensor<?x128x?x128xi32> into tensor<?x?xi32> | |
%extracted_slice = tensor.extract_slice %collapsed_7[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> to tensor<?x?xi32> | |
%50 = hal.tensor.export %extracted_slice "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %50 : !hal.buffer_view | |
} | |
// -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- // | |
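// @foo contains no util.global accesses at this point, so this pass is a no-op here; the dump below is identical to the post-CSE IR above.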
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%c64 = arith.constant 64 : index | |
%c128 = arith.constant 128 : index | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%0] | |
%10 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%1] | |
%11 = tensor.empty(%9, %10) : tensor<?x?x128x64xi8> | |
%12 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%0] | |
%13 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%1] | |
%14 = tensor.empty(%12, %13) : tensor<?x?xi8> | |
%15 = linalg.fill ins(%c0_i8 : i8) outs(%14 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice = tensor.insert_slice %2 into %15[0, 0] [%0, %1] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%16 = arith.divui %12, %c128 : index | |
%17 = arith.divui %13, %c64 : index | |
%expanded = tensor.expand_shape %inserted_slice [[0, 1], [2, 3]] output_shape [%16, 128, %17, 64] : tensor<?x?xi8> into tensor<?x128x?x64xi8> | |
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded : tensor<?x128x?x64xi8>) outs(%11 : tensor<?x?x128x64xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x128x64xi8> | |
%expanded_0 = tensor.expand_shape %18 [[0], [1], [2, 3], [4, 5, 6]] output_shape [%9, %10, 8, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x8x16x2x4x8xi8> | |
%19 = tensor.empty(%9, %10) : tensor<?x?x8x4x16x2x8xi8> | |
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d4, d5, d3, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3, d4, d5, d6)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_0 : tensor<?x?x8x16x2x4x8xi8>) outs(%19 : tensor<?x?x8x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x8x4x16x2x8xi8> | |
%21 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%4] | |
%22 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%3] | |
%23 = tensor.empty(%21, %22) : tensor<?x?x128x64xi8> | |
%24 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%3] | |
%25 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%4] | |
%26 = tensor.empty(%24, %25) : tensor<?x?xi8> | |
%27 = linalg.fill ins(%c0_i8 : i8) outs(%26 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice_1 = tensor.insert_slice %5 into %27[0, 0] [%3, %4] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%28 = arith.divui %24, %c64 : index | |
%29 = arith.divui %25, %c128 : index | |
%expanded_2 = tensor.expand_shape %inserted_slice_1 [[0, 1], [2, 3]] output_shape [%28, 64, %29, 128] : tensor<?x?xi8> into tensor<?x64x?x128xi8> | |
%30 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d1, d3, d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2 : tensor<?x64x?x128xi8>) outs(%23 : tensor<?x?x128x64xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x128x64xi8> | |
%expanded_3 = tensor.expand_shape %30 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%21, %22, 4, 2, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x4x2x16x2x4x8xi8> | |
%31 = tensor.empty(%21, %22) : tensor<?x?x4x2x4x16x2x8xi8> | |
%32 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d5, d6, d4, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_3 : tensor<?x?x4x2x16x2x4x8xi8>) outs(%31 : tensor<?x?x4x2x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x4x2x4x16x2x8xi8> | |
%33 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%6] | |
%34 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%7] | |
%35 = tensor.empty(%33, %34) : tensor<?x?x128x128xi32> | |
%36 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%6] | |
%37 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%7] | |
%38 = tensor.empty(%36, %37) : tensor<?x?xi32> | |
%39 = linalg.fill ins(%c0_i32 : i32) outs(%38 : tensor<?x?xi32>) -> tensor<?x?xi32> | |
%inserted_slice_4 = tensor.insert_slice %8 into %39[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> into tensor<?x?xi32> | |
%40 = arith.divui %36, %c128 : index | |
%41 = arith.divui %37, %c128 : index | |
%expanded_5 = tensor.expand_shape %inserted_slice_4 [[0, 1], [2, 3]] output_shape [%40, 128, %41, 128] : tensor<?x?xi32> into tensor<?x128x?x128xi32> | |
%42 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_5 : tensor<?x128x?x128xi32>) outs(%35 : tensor<?x?x128x128xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x128x128xi32> | |
%expanded_6 = tensor.expand_shape %42 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%33, %34, 8, 4, 4, 4, 2, 16] : tensor<?x?x128x128xi32> into tensor<?x?x8x4x4x4x2x16xi32> | |
%43 = tensor.empty(%33, %34) : tensor<?x?x8x4x2x4x16x4xi32> | |
%44 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d7, d3, d4, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_6 : tensor<?x?x8x4x4x4x2x16xi32>) outs(%43 : tensor<?x?x8x4x2x4x16x4xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%45 = iree_gpu.multi_mma %20, %32, %44 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
%46 = tensor.empty(%33, %34) : tensor<?x?x8x4x4x4x2x16xi32> | |
%47 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%45 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%46 : tensor<?x?x8x4x4x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x4x4x2x16xi32> | |
%collapsed = tensor.collapse_shape %47 [[0], [1], [2, 3, 4], [5, 6, 7]] : tensor<?x?x8x4x4x4x2x16xi32> into tensor<?x?x128x128xi32> | |
%48 = tensor.empty(%33, %34) : tensor<?x128x?x128xi32> | |
%49 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%collapsed : tensor<?x?x128x128xi32>) outs(%48 : tensor<?x128x?x128xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x128x?x128xi32> | |
%collapsed_7 = tensor.collapse_shape %49 [[0, 1], [2, 3]] : tensor<?x128x?x128xi32> into tensor<?x?xi32> | |
%extracted_slice = tensor.extract_slice %collapsed_7[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> to tensor<?x?xi32> | |
%50 = hal.tensor.export %extracted_slice "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %50 : !hal.buffer_view | |
} | |
// -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- // | |
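// From this pass onward the dump prints the enclosing module, so the inline affine_map attributes are hoisted into #map aliases and the HIP/ROCm target configuration (#executable_target_rocm_hsaco_fb, #device_target_hip, @__device_0) becomes visible; the body of @foo is unchanged.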
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>, ukernels = "none"}> | |
#map = affine_map<()[s0] -> (s0 ceildiv 128)> | |
#map1 = affine_map<()[s0] -> (s0 ceildiv 64)> | |
#map2 = affine_map<()[s0] -> ((s0 ceildiv 128) * 128)> | |
#map3 = affine_map<()[s0] -> ((s0 ceildiv 64) * 64)> | |
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)> | |
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map6 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d4, d5, d3, d6)> | |
#map7 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3, d4, d5, d6)> | |
#map8 = affine_map<(d0, d1, d2, d3) -> (d1, d3, d0, d2)> | |
#map9 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d5, d6, d4, d7)> | |
#map10 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)> | |
#map11 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d7, d3, d4, d6)> | |
#map12 = affine_map<(d0, d1, d2) -> (d0, d2)> | |
#map13 = affine_map<(d0, d1, d2) -> (d1, d2)> | |
#map14 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
#map15 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)> | |
#device_target_hip = #hal.device.target<"hip", {legacy_sync}, [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%c64 = arith.constant 64 : index | |
%c128 = arith.constant 128 : index | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%9 = affine.apply #map()[%0] | |
%10 = affine.apply #map1()[%1] | |
%11 = tensor.empty(%9, %10) : tensor<?x?x128x64xi8> | |
%12 = affine.apply #map2()[%0] | |
%13 = affine.apply #map3()[%1] | |
%14 = tensor.empty(%12, %13) : tensor<?x?xi8> | |
%15 = linalg.fill ins(%c0_i8 : i8) outs(%14 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice = tensor.insert_slice %2 into %15[0, 0] [%0, %1] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%16 = arith.divui %12, %c128 : index | |
%17 = arith.divui %13, %c64 : index | |
%expanded = tensor.expand_shape %inserted_slice [[0, 1], [2, 3]] output_shape [%16, 128, %17, 64] : tensor<?x?xi8> into tensor<?x128x?x64xi8> | |
%18 = linalg.generic {indexing_maps = [#map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded : tensor<?x128x?x64xi8>) outs(%11 : tensor<?x?x128x64xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x128x64xi8> | |
%expanded_0 = tensor.expand_shape %18 [[0], [1], [2, 3], [4, 5, 6]] output_shape [%9, %10, 8, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x8x16x2x4x8xi8> | |
%19 = tensor.empty(%9, %10) : tensor<?x?x8x4x16x2x8xi8> | |
%20 = linalg.generic {indexing_maps = [#map6, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_0 : tensor<?x?x8x16x2x4x8xi8>) outs(%19 : tensor<?x?x8x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x8x4x16x2x8xi8> | |
%21 = affine.apply #map()[%4] | |
%22 = affine.apply #map1()[%3] | |
%23 = tensor.empty(%21, %22) : tensor<?x?x128x64xi8> | |
%24 = affine.apply #map3()[%3] | |
%25 = affine.apply #map2()[%4] | |
%26 = tensor.empty(%24, %25) : tensor<?x?xi8> | |
%27 = linalg.fill ins(%c0_i8 : i8) outs(%26 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice_1 = tensor.insert_slice %5 into %27[0, 0] [%3, %4] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%28 = arith.divui %24, %c64 : index | |
%29 = arith.divui %25, %c128 : index | |
%expanded_2 = tensor.expand_shape %inserted_slice_1 [[0, 1], [2, 3]] output_shape [%28, 64, %29, 128] : tensor<?x?xi8> into tensor<?x64x?x128xi8> | |
%30 = linalg.generic {indexing_maps = [#map8, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2 : tensor<?x64x?x128xi8>) outs(%23 : tensor<?x?x128x64xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x128x64xi8> | |
%expanded_3 = tensor.expand_shape %30 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%21, %22, 4, 2, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x4x2x16x2x4x8xi8> | |
%31 = tensor.empty(%21, %22) : tensor<?x?x4x2x4x16x2x8xi8> | |
%32 = linalg.generic {indexing_maps = [#map9, #map10], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_3 : tensor<?x?x4x2x16x2x4x8xi8>) outs(%31 : tensor<?x?x4x2x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x4x2x4x16x2x8xi8> | |
%33 = affine.apply #map()[%6] | |
%34 = affine.apply #map()[%7] | |
%35 = tensor.empty(%33, %34) : tensor<?x?x128x128xi32> | |
%36 = affine.apply #map2()[%6] | |
%37 = affine.apply #map2()[%7] | |
%38 = tensor.empty(%36, %37) : tensor<?x?xi32> | |
%39 = linalg.fill ins(%c0_i32 : i32) outs(%38 : tensor<?x?xi32>) -> tensor<?x?xi32> | |
%inserted_slice_4 = tensor.insert_slice %8 into %39[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> into tensor<?x?xi32> | |
%40 = arith.divui %36, %c128 : index | |
%41 = arith.divui %37, %c128 : index | |
%expanded_5 = tensor.expand_shape %inserted_slice_4 [[0, 1], [2, 3]] output_shape [%40, 128, %41, 128] : tensor<?x?xi32> into tensor<?x128x?x128xi32> | |
%42 = linalg.generic {indexing_maps = [#map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_5 : tensor<?x128x?x128xi32>) outs(%35 : tensor<?x?x128x128xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x128x128xi32> | |
%expanded_6 = tensor.expand_shape %42 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%33, %34, 8, 4, 4, 4, 2, 16] : tensor<?x?x128x128xi32> into tensor<?x?x8x4x4x4x2x16xi32> | |
%43 = tensor.empty(%33, %34) : tensor<?x?x8x4x2x4x16x4xi32> | |
%44 = linalg.generic {indexing_maps = [#map11, #map10], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_6 : tensor<?x?x8x4x4x4x2x16xi32>) outs(%43 : tensor<?x?x8x4x2x4x16x4xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%45 = iree_gpu.multi_mma %20, %32, %44 {indexing_maps = [#map12, #map13, #map14], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
%46 = tensor.empty(%33, %34) : tensor<?x?x8x4x4x4x2x16xi32> | |
%47 = linalg.generic {indexing_maps = [#map15, #map10], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%45 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%46 : tensor<?x?x8x4x4x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x4x4x2x16xi32> | |
%collapsed = tensor.collapse_shape %47 [[0], [1], [2, 3, 4], [5, 6, 7]] : tensor<?x?x8x4x4x4x2x16xi32> into tensor<?x?x128x128xi32> | |
%48 = tensor.empty(%33, %34) : tensor<?x128x?x128xi32> | |
%49 = linalg.generic {indexing_maps = [#map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%collapsed : tensor<?x?x128x128xi32>) outs(%48 : tensor<?x128x?x128xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x128x?x128xi32> | |
%collapsed_7 = tensor.collapse_shape %49 [[0, 1], [2, 3]] : tensor<?x128x?x128xi32> into tensor<?x?xi32> | |
%extracted_slice = tensor.extract_slice %collapsed_7[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> to tensor<?x?xi32> | |
%50 = hal.tensor.export %extracted_slice "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %50 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- // | |
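// The module's only global is the immutable @__device_0 device target, so there is nothing to fold; the module is unchanged.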
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>, ukernels = "none"}> | |
#map = affine_map<()[s0] -> (s0 ceildiv 128)> | |
#map1 = affine_map<()[s0] -> (s0 ceildiv 64)> | |
#map2 = affine_map<()[s0] -> ((s0 ceildiv 128) * 128)> | |
#map3 = affine_map<()[s0] -> ((s0 ceildiv 64) * 64)> | |
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)> | |
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map6 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d4, d5, d3, d6)> | |
#map7 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3, d4, d5, d6)> | |
#map8 = affine_map<(d0, d1, d2, d3) -> (d1, d3, d0, d2)> | |
#map9 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d5, d6, d4, d7)> | |
#map10 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)> | |
#map11 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d7, d3, d4, d6)> | |
#map12 = affine_map<(d0, d1, d2) -> (d0, d2)> | |
#map13 = affine_map<(d0, d1, d2) -> (d1, d2)> | |
#map14 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
#map15 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)> | |
#device_target_hip = #hal.device.target<"hip", {legacy_sync}, [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%c64 = arith.constant 64 : index | |
%c128 = arith.constant 128 : index | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%9 = affine.apply #map()[%0] | |
%10 = affine.apply #map1()[%1] | |
%11 = tensor.empty(%9, %10) : tensor<?x?x128x64xi8> | |
%12 = affine.apply #map2()[%0] | |
%13 = affine.apply #map3()[%1] | |
%14 = tensor.empty(%12, %13) : tensor<?x?xi8> | |
%15 = linalg.fill ins(%c0_i8 : i8) outs(%14 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice = tensor.insert_slice %2 into %15[0, 0] [%0, %1] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%16 = arith.divui %12, %c128 : index | |
%17 = arith.divui %13, %c64 : index | |
%expanded = tensor.expand_shape %inserted_slice [[0, 1], [2, 3]] output_shape [%16, 128, %17, 64] : tensor<?x?xi8> into tensor<?x128x?x64xi8> | |
%18 = linalg.generic {indexing_maps = [#map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded : tensor<?x128x?x64xi8>) outs(%11 : tensor<?x?x128x64xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x128x64xi8> | |
%expanded_0 = tensor.expand_shape %18 [[0], [1], [2, 3], [4, 5, 6]] output_shape [%9, %10, 8, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x8x16x2x4x8xi8> | |
%19 = tensor.empty(%9, %10) : tensor<?x?x8x4x16x2x8xi8> | |
%20 = linalg.generic {indexing_maps = [#map6, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_0 : tensor<?x?x8x16x2x4x8xi8>) outs(%19 : tensor<?x?x8x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x8x4x16x2x8xi8> | |
%21 = affine.apply #map()[%4] | |
%22 = affine.apply #map1()[%3] | |
%23 = tensor.empty(%21, %22) : tensor<?x?x128x64xi8> | |
%24 = affine.apply #map3()[%3] | |
%25 = affine.apply #map2()[%4] | |
%26 = tensor.empty(%24, %25) : tensor<?x?xi8> | |
%27 = linalg.fill ins(%c0_i8 : i8) outs(%26 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice_1 = tensor.insert_slice %5 into %27[0, 0] [%3, %4] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%28 = arith.divui %24, %c64 : index | |
%29 = arith.divui %25, %c128 : index | |
%expanded_2 = tensor.expand_shape %inserted_slice_1 [[0, 1], [2, 3]] output_shape [%28, 64, %29, 128] : tensor<?x?xi8> into tensor<?x64x?x128xi8> | |
%30 = linalg.generic {indexing_maps = [#map8, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2 : tensor<?x64x?x128xi8>) outs(%23 : tensor<?x?x128x64xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x128x64xi8> | |
%expanded_3 = tensor.expand_shape %30 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%21, %22, 4, 2, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x4x2x16x2x4x8xi8> | |
%31 = tensor.empty(%21, %22) : tensor<?x?x4x2x4x16x2x8xi8> | |
%32 = linalg.generic {indexing_maps = [#map9, #map10], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_3 : tensor<?x?x4x2x16x2x4x8xi8>) outs(%31 : tensor<?x?x4x2x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x4x2x4x16x2x8xi8> | |
%33 = affine.apply #map()[%6] | |
%34 = affine.apply #map()[%7] | |
%35 = tensor.empty(%33, %34) : tensor<?x?x128x128xi32> | |
%36 = affine.apply #map2()[%6] | |
%37 = affine.apply #map2()[%7] | |
%38 = tensor.empty(%36, %37) : tensor<?x?xi32> | |
%39 = linalg.fill ins(%c0_i32 : i32) outs(%38 : tensor<?x?xi32>) -> tensor<?x?xi32> | |
%inserted_slice_4 = tensor.insert_slice %8 into %39[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> into tensor<?x?xi32> | |
%40 = arith.divui %36, %c128 : index | |
%41 = arith.divui %37, %c128 : index | |
%expanded_5 = tensor.expand_shape %inserted_slice_4 [[0, 1], [2, 3]] output_shape [%40, 128, %41, 128] : tensor<?x?xi32> into tensor<?x128x?x128xi32> | |
%42 = linalg.generic {indexing_maps = [#map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_5 : tensor<?x128x?x128xi32>) outs(%35 : tensor<?x?x128x128xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x128x128xi32> | |
%expanded_6 = tensor.expand_shape %42 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%33, %34, 8, 4, 4, 4, 2, 16] : tensor<?x?x128x128xi32> into tensor<?x?x8x4x4x4x2x16xi32> | |
%43 = tensor.empty(%33, %34) : tensor<?x?x8x4x2x4x16x4xi32> | |
%44 = linalg.generic {indexing_maps = [#map11, #map10], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_6 : tensor<?x?x8x4x4x4x2x16xi32>) outs(%43 : tensor<?x?x8x4x2x4x16x4xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%45 = iree_gpu.multi_mma %20, %32, %44 {indexing_maps = [#map12, #map13, #map14], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
%46 = tensor.empty(%33, %34) : tensor<?x?x8x4x4x4x2x16xi32> | |
%47 = linalg.generic {indexing_maps = [#map15, #map10], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%45 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%46 : tensor<?x?x8x4x4x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x4x4x2x16xi32> | |
%collapsed = tensor.collapse_shape %47 [[0], [1], [2, 3, 4], [5, 6, 7]] : tensor<?x?x8x4x4x4x2x16xi32> into tensor<?x?x128x128xi32> | |
%48 = tensor.empty(%33, %34) : tensor<?x128x?x128xi32> | |
%49 = linalg.generic {indexing_maps = [#map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%collapsed : tensor<?x?x128x128xi32>) outs(%48 : tensor<?x128x?x128xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x128x?x128xi32> | |
%collapsed_7 = tensor.collapse_shape %49 [[0, 1], [2, 3]] : tensor<?x128x?x128xi32> into tensor<?x?xi32> | |
%extracted_slice = tensor.extract_slice %collapsed_7[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> to tensor<?x?xi32> | |
%50 = hal.tensor.export %extracted_slice "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %50 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- // | |
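// With a single global there are likewise no fusion candidates; the module is again unchanged.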
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>, ukernels = "none"}> | |
#map = affine_map<()[s0] -> (s0 ceildiv 128)> | |
#map1 = affine_map<()[s0] -> (s0 ceildiv 64)> | |
#map2 = affine_map<()[s0] -> ((s0 ceildiv 128) * 128)> | |
#map3 = affine_map<()[s0] -> ((s0 ceildiv 64) * 64)> | |
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)> | |
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map6 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d4, d5, d3, d6)> | |
#map7 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3, d4, d5, d6)> | |
#map8 = affine_map<(d0, d1, d2, d3) -> (d1, d3, d0, d2)> | |
#map9 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d5, d6, d4, d7)> | |
#map10 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)> | |
#map11 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d7, d3, d4, d6)> | |
#map12 = affine_map<(d0, d1, d2) -> (d0, d2)> | |
#map13 = affine_map<(d0, d1, d2) -> (d1, d2)> | |
#map14 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
#map15 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)> | |
#device_target_hip = #hal.device.target<"hip", {legacy_sync}, [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%c64 = arith.constant 64 : index | |
%c128 = arith.constant 128 : index | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%9 = affine.apply #map()[%0] | |
%10 = affine.apply #map1()[%1] | |
%11 = tensor.empty(%9, %10) : tensor<?x?x128x64xi8> | |
%12 = affine.apply #map2()[%0] | |
%13 = affine.apply #map3()[%1] | |
%14 = tensor.empty(%12, %13) : tensor<?x?xi8> | |
%15 = linalg.fill ins(%c0_i8 : i8) outs(%14 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice = tensor.insert_slice %2 into %15[0, 0] [%0, %1] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%16 = arith.divui %12, %c128 : index | |
%17 = arith.divui %13, %c64 : index | |
%expanded = tensor.expand_shape %inserted_slice [[0, 1], [2, 3]] output_shape [%16, 128, %17, 64] : tensor<?x?xi8> into tensor<?x128x?x64xi8> | |
%18 = linalg.generic {indexing_maps = [#map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded : tensor<?x128x?x64xi8>) outs(%11 : tensor<?x?x128x64xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x128x64xi8> | |
%expanded_0 = tensor.expand_shape %18 [[0], [1], [2, 3], [4, 5, 6]] output_shape [%9, %10, 8, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x8x16x2x4x8xi8> | |
%19 = tensor.empty(%9, %10) : tensor<?x?x8x4x16x2x8xi8> | |
%20 = linalg.generic {indexing_maps = [#map6, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_0 : tensor<?x?x8x16x2x4x8xi8>) outs(%19 : tensor<?x?x8x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x8x4x16x2x8xi8> | |
%21 = affine.apply #map()[%4] | |
%22 = affine.apply #map1()[%3] | |
%23 = tensor.empty(%21, %22) : tensor<?x?x128x64xi8> | |
%24 = affine.apply #map3()[%3] | |
%25 = affine.apply #map2()[%4] | |
%26 = tensor.empty(%24, %25) : tensor<?x?xi8> | |
%27 = linalg.fill ins(%c0_i8 : i8) outs(%26 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice_1 = tensor.insert_slice %5 into %27[0, 0] [%3, %4] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%28 = arith.divui %24, %c64 : index | |
%29 = arith.divui %25, %c128 : index | |
%expanded_2 = tensor.expand_shape %inserted_slice_1 [[0, 1], [2, 3]] output_shape [%28, 64, %29, 128] : tensor<?x?xi8> into tensor<?x64x?x128xi8> | |
%30 = linalg.generic {indexing_maps = [#map8, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2 : tensor<?x64x?x128xi8>) outs(%23 : tensor<?x?x128x64xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x128x64xi8> | |
%expanded_3 = tensor.expand_shape %30 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%21, %22, 4, 2, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x4x2x16x2x4x8xi8> | |
%31 = tensor.empty(%21, %22) : tensor<?x?x4x2x4x16x2x8xi8> | |
%32 = linalg.generic {indexing_maps = [#map9, #map10], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_3 : tensor<?x?x4x2x16x2x4x8xi8>) outs(%31 : tensor<?x?x4x2x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x4x2x4x16x2x8xi8> | |
%33 = affine.apply #map()[%6] | |
%34 = affine.apply #map()[%7] | |
%35 = tensor.empty(%33, %34) : tensor<?x?x128x128xi32> | |
%36 = affine.apply #map2()[%6] | |
%37 = affine.apply #map2()[%7] | |
%38 = tensor.empty(%36, %37) : tensor<?x?xi32> | |
%39 = linalg.fill ins(%c0_i32 : i32) outs(%38 : tensor<?x?xi32>) -> tensor<?x?xi32> | |
%inserted_slice_4 = tensor.insert_slice %8 into %39[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> into tensor<?x?xi32> | |
%40 = arith.divui %36, %c128 : index | |
%41 = arith.divui %37, %c128 : index | |
%expanded_5 = tensor.expand_shape %inserted_slice_4 [[0, 1], [2, 3]] output_shape [%40, 128, %41, 128] : tensor<?x?xi32> into tensor<?x128x?x128xi32> | |
%42 = linalg.generic {indexing_maps = [#map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_5 : tensor<?x128x?x128xi32>) outs(%35 : tensor<?x?x128x128xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x128x128xi32> | |
%expanded_6 = tensor.expand_shape %42 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%33, %34, 8, 4, 4, 4, 2, 16] : tensor<?x?x128x128xi32> into tensor<?x?x8x4x4x4x2x16xi32> | |
%43 = tensor.empty(%33, %34) : tensor<?x?x8x4x2x4x16x4xi32> | |
%44 = linalg.generic {indexing_maps = [#map11, #map10], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_6 : tensor<?x?x8x4x4x4x2x16xi32>) outs(%43 : tensor<?x?x8x4x2x4x16x4xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%45 = iree_gpu.multi_mma %20, %32, %44 {indexing_maps = [#map12, #map13, #map14], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
%46 = tensor.empty(%33, %34) : tensor<?x?x8x4x4x4x2x16xi32> | |
%47 = linalg.generic {indexing_maps = [#map15, #map10], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%45 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%46 : tensor<?x?x8x4x4x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x4x4x2x16xi32> | |
%collapsed = tensor.collapse_shape %47 [[0], [1], [2, 3, 4], [5, 6, 7]] : tensor<?x?x8x4x4x4x2x16xi32> into tensor<?x?x128x128xi32> | |
%48 = tensor.empty(%33, %34) : tensor<?x128x?x128xi32> | |
%49 = linalg.generic {indexing_maps = [#map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%collapsed : tensor<?x?x128x128xi32>) outs(%48 : tensor<?x128x?x128xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x128x?x128xi32> | |
%collapsed_7 = tensor.collapse_shape %49 [[0, 1], [2, 3]] : tensor<?x128x?x128xi32> into tensor<?x?xi32> | |
%extracted_slice = tensor.extract_slice %collapsed_7[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> to tensor<?x?xi32> | |
%50 = hal.tensor.export %extracted_slice "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %50 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After IPO (iree-util-ipo) //----- // | |
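// @foo is the only function in the module, so interprocedural optimization finds no call sites to propagate through; the dump is identical to the previous one.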
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>, ukernels = "none"}> | |
#map = affine_map<()[s0] -> (s0 ceildiv 128)> | |
#map1 = affine_map<()[s0] -> (s0 ceildiv 64)> | |
#map2 = affine_map<()[s0] -> ((s0 ceildiv 128) * 128)> | |
#map3 = affine_map<()[s0] -> ((s0 ceildiv 64) * 64)> | |
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)> | |
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map6 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d4, d5, d3, d6)> | |
#map7 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3, d4, d5, d6)> | |
#map8 = affine_map<(d0, d1, d2, d3) -> (d1, d3, d0, d2)> | |
#map9 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d5, d6, d4, d7)> | |
#map10 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)> | |
#map11 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d7, d3, d4, d6)> | |
#map12 = affine_map<(d0, d1, d2) -> (d0, d2)> | |
#map13 = affine_map<(d0, d1, d2) -> (d1, d2)> | |
#map14 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
#map15 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)> | |
#device_target_hip = #hal.device.target<"hip", {legacy_sync}, [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%c64 = arith.constant 64 : index | |
%c128 = arith.constant 128 : index | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%9 = affine.apply #map()[%0] | |
%10 = affine.apply #map1()[%1] | |
%11 = tensor.empty(%9, %10) : tensor<?x?x128x64xi8> | |
%12 = affine.apply #map2()[%0] | |
%13 = affine.apply #map3()[%1] | |
%14 = tensor.empty(%12, %13) : tensor<?x?xi8> | |
%15 = linalg.fill ins(%c0_i8 : i8) outs(%14 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice = tensor.insert_slice %2 into %15[0, 0] [%0, %1] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%16 = arith.divui %12, %c128 : index | |
%17 = arith.divui %13, %c64 : index | |
%expanded = tensor.expand_shape %inserted_slice [[0, 1], [2, 3]] output_shape [%16, 128, %17, 64] : tensor<?x?xi8> into tensor<?x128x?x64xi8> | |
%18 = linalg.generic {indexing_maps = [#map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded : tensor<?x128x?x64xi8>) outs(%11 : tensor<?x?x128x64xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x128x64xi8> | |
%expanded_0 = tensor.expand_shape %18 [[0], [1], [2, 3], [4, 5, 6]] output_shape [%9, %10, 8, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x8x16x2x4x8xi8> | |
%19 = tensor.empty(%9, %10) : tensor<?x?x8x4x16x2x8xi8> | |
%20 = linalg.generic {indexing_maps = [#map6, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_0 : tensor<?x?x8x16x2x4x8xi8>) outs(%19 : tensor<?x?x8x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x8x4x16x2x8xi8> | |
%21 = affine.apply #map()[%4] | |
%22 = affine.apply #map1()[%3] | |
%23 = tensor.empty(%21, %22) : tensor<?x?x128x64xi8> | |
%24 = affine.apply #map3()[%3] | |
%25 = affine.apply #map2()[%4] | |
%26 = tensor.empty(%24, %25) : tensor<?x?xi8> | |
%27 = linalg.fill ins(%c0_i8 : i8) outs(%26 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice_1 = tensor.insert_slice %5 into %27[0, 0] [%3, %4] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%28 = arith.divui %24, %c64 : index | |
%29 = arith.divui %25, %c128 : index | |
%expanded_2 = tensor.expand_shape %inserted_slice_1 [[0, 1], [2, 3]] output_shape [%28, 64, %29, 128] : tensor<?x?xi8> into tensor<?x64x?x128xi8> | |
%30 = linalg.generic {indexing_maps = [#map8, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2 : tensor<?x64x?x128xi8>) outs(%23 : tensor<?x?x128x64xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x128x64xi8> | |
%expanded_3 = tensor.expand_shape %30 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%21, %22, 4, 2, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x4x2x16x2x4x8xi8> | |
%31 = tensor.empty(%21, %22) : tensor<?x?x4x2x4x16x2x8xi8> | |
%32 = linalg.generic {indexing_maps = [#map9, #map10], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_3 : tensor<?x?x4x2x16x2x4x8xi8>) outs(%31 : tensor<?x?x4x2x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x4x2x4x16x2x8xi8> | |
%33 = affine.apply #map()[%6] | |
%34 = affine.apply #map()[%7] | |
%35 = tensor.empty(%33, %34) : tensor<?x?x128x128xi32> | |
%36 = affine.apply #map2()[%6] | |
%37 = affine.apply #map2()[%7] | |
%38 = tensor.empty(%36, %37) : tensor<?x?xi32> | |
%39 = linalg.fill ins(%c0_i32 : i32) outs(%38 : tensor<?x?xi32>) -> tensor<?x?xi32> | |
%inserted_slice_4 = tensor.insert_slice %8 into %39[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> into tensor<?x?xi32> | |
%40 = arith.divui %36, %c128 : index | |
%41 = arith.divui %37, %c128 : index | |
%expanded_5 = tensor.expand_shape %inserted_slice_4 [[0, 1], [2, 3]] output_shape [%40, 128, %41, 128] : tensor<?x?xi32> into tensor<?x128x?x128xi32> | |
%42 = linalg.generic {indexing_maps = [#map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_5 : tensor<?x128x?x128xi32>) outs(%35 : tensor<?x?x128x128xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x128x128xi32> | |
%expanded_6 = tensor.expand_shape %42 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%33, %34, 8, 4, 4, 4, 2, 16] : tensor<?x?x128x128xi32> into tensor<?x?x8x4x4x4x2x16xi32> | |
%43 = tensor.empty(%33, %34) : tensor<?x?x8x4x2x4x16x4xi32> | |
%44 = linalg.generic {indexing_maps = [#map11, #map10], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_6 : tensor<?x?x8x4x4x4x2x16xi32>) outs(%43 : tensor<?x?x8x4x2x4x16x4xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%45 = iree_gpu.multi_mma %20, %32, %44 {indexing_maps = [#map12, #map13, #map14], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
%46 = tensor.empty(%33, %34) : tensor<?x?x8x4x4x4x2x16xi32> | |
%47 = linalg.generic {indexing_maps = [#map15, #map10], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%45 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%46 : tensor<?x?x8x4x4x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x4x4x2x16xi32> | |
%collapsed = tensor.collapse_shape %47 [[0], [1], [2, 3, 4], [5, 6, 7]] : tensor<?x?x8x4x4x4x2x16xi32> into tensor<?x?x128x128xi32> | |
%48 = tensor.empty(%33, %34) : tensor<?x128x?x128xi32> | |
%49 = linalg.generic {indexing_maps = [#map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%collapsed : tensor<?x?x128x128xi32>) outs(%48 : tensor<?x128x?x128xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x128x?x128xi32> | |
%collapsed_7 = tensor.collapse_shape %49 [[0, 1], [2, 3]] : tensor<?x128x?x128xi32> into tensor<?x?xi32> | |
%extracted_slice = tensor.extract_slice %collapsed_7[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> to tensor<?x?xi32> | |
%50 = hal.tensor.export %extracted_slice "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %50 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After FixedPointIterator (iree-util-fixed-point-iterator) //----- // | |
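// The fixed-point driver appears to have converged after a single iteration: the iree.fixedpoint.iteration = 0 module attribute present in the previous dumps is dropped below, leaving only stream.affinity.default.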
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>, ukernels = "none"}> | |
#map = affine_map<()[s0] -> (s0 ceildiv 128)> | |
#map1 = affine_map<()[s0] -> (s0 ceildiv 64)> | |
#map2 = affine_map<()[s0] -> ((s0 ceildiv 128) * 128)> | |
#map3 = affine_map<()[s0] -> ((s0 ceildiv 64) * 64)> | |
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)> | |
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> | |
#map6 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d4, d5, d3, d6)> | |
#map7 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3, d4, d5, d6)> | |
#map8 = affine_map<(d0, d1, d2, d3) -> (d1, d3, d0, d2)> | |
#map9 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d5, d6, d4, d7)> | |
#map10 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)> | |
#map11 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d7, d3, d4, d6)> | |
#map12 = affine_map<(d0, d1, d2) -> (d0, d2)> | |
#map13 = affine_map<(d0, d1, d2) -> (d1, d2)> | |
#map14 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
#map15 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)> | |
#device_target_hip = #hal.device.target<"hip", {legacy_sync}, [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%c64 = arith.constant 64 : index | |
%c128 = arith.constant 128 : index | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%9 = affine.apply #map()[%0] | |
%10 = affine.apply #map1()[%1] | |
%11 = tensor.empty(%9, %10) : tensor<?x?x128x64xi8> | |
%12 = affine.apply #map2()[%0] | |
%13 = affine.apply #map3()[%1] | |
%14 = tensor.empty(%12, %13) : tensor<?x?xi8> | |
%15 = linalg.fill ins(%c0_i8 : i8) outs(%14 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice = tensor.insert_slice %2 into %15[0, 0] [%0, %1] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%16 = arith.divui %12, %c128 : index | |
%17 = arith.divui %13, %c64 : index | |
%expanded = tensor.expand_shape %inserted_slice [[0, 1], [2, 3]] output_shape [%16, 128, %17, 64] : tensor<?x?xi8> into tensor<?x128x?x64xi8> | |
%18 = linalg.generic {indexing_maps = [#map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded : tensor<?x128x?x64xi8>) outs(%11 : tensor<?x?x128x64xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x128x64xi8> | |
%expanded_0 = tensor.expand_shape %18 [[0], [1], [2, 3], [4, 5, 6]] output_shape [%9, %10, 8, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x8x16x2x4x8xi8> | |
%19 = tensor.empty(%9, %10) : tensor<?x?x8x4x16x2x8xi8> | |
%20 = linalg.generic {indexing_maps = [#map6, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_0 : tensor<?x?x8x16x2x4x8xi8>) outs(%19 : tensor<?x?x8x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x8x4x16x2x8xi8> | |
%21 = affine.apply #map()[%4] | |
%22 = affine.apply #map1()[%3] | |
%23 = tensor.empty(%21, %22) : tensor<?x?x128x64xi8> | |
%24 = affine.apply #map3()[%3] | |
%25 = affine.apply #map2()[%4] | |
%26 = tensor.empty(%24, %25) : tensor<?x?xi8> | |
%27 = linalg.fill ins(%c0_i8 : i8) outs(%26 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice_1 = tensor.insert_slice %5 into %27[0, 0] [%3, %4] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%28 = arith.divui %24, %c64 : index | |
%29 = arith.divui %25, %c128 : index | |
%expanded_2 = tensor.expand_shape %inserted_slice_1 [[0, 1], [2, 3]] output_shape [%28, 64, %29, 128] : tensor<?x?xi8> into tensor<?x64x?x128xi8> | |
%30 = linalg.generic {indexing_maps = [#map8, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2 : tensor<?x64x?x128xi8>) outs(%23 : tensor<?x?x128x64xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x128x64xi8> | |
%expanded_3 = tensor.expand_shape %30 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%21, %22, 4, 2, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x4x2x16x2x4x8xi8> | |
%31 = tensor.empty(%21, %22) : tensor<?x?x4x2x4x16x2x8xi8> | |
%32 = linalg.generic {indexing_maps = [#map9, #map10], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_3 : tensor<?x?x4x2x16x2x4x8xi8>) outs(%31 : tensor<?x?x4x2x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x4x2x4x16x2x8xi8> | |
%33 = affine.apply #map()[%6] | |
%34 = affine.apply #map()[%7] | |
%35 = tensor.empty(%33, %34) : tensor<?x?x128x128xi32> | |
%36 = affine.apply #map2()[%6] | |
%37 = affine.apply #map2()[%7] | |
%38 = tensor.empty(%36, %37) : tensor<?x?xi32> | |
%39 = linalg.fill ins(%c0_i32 : i32) outs(%38 : tensor<?x?xi32>) -> tensor<?x?xi32> | |
%inserted_slice_4 = tensor.insert_slice %8 into %39[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> into tensor<?x?xi32> | |
%40 = arith.divui %36, %c128 : index | |
%41 = arith.divui %37, %c128 : index | |
%expanded_5 = tensor.expand_shape %inserted_slice_4 [[0, 1], [2, 3]] output_shape [%40, 128, %41, 128] : tensor<?x?xi32> into tensor<?x128x?x128xi32> | |
%42 = linalg.generic {indexing_maps = [#map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_5 : tensor<?x128x?x128xi32>) outs(%35 : tensor<?x?x128x128xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x128x128xi32> | |
%expanded_6 = tensor.expand_shape %42 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%33, %34, 8, 4, 4, 4, 2, 16] : tensor<?x?x128x128xi32> into tensor<?x?x8x4x4x4x2x16xi32> | |
%43 = tensor.empty(%33, %34) : tensor<?x?x8x4x2x4x16x4xi32> | |
%44 = linalg.generic {indexing_maps = [#map11, #map10], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_6 : tensor<?x?x8x4x4x4x2x16xi32>) outs(%43 : tensor<?x?x8x4x2x4x16x4xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%45 = iree_gpu.multi_mma %20, %32, %44 {indexing_maps = [#map12, #map13, #map14], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
%46 = tensor.empty(%33, %34) : tensor<?x?x8x4x4x4x2x16xi32> | |
%47 = linalg.generic {indexing_maps = [#map15, #map10], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%45 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%46 : tensor<?x?x8x4x4x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x4x4x2x16xi32> | |
%collapsed = tensor.collapse_shape %47 [[0], [1], [2, 3, 4], [5, 6, 7]] : tensor<?x?x8x4x4x4x2x16xi32> into tensor<?x?x128x128xi32> | |
%48 = tensor.empty(%33, %34) : tensor<?x128x?x128xi32> | |
%49 = linalg.generic {indexing_maps = [#map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%collapsed : tensor<?x?x128x128xi32>) outs(%48 : tensor<?x128x?x128xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x128x?x128xi32> | |
%collapsed_7 = tensor.collapse_shape %49 [[0, 1], [2, 3]] : tensor<?x128x?x128xi32> into tensor<?x?xi32> | |
%extracted_slice = tensor.extract_slice %collapsed_7[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> to tensor<?x?xi32> | |
%50 = hal.tensor.export %extracted_slice "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %50 : !hal.buffer_view | |
} | |
} | |
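// NOTE (annotation, not compiler output): the module above shows the i8 matmul after
// materializing its data-tiled encoding for gfx942. Each operand is zero-padded
// (linalg.fill + tensor.insert_slice) up to a multiple of its tile size (128 along
// M and N, 64 along K), packed by expand_shape + transpose into the operand layout of
// iree_gpu.multi_mma (intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2,
// unroll_n_to_subgroups = 4, unroll_k = 2), and un-padded again with
// tensor.extract_slice after the multiply. The multi_mma contraction maps
// ((d0, d2), (d1, d2) -> (d0, d1)) treat the two outer dims as an M x K by N x K
// product accumulating into an M x N grid of tiles. Below is a minimal Python sketch
// of the #map..#map3 padding arithmetic, using a hypothetical M = 1000 (illustrative
// only; these dims are dynamic in the dump):

def ceildiv(s0, b):
    # matches affine_map<()[s0] -> (s0 ceildiv b)>
    return -(-s0 // b)

M = 1000                     # stands in for the dynamic dim %0
m_tiles = ceildiv(M, 128)    # %9: number of 128-wide M tiles -> 8
M_padded = m_tiles * 128     # %12: padded M extent (#map2)   -> 1024
assert M_padded // 128 == m_tiles  # %16 recovers the tile count via arith.divui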
// -----// IR Dump After FusionPreprocessingPass (iree-dispatch-creation-fusion-preprocessing) //----- // | |
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%c64 = arith.constant 64 : index | |
%c128 = arith.constant 128 : index | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%0] | |
%10 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%1] | |
%11 = tensor.empty(%9, %10) : tensor<?x?x128x64xi8> | |
%12 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%0] | |
%13 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%1] | |
%14 = tensor.empty(%12, %13) : tensor<?x?xi8> | |
%15 = linalg.fill ins(%c0_i8 : i8) outs(%14 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice = tensor.insert_slice %2 into %15[0, 0] [%0, %1] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%16 = arith.divui %12, %c128 : index | |
%17 = arith.divui %13, %c64 : index | |
%expanded = tensor.expand_shape %inserted_slice [[0, 1], [2, 3]] output_shape [%16, 128, %17, 64] : tensor<?x?xi8> into tensor<?x128x?x64xi8> | |
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded : tensor<?x128x?x64xi8>) outs(%11 : tensor<?x?x128x64xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x128x64xi8> | |
%expanded_0 = tensor.expand_shape %18 [[0], [1], [2, 3], [4, 5, 6]] output_shape [%9, %10, 8, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x8x16x2x4x8xi8> | |
%19 = tensor.empty(%9, %10) : tensor<?x?x8x4x16x2x8xi8> | |
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3, d4, d5, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d5, d3, d4, d6)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_0 : tensor<?x?x8x16x2x4x8xi8>) outs(%19 : tensor<?x?x8x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x8x4x16x2x8xi8> | |
%21 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%4] | |
%22 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%3] | |
%23 = tensor.empty(%21, %22) : tensor<?x?x128x64xi8> | |
%24 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%3] | |
%25 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%4] | |
%26 = tensor.empty(%24, %25) : tensor<?x?xi8> | |
%27 = linalg.fill ins(%c0_i8 : i8) outs(%26 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice_1 = tensor.insert_slice %5 into %27[0, 0] [%3, %4] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%28 = arith.divui %24, %c64 : index | |
%29 = arith.divui %25, %c128 : index | |
%expanded_2 = tensor.expand_shape %inserted_slice_1 [[0, 1], [2, 3]] output_shape [%28, 64, %29, 128] : tensor<?x?xi8> into tensor<?x64x?x128xi8> | |
%30 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d2, d0, d3, d1)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2 : tensor<?x64x?x128xi8>) outs(%23 : tensor<?x?x128x64xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x128x64xi8> | |
%expanded_3 = tensor.expand_shape %30 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%21, %22, 4, 2, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x4x2x16x2x4x8xi8> | |
%31 = tensor.empty(%21, %22) : tensor<?x?x4x2x4x16x2x8xi8> | |
%32 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d6, d4, d5, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_3 : tensor<?x?x4x2x16x2x4x8xi8>) outs(%31 : tensor<?x?x4x2x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x4x2x4x16x2x8xi8> | |
%33 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%6] | |
%34 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%7] | |
%35 = tensor.empty(%33, %34) : tensor<?x?x128x128xi32> | |
%36 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%6] | |
%37 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%7] | |
%38 = tensor.empty(%36, %37) : tensor<?x?xi32> | |
%39 = linalg.fill ins(%c0_i32 : i32) outs(%38 : tensor<?x?xi32>) -> tensor<?x?xi32> | |
%inserted_slice_4 = tensor.insert_slice %8 into %39[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> into tensor<?x?xi32> | |
%40 = arith.divui %36, %c128 : index | |
%41 = arith.divui %37, %c128 : index | |
%expanded_5 = tensor.expand_shape %inserted_slice_4 [[0, 1], [2, 3]] output_shape [%40, 128, %41, 128] : tensor<?x?xi32> into tensor<?x128x?x128xi32> | |
%42 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_5 : tensor<?x128x?x128xi32>) outs(%35 : tensor<?x?x128x128xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x128x128xi32> | |
%expanded_6 = tensor.expand_shape %42 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%33, %34, 8, 4, 4, 4, 2, 16] : tensor<?x?x128x128xi32> into tensor<?x?x8x4x4x4x2x16xi32> | |
%43 = tensor.empty(%33, %34) : tensor<?x?x8x4x2x4x16x4xi32> | |
%44 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_6 : tensor<?x?x8x4x4x4x2x16xi32>) outs(%43 : tensor<?x?x8x4x2x4x16x4xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%45 = iree_gpu.multi_mma %20, %32, %44 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
%46 = tensor.empty(%33, %34) : tensor<?x?x8x4x4x4x2x16xi32> | |
%47 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d7, d3, d4, d6)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%45 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%46 : tensor<?x?x8x4x4x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x4x4x2x16xi32> | |
%collapsed = tensor.collapse_shape %47 [[0], [1], [2, 3, 4], [5, 6, 7]] : tensor<?x?x8x4x4x4x2x16xi32> into tensor<?x?x128x128xi32> | |
%48 = tensor.empty(%33, %34) : tensor<?x128x?x128xi32> | |
%49 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%collapsed : tensor<?x?x128x128xi32>) outs(%48 : tensor<?x128x?x128xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x128x?x128xi32> | |
%collapsed_7 = tensor.collapse_shape %49 [[0, 1], [2, 3]] : tensor<?x128x?x128xi32> into tensor<?x?xi32> | |
%extracted_slice = tensor.extract_slice %collapsed_7[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> to tensor<?x?xi32> | |
%50 = hal.tensor.export %extracted_slice "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %50 : !hal.buffer_view | |
} | |
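// NOTE (annotation, not compiler output): relative to the preceding dump,
// iree-dispatch-creation-fusion-preprocessing inlined the #map aliases and
// interchanged the iteration spaces of the transpose-like linalg.generic ops, so
// each op now reads its input through an identity map and carries the (inverse)
// permutation on its output map instead. For example, the first pack op changed from
// ins (d0, d1, d2, d3) -> (d0, d2, d1, d3) / outs identity to
// ins identity / outs (d0, d1, d2, d3) -> (d0, d2, d1, d3).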
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
// (function body omitted: verbatim duplicate of the dump after FusionPreprocessingPass above; canonicalization made no changes)
// -----// IR Dump After CSE (cse) //----- // | |
// (function body omitted: verbatim duplicate of the preceding dump; CSE made no changes)
// -----// IR Dump After ElementwiseOpFusionPass (iree-dispatch-creation-elementwise-op-fusion) //----- // | |
// (function body omitted: verbatim duplicate of the preceding dump; elementwise op fusion made no changes)
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
// (function body omitted: verbatim duplicate of the preceding dump; canonicalization made no changes)
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%c64 = arith.constant 64 : index | |
%c128 = arith.constant 128 : index | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%0] | |
%10 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%1] | |
%11 = tensor.empty(%9, %10) : tensor<?x?x128x64xi8> | |
%12 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%0] | |
%13 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%1] | |
%14 = tensor.empty(%12, %13) : tensor<?x?xi8> | |
%15 = linalg.fill ins(%c0_i8 : i8) outs(%14 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice = tensor.insert_slice %2 into %15[0, 0] [%0, %1] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%16 = arith.divui %12, %c128 : index | |
%17 = arith.divui %13, %c64 : index | |
%expanded = tensor.expand_shape %inserted_slice [[0, 1], [2, 3]] output_shape [%16, 128, %17, 64] : tensor<?x?xi8> into tensor<?x128x?x64xi8> | |
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded : tensor<?x128x?x64xi8>) outs(%11 : tensor<?x?x128x64xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x128x64xi8> | |
%expanded_0 = tensor.expand_shape %18 [[0], [1], [2, 3], [4, 5, 6]] output_shape [%9, %10, 8, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x8x16x2x4x8xi8> | |
%19 = tensor.empty(%9, %10) : tensor<?x?x8x4x16x2x8xi8> | |
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3, d4, d5, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d5, d3, d4, d6)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_0 : tensor<?x?x8x16x2x4x8xi8>) outs(%19 : tensor<?x?x8x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x8x4x16x2x8xi8> | |
%21 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%4] | |
%22 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%3] | |
%23 = tensor.empty(%21, %22) : tensor<?x?x128x64xi8> | |
%24 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%3] | |
%25 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%4] | |
%26 = tensor.empty(%24, %25) : tensor<?x?xi8> | |
%27 = linalg.fill ins(%c0_i8 : i8) outs(%26 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice_1 = tensor.insert_slice %5 into %27[0, 0] [%3, %4] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%28 = arith.divui %24, %c64 : index | |
%29 = arith.divui %25, %c128 : index | |
%expanded_2 = tensor.expand_shape %inserted_slice_1 [[0, 1], [2, 3]] output_shape [%28, 64, %29, 128] : tensor<?x?xi8> into tensor<?x64x?x128xi8> | |
%30 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d2, d0, d3, d1)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_2 : tensor<?x64x?x128xi8>) outs(%23 : tensor<?x?x128x64xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x128x64xi8> | |
%expanded_3 = tensor.expand_shape %30 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%21, %22, 4, 2, 16, 2, 4, 8] : tensor<?x?x128x64xi8> into tensor<?x?x4x2x16x2x4x8xi8> | |
%31 = tensor.empty(%21, %22) : tensor<?x?x4x2x4x16x2x8xi8> | |
%32 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d6, d4, d5, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_3 : tensor<?x?x4x2x16x2x4x8xi8>) outs(%31 : tensor<?x?x4x2x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x4x2x4x16x2x8xi8> | |
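// Accumulator (input2) packing: both M and N are rounded up to 128, tiles are
// 128x128xi32, and the swizzle produces the 8x4x2x4x16x4 per-tile order that
// multi_mma writes its results in.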
%33 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%6] | |
%34 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%7] | |
%35 = tensor.empty(%33, %34) : tensor<?x?x128x128xi32> | |
%36 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%6] | |
%37 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%7] | |
%38 = tensor.empty(%36, %37) : tensor<?x?xi32> | |
%39 = linalg.fill ins(%c0_i32 : i32) outs(%38 : tensor<?x?xi32>) -> tensor<?x?xi32> | |
%inserted_slice_4 = tensor.insert_slice %8 into %39[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> into tensor<?x?xi32> | |
%40 = arith.divui %36, %c128 : index | |
%41 = arith.divui %37, %c128 : index | |
%expanded_5 = tensor.expand_shape %inserted_slice_4 [[0, 1], [2, 3]] output_shape [%40, 128, %41, 128] : tensor<?x?xi32> into tensor<?x128x?x128xi32> | |
%42 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_5 : tensor<?x128x?x128xi32>) outs(%35 : tensor<?x?x128x128xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x128x128xi32> | |
%expanded_6 = tensor.expand_shape %42 [[0], [1], [2, 3, 4], [5, 6, 7]] output_shape [%33, %34, 8, 4, 4, 4, 2, 16] : tensor<?x?x128x128xi32> into tensor<?x?x8x4x4x4x2x16xi32> | |
%43 = tensor.empty(%33, %34) : tensor<?x?x8x4x2x4x16x4xi32> | |
%44 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_6 : tensor<?x?x8x4x4x4x2x16xi32>) outs(%43 : tensor<?x?x8x4x2x4x16x4xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x2x4x16x4xi32> | |
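// The data-tile sizes above can be read off the kind attribute on the multi_mma
// below: MFMA_I32_16x16x32_I8 computes a 16x16 (M x N) i32 tile from a K=32
// slice of i8 operands, so M_tile = 16 * unroll_m(8) = 128,
// N_tile = 16 * unroll_n(2) * unroll_n_to_subgroups(4) = 128, and
// K_tile = 32 * unroll_k(2) = 64, which matches the ceildiv 128 / ceildiv 64
// padding computed for each operand.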
%45 = iree_gpu.multi_mma %20, %32, %44 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
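// Everything from here to the export inverts the accumulator packing: swizzle
// the result tiles back to row-major 128x128, undo the tiling, collapse to 2-D,
// and extract the original %6 x %7 slice so the padding never escapes.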
%46 = tensor.empty(%33, %34) : tensor<?x?x8x4x4x4x2x16xi32> | |
%47 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d7, d3, d4, d6)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%45 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%46 : tensor<?x?x8x4x4x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x4x4x2x16xi32> | |
%collapsed = tensor.collapse_shape %47 [[0], [1], [2, 3, 4], [5, 6, 7]] : tensor<?x?x8x4x4x4x2x16xi32> into tensor<?x?x128x128xi32> | |
%48 = tensor.empty(%33, %34) : tensor<?x128x?x128xi32> | |
%49 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%collapsed : tensor<?x?x128x128xi32>) outs(%48 : tensor<?x128x?x128xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x128x?x128xi32> | |
%collapsed_7 = tensor.collapse_shape %49 [[0, 1], [2, 3]] : tensor<?x128x?x128xi32> into tensor<?x?xi32> | |
%extracted_slice = tensor.extract_slice %collapsed_7[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> to tensor<?x?xi32> | |
%50 = hal.tensor.export %extracted_slice "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %50 : !hal.buffer_view | |
} | |
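// Worked example of the padding arithmetic (illustrative values, not taken from
// this trace): for a 1000x500 LHS, ceildiv(1000, 128) = 8 and
// ceildiv(500, 64) = 8, so the padded buffer is 1024x512 and the packed tensor
// is 8x8x128x64, i.e. 64 tiles of 128x64, of which the last 24 rows and 12
// columns are zero padding.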
// -----// IR Dump After BubbleUpExpandShapesPass (iree-dispatch-creation-bubble-up-expand-shapes) //----- // | |
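// This pass moves tensor.expand_shape ops toward the function arguments so the
// layout generics see the fully expanded 7-D/8-D shapes directly, introducing
// tensor.dim ops to carry the dynamic tile counts. The two transposing generics
// per operand end up adjacent, which lets the elementwise fusion later in the
// pipeline combine them.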
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c1 = arith.constant 1 : index | |
%c2 = arith.constant 2 : index | |
%c0 = arith.constant 0 : index | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%c64 = arith.constant 64 : index | |
%c128 = arith.constant 128 : index | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%0] | |
%10 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%1] | |
%11 = tensor.empty(%9, %10) : tensor<?x?x128x64xi8> | |
%12 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%0] | |
%13 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%1] | |
%14 = tensor.empty(%12, %13) : tensor<?x?xi8> | |
%15 = linalg.fill ins(%c0_i8 : i8) outs(%14 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice = tensor.insert_slice %2 into %15[0, 0] [%0, %1] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%16 = arith.divui %12, %c128 : index | |
%17 = arith.divui %13, %c64 : index | |
%expanded = tensor.expand_shape %inserted_slice [[0, 1], [2, 3]] output_shape [%16, 128, %17, 64] : tensor<?x?xi8> into tensor<?x128x?x64xi8> | |
%dim = tensor.dim %expanded, %c0 : tensor<?x128x?x64xi8> | |
%dim_0 = tensor.dim %expanded, %c2 : tensor<?x128x?x64xi8> | |
%expanded_1 = tensor.expand_shape %expanded [[0], [1, 2], [3], [4, 5, 6]] output_shape [%dim, 8, 16, %dim_0, 2, 4, 8] : tensor<?x128x?x64xi8> into tensor<?x8x16x?x2x4x8xi8> | |
%dim_2 = tensor.dim %11, %c0 : tensor<?x?x128x64xi8> | |
%dim_3 = tensor.dim %11, %c1 : tensor<?x?x128x64xi8> | |
%18 = tensor.empty(%dim_2, %dim_3) : tensor<?x?x8x16x2x4x8xi8> | |
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3, d4, d5, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d3, d1, d2, d4, d5, d6)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1 : tensor<?x8x16x?x2x4x8xi8>) outs(%18 : tensor<?x?x8x16x2x4x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x8x16x2x4x8xi8> | |
%20 = tensor.empty(%9, %10) : tensor<?x?x8x4x16x2x8xi8> | |
%21 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3, d4, d5, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d5, d3, d4, d6)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%19 : tensor<?x?x8x16x2x4x8xi8>) outs(%20 : tensor<?x?x8x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x8x4x16x2x8xi8> | |
%22 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%4] | |
%23 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%3] | |
%24 = tensor.empty(%22, %23) : tensor<?x?x128x64xi8> | |
%25 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%3] | |
%26 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%4] | |
%27 = tensor.empty(%25, %26) : tensor<?x?xi8> | |
%28 = linalg.fill ins(%c0_i8 : i8) outs(%27 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice_4 = tensor.insert_slice %5 into %28[0, 0] [%3, %4] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%29 = arith.divui %25, %c64 : index | |
%30 = arith.divui %26, %c128 : index | |
%expanded_5 = tensor.expand_shape %inserted_slice_4 [[0, 1], [2, 3]] output_shape [%29, 64, %30, 128] : tensor<?x?xi8> into tensor<?x64x?x128xi8> | |
%dim_6 = tensor.dim %expanded_5, %c0 : tensor<?x64x?x128xi8> | |
%dim_7 = tensor.dim %expanded_5, %c2 : tensor<?x64x?x128xi8> | |
%expanded_8 = tensor.expand_shape %expanded_5 [[0], [1, 2, 3], [4], [5, 6, 7]] output_shape [%dim_6, 2, 4, 8, %dim_7, 4, 2, 16] : tensor<?x64x?x128xi8> into tensor<?x2x4x8x?x4x2x16xi8> | |
%dim_9 = tensor.dim %24, %c0 : tensor<?x?x128x64xi8> | |
%dim_10 = tensor.dim %24, %c1 : tensor<?x?x128x64xi8> | |
%31 = tensor.empty(%dim_9, %dim_10) : tensor<?x?x4x2x16x2x4x8xi8> | |
%32 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d4, d0, d5, d6, d7, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_8 : tensor<?x2x4x8x?x4x2x16xi8>) outs(%31 : tensor<?x?x4x2x16x2x4x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x4x2x16x2x4x8xi8> | |
%33 = tensor.empty(%22, %23) : tensor<?x?x4x2x4x16x2x8xi8> | |
%34 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d6, d4, d5, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%32 : tensor<?x?x4x2x16x2x4x8xi8>) outs(%33 : tensor<?x?x4x2x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x4x2x4x16x2x8xi8> | |
%35 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%6] | |
%36 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%7] | |
%37 = tensor.empty(%35, %36) : tensor<?x?x128x128xi32> | |
%38 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%6] | |
%39 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%7] | |
%40 = tensor.empty(%38, %39) : tensor<?x?xi32> | |
%41 = linalg.fill ins(%c0_i32 : i32) outs(%40 : tensor<?x?xi32>) -> tensor<?x?xi32> | |
%inserted_slice_11 = tensor.insert_slice %8 into %41[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> into tensor<?x?xi32> | |
%42 = arith.divui %38, %c128 : index | |
%43 = arith.divui %39, %c128 : index | |
%expanded_12 = tensor.expand_shape %inserted_slice_11 [[0, 1], [2, 3]] output_shape [%42, 128, %43, 128] : tensor<?x?xi32> into tensor<?x128x?x128xi32> | |
%dim_13 = tensor.dim %expanded_12, %c0 : tensor<?x128x?x128xi32> | |
%dim_14 = tensor.dim %expanded_12, %c2 : tensor<?x128x?x128xi32> | |
%expanded_15 = tensor.expand_shape %expanded_12 [[0], [1, 2, 3], [4], [5, 6, 7]] output_shape [%dim_13, 8, 4, 4, %dim_14, 4, 2, 16] : tensor<?x128x?x128xi32> into tensor<?x8x4x4x?x4x2x16xi32> | |
%dim_16 = tensor.dim %37, %c0 : tensor<?x?x128x128xi32> | |
%dim_17 = tensor.dim %37, %c1 : tensor<?x?x128x128xi32> | |
%44 = tensor.empty(%dim_16, %dim_17) : tensor<?x?x8x4x4x4x2x16xi32> | |
%45 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d4, d1, d2, d3, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_15 : tensor<?x8x4x4x?x4x2x16xi32>) outs(%44 : tensor<?x?x8x4x4x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x4x4x2x16xi32> | |
%46 = tensor.empty(%35, %36) : tensor<?x?x8x4x2x4x16x4xi32> | |
%47 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%45 : tensor<?x?x8x4x4x4x2x16xi32>) outs(%46 : tensor<?x?x8x4x2x4x16x4xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%48 = iree_gpu.multi_mma %21, %34, %47 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
%49 = tensor.empty(%35, %36) : tensor<?x?x8x4x4x4x2x16xi32> | |
%50 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d7, d3, d4, d6)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%48 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%49 : tensor<?x?x8x4x4x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x4x4x2x16xi32> | |
%51 = tensor.empty(%35, %36) : tensor<?x128x?x128xi32> | |
%dim_18 = tensor.dim %51, %c0 : tensor<?x128x?x128xi32> | |
%dim_19 = tensor.dim %51, %c2 : tensor<?x128x?x128xi32> | |
%52 = tensor.empty(%dim_18, %dim_19) : tensor<?x8x4x4x?x4x2x16xi32> | |
%53 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d2, d3, d4, d1, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%50 : tensor<?x?x8x4x4x4x2x16xi32>) outs(%52 : tensor<?x8x4x4x?x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x8x4x4x?x4x2x16xi32> | |
%collapsed = tensor.collapse_shape %53 [[0], [1, 2, 3], [4], [5, 6, 7]] : tensor<?x8x4x4x?x4x2x16xi32> into tensor<?x128x?x128xi32> | |
%collapsed_20 = tensor.collapse_shape %collapsed [[0, 1], [2, 3]] : tensor<?x128x?x128xi32> into tensor<?x?xi32> | |
%extracted_slice = tensor.extract_slice %collapsed_20[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> to tensor<?x?xi32> | |
%54 = hal.tensor.export %extracted_slice "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %54 : !hal.buffer_view | |
} | |
// -----// IR Dump After BubbleUpExtractSlicesPass (iree-dispatch-creation-bubble-up-extract-slices) //----- // | |
// (no changes; the function body is byte-for-byte identical to the previous dump)
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
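// Canonicalization folds the arith.divui / tensor.dim chains back into
// affine.apply results, drops the now-dead 4-D tensor.empty values, and merges
// each pad's two expand_shapes into a single 2-D -> 7-D (or 8-D) expand_shape,
// leaving one expand_shape plus two generics per operand.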
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%0] | |
%10 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%1] | |
%11 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%0] | |
%12 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%1] | |
%13 = tensor.empty(%11, %12) : tensor<?x?xi8> | |
%14 = linalg.fill ins(%c0_i8 : i8) outs(%13 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice = tensor.insert_slice %2 into %14[0, 0] [%0, %1] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%15 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%0] | |
%16 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%1] | |
%expanded = tensor.expand_shape %inserted_slice [[0, 1, 2], [3, 4, 5, 6]] output_shape [%15, 8, 16, %16, 2, 4, 8] : tensor<?x?xi8> into tensor<?x8x16x?x2x4x8xi8> | |
%17 = tensor.empty(%9, %10) : tensor<?x?x8x16x2x4x8xi8> | |
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3, d4, d5, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d3, d1, d2, d4, d5, d6)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded : tensor<?x8x16x?x2x4x8xi8>) outs(%17 : tensor<?x?x8x16x2x4x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x8x16x2x4x8xi8> | |
%19 = tensor.empty(%9, %10) : tensor<?x?x8x4x16x2x8xi8> | |
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3, d4, d5, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d5, d3, d4, d6)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%18 : tensor<?x?x8x16x2x4x8xi8>) outs(%19 : tensor<?x?x8x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x8x4x16x2x8xi8> | |
%21 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%4] | |
%22 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%3] | |
%23 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%3] | |
%24 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%4] | |
%25 = tensor.empty(%23, %24) : tensor<?x?xi8> | |
%26 = linalg.fill ins(%c0_i8 : i8) outs(%25 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice_0 = tensor.insert_slice %5 into %26[0, 0] [%3, %4] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%27 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%3] | |
%28 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%4] | |
%expanded_1 = tensor.expand_shape %inserted_slice_0 [[0, 1, 2, 3], [4, 5, 6, 7]] output_shape [%27, 2, 4, 8, %28, 4, 2, 16] : tensor<?x?xi8> into tensor<?x2x4x8x?x4x2x16xi8> | |
%29 = tensor.empty(%21, %22) : tensor<?x?x4x2x16x2x4x8xi8> | |
%30 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d4, d0, d5, d6, d7, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1 : tensor<?x2x4x8x?x4x2x16xi8>) outs(%29 : tensor<?x?x4x2x16x2x4x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x4x2x16x2x4x8xi8> | |
%31 = tensor.empty(%21, %22) : tensor<?x?x4x2x4x16x2x8xi8> | |
%32 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d6, d4, d5, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%30 : tensor<?x?x4x2x16x2x4x8xi8>) outs(%31 : tensor<?x?x4x2x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x4x2x4x16x2x8xi8> | |
%33 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%6] | |
%34 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%7] | |
%35 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%6] | |
%36 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%7] | |
%37 = tensor.empty(%35, %36) : tensor<?x?xi32> | |
%38 = linalg.fill ins(%c0_i32 : i32) outs(%37 : tensor<?x?xi32>) -> tensor<?x?xi32> | |
%inserted_slice_2 = tensor.insert_slice %8 into %38[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> into tensor<?x?xi32> | |
%39 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%6] | |
%40 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%7] | |
%expanded_3 = tensor.expand_shape %inserted_slice_2 [[0, 1, 2, 3], [4, 5, 6, 7]] output_shape [%39, 8, 4, 4, %40, 4, 2, 16] : tensor<?x?xi32> into tensor<?x8x4x4x?x4x2x16xi32> | |
%41 = tensor.empty(%33, %34) : tensor<?x?x8x4x4x4x2x16xi32> | |
%42 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d4, d1, d2, d3, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_3 : tensor<?x8x4x4x?x4x2x16xi32>) outs(%41 : tensor<?x?x8x4x4x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x4x4x2x16xi32> | |
%43 = tensor.empty(%33, %34) : tensor<?x?x8x4x2x4x16x4xi32> | |
%44 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%42 : tensor<?x?x8x4x4x4x2x16xi32>) outs(%43 : tensor<?x?x8x4x2x4x16x4xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%45 = iree_gpu.multi_mma %20, %32, %44 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
%46 = tensor.empty(%33, %34) : tensor<?x?x8x4x4x4x2x16xi32> | |
%47 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d7, d3, d4, d6)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%45 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%46 : tensor<?x?x8x4x4x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x4x4x2x16xi32> | |
%48 = tensor.empty(%33, %34) : tensor<?x8x4x4x?x4x2x16xi32> | |
%49 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d2, d3, d4, d1, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%47 : tensor<?x?x8x4x4x4x2x16xi32>) outs(%48 : tensor<?x8x4x4x?x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x8x4x4x?x4x2x16xi32> | |
%collapsed = tensor.collapse_shape %49 [[0, 1, 2, 3], [4, 5, 6, 7]] : tensor<?x8x4x4x?x4x2x16xi32> into tensor<?x?xi32> | |
%extracted_slice = tensor.extract_slice %collapsed[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> to tensor<?x?xi32> | |
%50 = hal.tensor.export %extracted_slice "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %50 : !hal.buffer_view | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
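// CSE deduplicates the affine.apply recomputations left by canonicalization
// (%15/%16, %27/%28, and %39/%40 in the previous dump) and reuses the
// accumulator's 8-D tensor.empty (%35 below) as the init of the result
// un-swizzle instead of allocating a second identical empty.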
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%0] | |
%10 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%1] | |
%11 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%0] | |
%12 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%1] | |
%13 = tensor.empty(%11, %12) : tensor<?x?xi8> | |
%14 = linalg.fill ins(%c0_i8 : i8) outs(%13 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice = tensor.insert_slice %2 into %14[0, 0] [%0, %1] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%expanded = tensor.expand_shape %inserted_slice [[0, 1, 2], [3, 4, 5, 6]] output_shape [%9, 8, 16, %10, 2, 4, 8] : tensor<?x?xi8> into tensor<?x8x16x?x2x4x8xi8> | |
%15 = tensor.empty(%9, %10) : tensor<?x?x8x16x2x4x8xi8> | |
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3, d4, d5, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d3, d1, d2, d4, d5, d6)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded : tensor<?x8x16x?x2x4x8xi8>) outs(%15 : tensor<?x?x8x16x2x4x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x8x16x2x4x8xi8> | |
%17 = tensor.empty(%9, %10) : tensor<?x?x8x4x16x2x8xi8> | |
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3, d4, d5, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d5, d3, d4, d6)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%16 : tensor<?x?x8x16x2x4x8xi8>) outs(%17 : tensor<?x?x8x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x8x4x16x2x8xi8> | |
%19 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%4] | |
%20 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%3] | |
%21 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%3] | |
%22 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%4] | |
%23 = tensor.empty(%21, %22) : tensor<?x?xi8> | |
%24 = linalg.fill ins(%c0_i8 : i8) outs(%23 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice_0 = tensor.insert_slice %5 into %24[0, 0] [%3, %4] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%expanded_1 = tensor.expand_shape %inserted_slice_0 [[0, 1, 2, 3], [4, 5, 6, 7]] output_shape [%20, 2, 4, 8, %19, 4, 2, 16] : tensor<?x?xi8> into tensor<?x2x4x8x?x4x2x16xi8> | |
%25 = tensor.empty(%19, %20) : tensor<?x?x4x2x16x2x4x8xi8> | |
%26 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d4, d0, d5, d6, d7, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1 : tensor<?x2x4x8x?x4x2x16xi8>) outs(%25 : tensor<?x?x4x2x16x2x4x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x4x2x16x2x4x8xi8> | |
%27 = tensor.empty(%19, %20) : tensor<?x?x4x2x4x16x2x8xi8> | |
%28 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d6, d4, d5, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%26 : tensor<?x?x4x2x16x2x4x8xi8>) outs(%27 : tensor<?x?x4x2x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x4x2x4x16x2x8xi8> | |
%29 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%6] | |
%30 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%7] | |
%31 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%6] | |
%32 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%7] | |
%33 = tensor.empty(%31, %32) : tensor<?x?xi32> | |
%34 = linalg.fill ins(%c0_i32 : i32) outs(%33 : tensor<?x?xi32>) -> tensor<?x?xi32> | |
%inserted_slice_2 = tensor.insert_slice %8 into %34[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> into tensor<?x?xi32> | |
%expanded_3 = tensor.expand_shape %inserted_slice_2 [[0, 1, 2, 3], [4, 5, 6, 7]] output_shape [%29, 8, 4, 4, %30, 4, 2, 16] : tensor<?x?xi32> into tensor<?x8x4x4x?x4x2x16xi32> | |
%35 = tensor.empty(%29, %30) : tensor<?x?x8x4x4x4x2x16xi32> | |
%36 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d4, d1, d2, d3, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_3 : tensor<?x8x4x4x?x4x2x16xi32>) outs(%35 : tensor<?x?x8x4x4x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x4x4x2x16xi32> | |
%37 = tensor.empty(%29, %30) : tensor<?x?x8x4x2x4x16x4xi32> | |
%38 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%36 : tensor<?x?x8x4x4x4x2x16xi32>) outs(%37 : tensor<?x?x8x4x2x4x16x4xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%39 = iree_gpu.multi_mma %18, %28, %38 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
%40 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d7, d3, d4, d6)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%39 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%35 : tensor<?x?x8x4x4x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x4x4x2x16xi32> | |
%41 = tensor.empty(%29, %30) : tensor<?x8x4x4x?x4x2x16xi32> | |
%42 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d2, d3, d4, d1, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%40 : tensor<?x?x8x4x4x4x2x16xi32>) outs(%41 : tensor<?x8x4x4x?x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x8x4x4x?x4x2x16xi32> | |
%collapsed = tensor.collapse_shape %42 [[0, 1, 2, 3], [4, 5, 6, 7]] : tensor<?x8x4x4x?x4x2x16xi32> into tensor<?x?xi32> | |
%extracted_slice = tensor.extract_slice %collapsed[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> to tensor<?x?xi32> | |
%43 = hal.tensor.export %extracted_slice "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %43 : !hal.buffer_view | |
} | |
// -----// IR Dump After ElementwiseOpFusionPass (iree-dispatch-creation-elementwise-op-fusion) //----- // | |
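// Elementwise fusion composes each operand's two transposing generics into one:
// the surviving generic's input map is the composition of the two permutations,
// so every packed operand is now produced by a single expand_shape + generic
// pair, and likewise in reverse for the result un-swizzle.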
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%0] | |
%10 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%1] | |
%11 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%0] | |
%12 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%1] | |
%13 = tensor.empty(%11, %12) : tensor<?x?xi8> | |
%14 = linalg.fill ins(%c0_i8 : i8) outs(%13 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice = tensor.insert_slice %2 into %14[0, 0] [%0, %1] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%expanded = tensor.expand_shape %inserted_slice [[0, 1, 2], [3, 4, 5, 6]] output_shape [%9, 8, 16, %10, 2, 4, 8] : tensor<?x?xi8> into tensor<?x8x16x?x2x4x8xi8> | |
%15 = tensor.empty(%9, %10) : tensor<?x?x8x4x16x2x8xi8> | |
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d2, d3, d1, d4, d5, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d5, d3, d4, d6)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded : tensor<?x8x16x?x2x4x8xi8>) outs(%15 : tensor<?x?x8x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x8x4x16x2x8xi8> | |
%17 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%4] | |
%18 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%3] | |
%19 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%3] | |
%20 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%4] | |
%21 = tensor.empty(%19, %20) : tensor<?x?xi8> | |
%22 = linalg.fill ins(%c0_i8 : i8) outs(%21 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice_0 = tensor.insert_slice %5 into %22[0, 0] [%3, %4] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%expanded_1 = tensor.expand_shape %inserted_slice_0 [[0, 1, 2, 3], [4, 5, 6, 7]] output_shape [%18, 2, 4, 8, %17, 4, 2, 16] : tensor<?x?xi8> into tensor<?x2x4x8x?x4x2x16xi8> | |
%23 = tensor.empty(%17, %18) : tensor<?x?x4x2x4x16x2x8xi8> | |
%24 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d1, d5, d6, d7, d0, d2, d3, d4)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d6, d4, d5, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1 : tensor<?x2x4x8x?x4x2x16xi8>) outs(%23 : tensor<?x?x4x2x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x4x2x4x16x2x8xi8> | |
%25 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%6] | |
%26 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%7] | |
%27 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%6] | |
%28 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%7] | |
%29 = tensor.empty(%27, %28) : tensor<?x?xi32> | |
%30 = linalg.fill ins(%c0_i32 : i32) outs(%29 : tensor<?x?xi32>) -> tensor<?x?xi32> | |
%inserted_slice_2 = tensor.insert_slice %8 into %30[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> into tensor<?x?xi32> | |
%expanded_3 = tensor.expand_shape %inserted_slice_2 [[0, 1, 2, 3], [4, 5, 6, 7]] output_shape [%25, 8, 4, 4, %26, 4, 2, 16] : tensor<?x?xi32> into tensor<?x8x4x4x?x4x2x16xi32> | |
%31 = tensor.empty(%25, %26) : tensor<?x?x8x4x2x4x16x4xi32> | |
%32 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d2, d3, d4, d1, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_3 : tensor<?x8x4x4x?x4x2x16xi32>) outs(%31 : tensor<?x?x8x4x2x4x16x4xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%33 = iree_gpu.multi_mma %16, %24, %32 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
%34 = tensor.empty(%25, %26) : tensor<?x8x4x4x?x4x2x16xi32> | |
%35 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d2, d3, d4, d1, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%33 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%34 : tensor<?x8x4x4x?x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x8x4x4x?x4x2x16xi32> | |
%collapsed = tensor.collapse_shape %35 [[0, 1, 2, 3], [4, 5, 6, 7]] : tensor<?x8x4x4x?x4x2x16xi32> into tensor<?x?xi32> | |
%extracted_slice = tensor.extract_slice %collapsed[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> to tensor<?x?xi32> | |
%36 = hal.tensor.export %extracted_slice "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %36 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%0] | |
%10 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%1] | |
%11 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%0] | |
%12 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%1] | |
%13 = tensor.empty(%11, %12) : tensor<?x?xi8> | |
%14 = linalg.fill ins(%c0_i8 : i8) outs(%13 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice = tensor.insert_slice %2 into %14[0, 0] [%0, %1] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%expanded = tensor.expand_shape %inserted_slice [[0, 1, 2], [3, 4, 5, 6]] output_shape [%9, 8, 16, %10, 2, 4, 8] : tensor<?x?xi8> into tensor<?x8x16x?x2x4x8xi8> | |
%15 = tensor.empty(%9, %10) : tensor<?x?x8x4x16x2x8xi8> | |
  %16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d2, d3, d1, d4, d5, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d5, d3, d4, d6)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded : tensor<?x8x16x?x2x4x8xi8>) outs(%15 : tensor<?x?x8x4x16x2x8xi8>) {
  ^bb0(%in: i8, %out: i8):
    linalg.yield %in : i8
  } -> tensor<?x?x8x4x16x2x8xi8>
  %17 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%4]
  %18 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%3]
  %19 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%3]
  %20 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%4]
  %21 = tensor.empty(%19, %20) : tensor<?x?xi8>
  %22 = linalg.fill ins(%c0_i8 : i8) outs(%21 : tensor<?x?xi8>) -> tensor<?x?xi8>
  %inserted_slice_0 = tensor.insert_slice %5 into %22[0, 0] [%3, %4] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8>
  %expanded_1 = tensor.expand_shape %inserted_slice_0 [[0, 1, 2, 3], [4, 5, 6, 7]] output_shape [%18, 2, 4, 8, %17, 4, 2, 16] : tensor<?x?xi8> into tensor<?x2x4x8x?x4x2x16xi8>
  %23 = tensor.empty(%17, %18) : tensor<?x?x4x2x4x16x2x8xi8>
  %24 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d1, d5, d6, d7, d0, d2, d3, d4)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d6, d4, d5, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1 : tensor<?x2x4x8x?x4x2x16xi8>) outs(%23 : tensor<?x?x4x2x4x16x2x8xi8>) {
  ^bb0(%in: i8, %out: i8):
    linalg.yield %in : i8
  } -> tensor<?x?x4x2x4x16x2x8xi8>
  %25 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%6]
  %26 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%7]
  %27 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%6]
  %28 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%7]
  %29 = tensor.empty(%27, %28) : tensor<?x?xi32>
  %30 = linalg.fill ins(%c0_i32 : i32) outs(%29 : tensor<?x?xi32>) -> tensor<?x?xi32>
  %inserted_slice_2 = tensor.insert_slice %8 into %30[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> into tensor<?x?xi32>
  %expanded_3 = tensor.expand_shape %inserted_slice_2 [[0, 1, 2, 3], [4, 5, 6, 7]] output_shape [%25, 8, 4, 4, %26, 4, 2, 16] : tensor<?x?xi32> into tensor<?x8x4x4x?x4x2x16xi32>
  %31 = tensor.empty(%25, %26) : tensor<?x?x8x4x2x4x16x4xi32>
  %32 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d2, d3, d4, d1, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_3 : tensor<?x8x4x4x?x4x2x16xi32>) outs(%31 : tensor<?x?x8x4x2x4x16x4xi32>) {
  ^bb0(%in: i32, %out: i32):
    linalg.yield %in : i32
  } -> tensor<?x?x8x4x2x4x16x4xi32>
  %33 = iree_gpu.multi_mma %16, %24, %32 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32>
  %34 = tensor.empty(%25, %26) : tensor<?x8x4x4x?x4x2x16xi32>
  %35 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d2, d3, d4, d1, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%33 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%34 : tensor<?x8x4x4x?x4x2x16xi32>) {
  ^bb0(%in: i32, %out: i32):
    linalg.yield %in : i32
  } -> tensor<?x8x4x4x?x4x2x16xi32>
  %collapsed = tensor.collapse_shape %35 [[0, 1, 2, 3], [4, 5, 6, 7]] : tensor<?x8x4x4x?x4x2x16xi32> into tensor<?x?xi32>
  %extracted_slice = tensor.extract_slice %collapsed[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> to tensor<?x?xi32>
  %36 = hal.tensor.export %extracted_slice "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view
  util.return %36 : !hal.buffer_view
}
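// -----// Note: reading the data-tiled layout above //----- //
// The padding in @foo follows from the #iree_gpu.data_tiled_mma_layout kind on
// the multi_mma op, assuming the usual MFMA_I32_16x16x32_I8 shape (16x16 i32
// result tile, K = 32, i8 operands):
//   M tile = 16 * unroll_m(8)                            = 128
//   N tile = 16 * unroll_n(2) * unroll_n_to_subgroups(4) = 128
//   K tile = 32 * unroll_k(2)                            = 64
// which is exactly what the `s0 ceildiv 128` / `s0 ceildiv 64` affine maps round
// up to. For example, a 1000x500 LHS would be zero-padded to 1024x512, since
// ceildiv(1000, 128) * 128 = 1024 and ceildiv(500, 64) * 64 = 512. Each padded
// operand is then split into tiles by tensor.expand_shape and permuted into
// tile-major order by a relayout linalg.generic; after the multi_mma, the
// inverse generic, collapse_shape, and extract_slice recover the unpadded
// %6 x %7 i32 result.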
// -----// IR Dump After CSE (cse) //----- //
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} {
  %c0_i32 = arith.constant 0 : i32
  %c0_i8 = arith.constant 0 : i8
  %0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
  %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
  %2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1}
  %3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
  %4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
  %5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4}
  %6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index
  %7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index
  %8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7}
  %9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%0]
  %10 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%1]
  %11 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%0]
  %12 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%1]
  %13 = tensor.empty(%11, %12) : tensor<?x?xi8>
  %14 = linalg.fill ins(%c0_i8 : i8) outs(%13 : tensor<?x?xi8>) -> tensor<?x?xi8>
  %inserted_slice = tensor.insert_slice %2 into %14[0, 0] [%0, %1] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8>
  %expanded = tensor.expand_shape %inserted_slice [[0, 1, 2], [3, 4, 5, 6]] output_shape [%9, 8, 16, %10, 2, 4, 8] : tensor<?x?xi8> into tensor<?x8x16x?x2x4x8xi8>
  %15 = tensor.empty(%9, %10) : tensor<?x?x8x4x16x2x8xi8>
  %16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d2, d3, d1, d4, d5, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d5, d3, d4, d6)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded : tensor<?x8x16x?x2x4x8xi8>) outs(%15 : tensor<?x?x8x4x16x2x8xi8>) {
  ^bb0(%in: i8, %out: i8):
    linalg.yield %in : i8
  } -> tensor<?x?x8x4x16x2x8xi8>
  %17 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%4]
  %18 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%3]
  %19 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%3]
  %20 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%4]
  %21 = tensor.empty(%19, %20) : tensor<?x?xi8>
  %22 = linalg.fill ins(%c0_i8 : i8) outs(%21 : tensor<?x?xi8>) -> tensor<?x?xi8>
  %inserted_slice_0 = tensor.insert_slice %5 into %22[0, 0] [%3, %4] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8>
  %expanded_1 = tensor.expand_shape %inserted_slice_0 [[0, 1, 2, 3], [4, 5, 6, 7]] output_shape [%18, 2, 4, 8, %17, 4, 2, 16] : tensor<?x?xi8> into tensor<?x2x4x8x?x4x2x16xi8>
  %23 = tensor.empty(%17, %18) : tensor<?x?x4x2x4x16x2x8xi8>
  %24 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d1, d5, d6, d7, d0, d2, d3, d4)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d6, d4, d5, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1 : tensor<?x2x4x8x?x4x2x16xi8>) outs(%23 : tensor<?x?x4x2x4x16x2x8xi8>) {
  ^bb0(%in: i8, %out: i8):
    linalg.yield %in : i8
  } -> tensor<?x?x4x2x4x16x2x8xi8>
  %25 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%6]
  %26 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%7]
  %27 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%6]
  %28 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%7]
  %29 = tensor.empty(%27, %28) : tensor<?x?xi32>
  %30 = linalg.fill ins(%c0_i32 : i32) outs(%29 : tensor<?x?xi32>) -> tensor<?x?xi32>
  %inserted_slice_2 = tensor.insert_slice %8 into %30[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> into tensor<?x?xi32>
  %expanded_3 = tensor.expand_shape %inserted_slice_2 [[0, 1, 2, 3], [4, 5, 6, 7]] output_shape [%25, 8, 4, 4, %26, 4, 2, 16] : tensor<?x?xi32> into tensor<?x8x4x4x?x4x2x16xi32>
  %31 = tensor.empty(%25, %26) : tensor<?x?x8x4x2x4x16x4xi32>
  %32 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d2, d3, d4, d1, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_3 : tensor<?x8x4x4x?x4x2x16xi32>) outs(%31 : tensor<?x?x8x4x2x4x16x4xi32>) {
  ^bb0(%in: i32, %out: i32):
    linalg.yield %in : i32
  } -> tensor<?x?x8x4x2x4x16x4xi32>
  %33 = iree_gpu.multi_mma %16, %24, %32 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32>
  %34 = tensor.empty(%25, %26) : tensor<?x8x4x4x?x4x2x16xi32>
  %35 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d2, d3, d4, d1, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%33 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%34 : tensor<?x8x4x4x?x4x2x16xi32>) {
  ^bb0(%in: i32, %out: i32):
    linalg.yield %in : i32
  } -> tensor<?x8x4x4x?x4x2x16xi32>
  %collapsed = tensor.collapse_shape %35 [[0, 1, 2, 3], [4, 5, 6, 7]] : tensor<?x8x4x4x?x4x2x16xi32> into tensor<?x?xi32>
  %extracted_slice = tensor.extract_slice %collapsed[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> to tensor<?x?xi32>
  %36 = hal.tensor.export %extracted_slice "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view
  util.return %36 : !hal.buffer_view
}
// -----// IR Dump After SinkReshapesPass (iree-dispatch-creation-sink-reshapes) //----- //
// (function @foo unchanged from the previous dump)
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
// (function @foo unchanged from the previous dump)
// -----// IR Dump After CSE (cse) //----- //
// (function @foo unchanged from the previous dump)
// -----// IR Dump After FuseMultiUseElementwiseProducerPass (iree-dispatch-creation-fuse-multi-use-elementwise-producer) //----- //
// (function @foo unchanged from the previous dump)
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
// (function @foo unchanged from the previous dump)
// -----// IR Dump After CSE (cse) //----- //
// (function @foo unchanged from the previous dump)
// -----// IR Dump After SplitReductionPass (iree-dispatch-creation-split-reduction-ops) //----- //
// (function @foo unchanged from the previous dump)
// -----// IR Dump After TransposeGenericOpsPass (iree-dispatch-creation-transpose-generic-ops) //----- //
// (function @foo unchanged from the previous dump)
// -----// IR Dump After FormScalarDispatchesPass (iree-dispatch-creation-form-scalar-dispatches) //----- // | |
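// The function below is unchanged from the previous dump: every op here works
// on whole data-tiled tensors, so this pass finds no scalar computation to
// split out into its own dispatch.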
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%0] | |
%10 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%1] | |
%11 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%0] | |
%12 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%1] | |
%13 = tensor.empty(%11, %12) : tensor<?x?xi8> | |
%14 = linalg.fill ins(%c0_i8 : i8) outs(%13 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice = tensor.insert_slice %2 into %14[0, 0] [%0, %1] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%expanded = tensor.expand_shape %inserted_slice [[0, 1, 2], [3, 4, 5, 6]] output_shape [%9, 8, 16, %10, 2, 4, 8] : tensor<?x?xi8> into tensor<?x8x16x?x2x4x8xi8> | |
%15 = tensor.empty(%9, %10) : tensor<?x?x8x4x16x2x8xi8> | |
%16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d2, d3, d1, d4, d5, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d5, d3, d4, d6)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded : tensor<?x8x16x?x2x4x8xi8>) outs(%15 : tensor<?x?x8x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x8x4x16x2x8xi8> | |
%17 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%4] | |
%18 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%3] | |
%19 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%3] | |
%20 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%4] | |
%21 = tensor.empty(%19, %20) : tensor<?x?xi8> | |
%22 = linalg.fill ins(%c0_i8 : i8) outs(%21 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice_0 = tensor.insert_slice %5 into %22[0, 0] [%3, %4] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%expanded_1 = tensor.expand_shape %inserted_slice_0 [[0, 1, 2, 3], [4, 5, 6, 7]] output_shape [%18, 2, 4, 8, %17, 4, 2, 16] : tensor<?x?xi8> into tensor<?x2x4x8x?x4x2x16xi8> | |
%23 = tensor.empty(%17, %18) : tensor<?x?x4x2x4x16x2x8xi8> | |
%24 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d1, d5, d6, d7, d0, d2, d3, d4)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d6, d4, d5, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1 : tensor<?x2x4x8x?x4x2x16xi8>) outs(%23 : tensor<?x?x4x2x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x4x2x4x16x2x8xi8> | |
%25 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%6] | |
%26 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%7] | |
%27 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%6] | |
%28 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%7] | |
%29 = tensor.empty(%27, %28) : tensor<?x?xi32> | |
%30 = linalg.fill ins(%c0_i32 : i32) outs(%29 : tensor<?x?xi32>) -> tensor<?x?xi32> | |
%inserted_slice_2 = tensor.insert_slice %8 into %30[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> into tensor<?x?xi32> | |
%expanded_3 = tensor.expand_shape %inserted_slice_2 [[0, 1, 2, 3], [4, 5, 6, 7]] output_shape [%25, 8, 4, 4, %26, 4, 2, 16] : tensor<?x?xi32> into tensor<?x8x4x4x?x4x2x16xi32> | |
%31 = tensor.empty(%25, %26) : tensor<?x?x8x4x2x4x16x4xi32> | |
%32 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d2, d3, d4, d1, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_3 : tensor<?x8x4x4x?x4x2x16xi32>) outs(%31 : tensor<?x?x8x4x2x4x16x4xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%33 = iree_gpu.multi_mma %16, %24, %32 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
%34 = tensor.empty(%25, %26) : tensor<?x8x4x4x?x4x2x16xi32> | |
%35 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d2, d3, d4, d1, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%33 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%34 : tensor<?x8x4x4x?x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x8x4x4x?x4x2x16xi32> | |
%collapsed = tensor.collapse_shape %35 [[0, 1, 2, 3], [4, 5, 6, 7]] : tensor<?x8x4x4x?x4x2x16xi32> into tensor<?x?xi32> | |
%extracted_slice = tensor.extract_slice %collapsed[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> to tensor<?x?xi32> | |
%36 = hal.tensor.export %extracted_slice "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %36 : !hal.buffer_view | |
} | |
// -----// IR Dump After FormDispatchRegionsPass (iree-dispatch-creation-form-dispatch-regions) //----- // | |
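// Dispatch regions are formed around the compute ops: each of the three
// data-tiling relayout linalg.generics, the iree_gpu.multi_mma, and the
// result-unswizzling linalg.generic is wrapped in a flow.dispatch.region that
// records its dynamic result extents (e.g. {%9, %10}). The fills,
// insert_slices and expand/collapse_shapes remain outside for later passes to
// clone in or convert.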
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%0] | |
%10 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%1] | |
%11 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%0] | |
%12 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%1] | |
%13 = tensor.empty(%11, %12) : tensor<?x?xi8> | |
%14 = linalg.fill ins(%c0_i8 : i8) outs(%13 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice = tensor.insert_slice %2 into %14[0, 0] [%0, %1] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%expanded = tensor.expand_shape %inserted_slice [[0, 1, 2], [3, 4, 5, 6]] output_shape [%9, 8, 16, %10, 2, 4, 8] : tensor<?x?xi8> into tensor<?x8x16x?x2x4x8xi8> | |
%15 = tensor.empty(%9, %10) : tensor<?x?x8x4x16x2x8xi8> | |
%16 = flow.dispatch.region -> (tensor<?x?x8x4x16x2x8xi8>{%9, %10}) { | |
%37 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d2, d3, d1, d4, d5, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d5, d3, d4, d6)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded : tensor<?x8x16x?x2x4x8xi8>) outs(%15 : tensor<?x?x8x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x8x4x16x2x8xi8> | |
flow.return %37 : tensor<?x?x8x4x16x2x8xi8> | |
} | |
%17 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%4] | |
%18 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%3] | |
%19 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%3] | |
%20 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%4] | |
%21 = tensor.empty(%19, %20) : tensor<?x?xi8> | |
%22 = linalg.fill ins(%c0_i8 : i8) outs(%21 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice_0 = tensor.insert_slice %5 into %22[0, 0] [%3, %4] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%expanded_1 = tensor.expand_shape %inserted_slice_0 [[0, 1, 2, 3], [4, 5, 6, 7]] output_shape [%18, 2, 4, 8, %17, 4, 2, 16] : tensor<?x?xi8> into tensor<?x2x4x8x?x4x2x16xi8> | |
%23 = tensor.empty(%17, %18) : tensor<?x?x4x2x4x16x2x8xi8> | |
%24 = flow.dispatch.region -> (tensor<?x?x4x2x4x16x2x8xi8>{%17, %18}) { | |
%37 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d1, d5, d6, d7, d0, d2, d3, d4)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d6, d4, d5, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1 : tensor<?x2x4x8x?x4x2x16xi8>) outs(%23 : tensor<?x?x4x2x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x4x2x4x16x2x8xi8> | |
flow.return %37 : tensor<?x?x4x2x4x16x2x8xi8> | |
} | |
%25 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%6] | |
%26 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%7] | |
%27 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%6] | |
%28 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%7] | |
%29 = tensor.empty(%27, %28) : tensor<?x?xi32> | |
%30 = linalg.fill ins(%c0_i32 : i32) outs(%29 : tensor<?x?xi32>) -> tensor<?x?xi32> | |
%inserted_slice_2 = tensor.insert_slice %8 into %30[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> into tensor<?x?xi32> | |
%expanded_3 = tensor.expand_shape %inserted_slice_2 [[0, 1, 2, 3], [4, 5, 6, 7]] output_shape [%25, 8, 4, 4, %26, 4, 2, 16] : tensor<?x?xi32> into tensor<?x8x4x4x?x4x2x16xi32> | |
%31 = tensor.empty(%25, %26) : tensor<?x?x8x4x2x4x16x4xi32> | |
%32 = flow.dispatch.region -> (tensor<?x?x8x4x2x4x16x4xi32>{%25, %26}) { | |
%37 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d2, d3, d4, d1, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_3 : tensor<?x8x4x4x?x4x2x16xi32>) outs(%31 : tensor<?x?x8x4x2x4x16x4xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x2x4x16x4xi32> | |
flow.return %37 : tensor<?x?x8x4x2x4x16x4xi32> | |
} | |
%33 = flow.dispatch.region -> (tensor<?x?x8x4x2x4x16x4xi32>{%25, %26}) { | |
%37 = iree_gpu.multi_mma %16, %24, %32 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
flow.return %37 : tensor<?x?x8x4x2x4x16x4xi32> | |
} | |
%34 = tensor.empty(%25, %26) : tensor<?x8x4x4x?x4x2x16xi32> | |
%35 = flow.dispatch.region -> (tensor<?x8x4x4x?x4x2x16xi32>{%25, %26}) { | |
%37 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d2, d3, d4, d1, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%33 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%34 : tensor<?x8x4x4x?x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x8x4x4x?x4x2x16xi32> | |
flow.return %37 : tensor<?x8x4x4x?x4x2x16xi32> | |
} | |
%collapsed = tensor.collapse_shape %35 [[0, 1, 2, 3], [4, 5, 6, 7]] : tensor<?x8x4x4x?x4x2x16xi32> into tensor<?x?xi32> | |
%extracted_slice = tensor.extract_slice %collapsed[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> to tensor<?x?xi32> | |
%36 = hal.tensor.export %extracted_slice "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %36 : !hal.buffer_view | |
} | |
// -----// IR Dump After CloneProducersIntoDispatchRegionsPass (iree-dispatch-creation-clone-producers-into-dispatch-regions) //----- // | |
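// The tensor.empty ops that served as relayout destinations are cloned from
// the function body into their consuming dispatch regions (each region now
// begins with its own tensor.empty), so a region captures only its data
// operands and dynamic dimension values rather than an empty init tensor.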
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%0] | |
%10 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%1] | |
%11 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%0] | |
%12 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%1] | |
%13 = tensor.empty(%11, %12) : tensor<?x?xi8> | |
%14 = linalg.fill ins(%c0_i8 : i8) outs(%13 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice = tensor.insert_slice %2 into %14[0, 0] [%0, %1] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%expanded = tensor.expand_shape %inserted_slice [[0, 1, 2], [3, 4, 5, 6]] output_shape [%9, 8, 16, %10, 2, 4, 8] : tensor<?x?xi8> into tensor<?x8x16x?x2x4x8xi8> | |
%15 = flow.dispatch.region -> (tensor<?x?x8x4x16x2x8xi8>{%9, %10}) { | |
%33 = tensor.empty(%9, %10) : tensor<?x?x8x4x16x2x8xi8> | |
%34 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d2, d3, d1, d4, d5, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d5, d3, d4, d6)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded : tensor<?x8x16x?x2x4x8xi8>) outs(%33 : tensor<?x?x8x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x8x4x16x2x8xi8> | |
flow.return %34 : tensor<?x?x8x4x16x2x8xi8> | |
} | |
%16 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%4] | |
%17 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%3] | |
%18 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%3] | |
%19 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%4] | |
%20 = tensor.empty(%18, %19) : tensor<?x?xi8> | |
%21 = linalg.fill ins(%c0_i8 : i8) outs(%20 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice_0 = tensor.insert_slice %5 into %21[0, 0] [%3, %4] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%expanded_1 = tensor.expand_shape %inserted_slice_0 [[0, 1, 2, 3], [4, 5, 6, 7]] output_shape [%17, 2, 4, 8, %16, 4, 2, 16] : tensor<?x?xi8> into tensor<?x2x4x8x?x4x2x16xi8> | |
%22 = flow.dispatch.region -> (tensor<?x?x4x2x4x16x2x8xi8>{%16, %17}) { | |
%33 = tensor.empty(%16, %17) : tensor<?x?x4x2x4x16x2x8xi8> | |
%34 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d1, d5, d6, d7, d0, d2, d3, d4)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d6, d4, d5, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1 : tensor<?x2x4x8x?x4x2x16xi8>) outs(%33 : tensor<?x?x4x2x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x4x2x4x16x2x8xi8> | |
flow.return %34 : tensor<?x?x4x2x4x16x2x8xi8> | |
} | |
%23 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%6] | |
%24 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%7] | |
%25 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%6] | |
%26 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%7] | |
%27 = tensor.empty(%25, %26) : tensor<?x?xi32> | |
%28 = linalg.fill ins(%c0_i32 : i32) outs(%27 : tensor<?x?xi32>) -> tensor<?x?xi32> | |
%inserted_slice_2 = tensor.insert_slice %8 into %28[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> into tensor<?x?xi32> | |
%expanded_3 = tensor.expand_shape %inserted_slice_2 [[0, 1, 2, 3], [4, 5, 6, 7]] output_shape [%23, 8, 4, 4, %24, 4, 2, 16] : tensor<?x?xi32> into tensor<?x8x4x4x?x4x2x16xi32> | |
%29 = flow.dispatch.region -> (tensor<?x?x8x4x2x4x16x4xi32>{%23, %24}) { | |
%33 = tensor.empty(%23, %24) : tensor<?x?x8x4x2x4x16x4xi32> | |
%34 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d2, d3, d4, d1, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_3 : tensor<?x8x4x4x?x4x2x16xi32>) outs(%33 : tensor<?x?x8x4x2x4x16x4xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x2x4x16x4xi32> | |
flow.return %34 : tensor<?x?x8x4x2x4x16x4xi32> | |
} | |
%30 = flow.dispatch.region -> (tensor<?x?x8x4x2x4x16x4xi32>{%23, %24}) { | |
%33 = iree_gpu.multi_mma %15, %22, %29 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
flow.return %33 : tensor<?x?x8x4x2x4x16x4xi32> | |
} | |
%31 = flow.dispatch.region -> (tensor<?x8x4x4x?x4x2x16xi32>{%23, %24}) { | |
%33 = tensor.empty(%23, %24) : tensor<?x8x4x4x?x4x2x16xi32> | |
%34 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d2, d3, d4, d1, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%30 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%33 : tensor<?x8x4x4x?x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x8x4x4x?x4x2x16xi32> | |
flow.return %34 : tensor<?x8x4x4x?x4x2x16xi32> | |
} | |
%collapsed = tensor.collapse_shape %31 [[0, 1, 2, 3], [4, 5, 6, 7]] : tensor<?x8x4x4x?x4x2x16xi32> into tensor<?x?xi32> | |
%extracted_slice = tensor.extract_slice %collapsed[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> to tensor<?x?xi32> | |
%32 = hal.tensor.export %extracted_slice "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %32 : !hal.buffer_view | |
} | |
// -----// IR Dump After CollapseDimensionsPass (iree-dispatch-creation-collapse-dimensions) //----- // | |
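// No dimensions are collapsed: the IR below is identical to the previous
// dump, likely because the transposing indexing maps in these relayouts leave
// no adjacent parallel dimensions that can be merged into one.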
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%0] | |
%10 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%1] | |
%11 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%0] | |
%12 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%1] | |
%13 = tensor.empty(%11, %12) : tensor<?x?xi8> | |
%14 = linalg.fill ins(%c0_i8 : i8) outs(%13 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice = tensor.insert_slice %2 into %14[0, 0] [%0, %1] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%expanded = tensor.expand_shape %inserted_slice [[0, 1, 2], [3, 4, 5, 6]] output_shape [%9, 8, 16, %10, 2, 4, 8] : tensor<?x?xi8> into tensor<?x8x16x?x2x4x8xi8> | |
%15 = flow.dispatch.region -> (tensor<?x?x8x4x16x2x8xi8>{%9, %10}) { | |
%33 = tensor.empty(%9, %10) : tensor<?x?x8x4x16x2x8xi8> | |
%34 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d2, d3, d1, d4, d5, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d5, d3, d4, d6)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded : tensor<?x8x16x?x2x4x8xi8>) outs(%33 : tensor<?x?x8x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x8x4x16x2x8xi8> | |
flow.return %34 : tensor<?x?x8x4x16x2x8xi8> | |
} | |
%16 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%4] | |
%17 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%3] | |
%18 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%3] | |
%19 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%4] | |
%20 = tensor.empty(%18, %19) : tensor<?x?xi8> | |
%21 = linalg.fill ins(%c0_i8 : i8) outs(%20 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice_0 = tensor.insert_slice %5 into %21[0, 0] [%3, %4] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%expanded_1 = tensor.expand_shape %inserted_slice_0 [[0, 1, 2, 3], [4, 5, 6, 7]] output_shape [%17, 2, 4, 8, %16, 4, 2, 16] : tensor<?x?xi8> into tensor<?x2x4x8x?x4x2x16xi8> | |
%22 = flow.dispatch.region -> (tensor<?x?x4x2x4x16x2x8xi8>{%16, %17}) { | |
%33 = tensor.empty(%16, %17) : tensor<?x?x4x2x4x16x2x8xi8> | |
%34 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d1, d5, d6, d7, d0, d2, d3, d4)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d6, d4, d5, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_1 : tensor<?x2x4x8x?x4x2x16xi8>) outs(%33 : tensor<?x?x4x2x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x4x2x4x16x2x8xi8> | |
flow.return %34 : tensor<?x?x4x2x4x16x2x8xi8> | |
} | |
%23 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%6] | |
%24 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%7] | |
%25 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%6] | |
%26 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%7] | |
%27 = tensor.empty(%25, %26) : tensor<?x?xi32> | |
%28 = linalg.fill ins(%c0_i32 : i32) outs(%27 : tensor<?x?xi32>) -> tensor<?x?xi32> | |
%inserted_slice_2 = tensor.insert_slice %8 into %28[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> into tensor<?x?xi32> | |
%expanded_3 = tensor.expand_shape %inserted_slice_2 [[0, 1, 2, 3], [4, 5, 6, 7]] output_shape [%23, 8, 4, 4, %24, 4, 2, 16] : tensor<?x?xi32> into tensor<?x8x4x4x?x4x2x16xi32> | |
%29 = flow.dispatch.region -> (tensor<?x?x8x4x2x4x16x4xi32>{%23, %24}) { | |
%33 = tensor.empty(%23, %24) : tensor<?x?x8x4x2x4x16x4xi32> | |
%34 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d2, d3, d4, d1, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%expanded_3 : tensor<?x8x4x4x?x4x2x16xi32>) outs(%33 : tensor<?x?x8x4x2x4x16x4xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x2x4x16x4xi32> | |
flow.return %34 : tensor<?x?x8x4x2x4x16x4xi32> | |
} | |
%30 = flow.dispatch.region -> (tensor<?x?x8x4x2x4x16x4xi32>{%23, %24}) { | |
%33 = iree_gpu.multi_mma %15, %22, %29 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
flow.return %33 : tensor<?x?x8x4x2x4x16x4xi32> | |
} | |
%31 = flow.dispatch.region -> (tensor<?x8x4x4x?x4x2x16xi32>{%23, %24}) { | |
%33 = tensor.empty(%23, %24) : tensor<?x8x4x4x?x4x2x16xi32> | |
%34 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d2, d3, d4, d1, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%30 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%33 : tensor<?x8x4x4x?x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x8x4x4x?x4x2x16xi32> | |
flow.return %34 : tensor<?x8x4x4x?x4x2x16xi32> | |
} | |
%collapsed = tensor.collapse_shape %31 [[0, 1, 2, 3], [4, 5, 6, 7]] : tensor<?x8x4x4x?x4x2x16xi32> into tensor<?x?xi32> | |
%extracted_slice = tensor.extract_slice %collapsed[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> to tensor<?x?xi32> | |
%32 = hal.tensor.export %extracted_slice "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %32 : !hal.buffer_view | |
} | |
// -----// IR Dump After ConvertDispatchRegionsToWorkgroupsPass (iree-dispatch-creation-convert-dispatch-regions-to-workgroups) //----- // | |
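// Each flow.dispatch.region is rewritten as a flow.dispatch.workgroups op
// with an explicit ABI: tensor operands become !flow.dispatch.tensor bindings
// (readonly / writeonly / readwrite) accessed through
// flow.dispatch.tensor.load / flow.dispatch.tensor.store, and dynamic extents
// are passed in as index operands (hence the new tensor.dim ops and the
// duplicated %9, %10, ... arguments, which canonicalization later dedupes).
// Note the multi_mma dispatch updates its accumulator in place: its result is
// tied to the input (`-> %29{%23, %24}`) through a readwrite binding.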
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%0] | |
%10 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%1] | |
%11 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%0] | |
%12 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%1] | |
%13 = tensor.empty(%11, %12) : tensor<?x?xi8> | |
%14 = linalg.fill ins(%c0_i8 : i8) outs(%13 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice = tensor.insert_slice %2 into %14[0, 0] [%0, %1] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%expanded = tensor.expand_shape %inserted_slice [[0, 1, 2], [3, 4, 5, 6]] output_shape [%9, 8, 16, %10, 2, 4, 8] : tensor<?x?xi8> into tensor<?x8x16x?x2x4x8xi8> | |
%c0 = arith.constant 0 : index | |
%dim = tensor.dim %expanded, %c0 : tensor<?x8x16x?x2x4x8xi8> | |
%c3 = arith.constant 3 : index | |
%dim_0 = tensor.dim %expanded, %c3 : tensor<?x8x16x?x2x4x8xi8> | |
%15 = flow.dispatch.workgroups(%9, %10, %expanded, %dim, %dim_0, %9, %10) : (index, index, tensor<?x8x16x?x2x4x8xi8>{%dim, %dim_0}, index, index, index, index) -> tensor<?x?x8x4x16x2x8xi8>{%9, %10} = | |
(%arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<readonly:tensor<?x8x16x?x2x4x8xi8>>, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x16x2x8xi8>>) { | |
%33 = flow.dispatch.tensor.load %arg5, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%arg6, 8, 16, %arg7, 2, 4, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x8x16x?x2x4x8xi8>>{%arg6, %arg7} -> tensor<?x8x16x?x2x4x8xi8> | |
%34 = tensor.empty(%arg8, %arg9) : tensor<?x?x8x4x16x2x8xi8> | |
%35 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d2, d3, d1, d4, d5, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d5, d3, d4, d6)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%33 : tensor<?x8x16x?x2x4x8xi8>) outs(%34 : tensor<?x?x8x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x8x4x16x2x8xi8> | |
flow.dispatch.tensor.store %35, %arg10, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%arg8, %arg9, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x16x2x8xi8> -> !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x16x2x8xi8>>{%arg8, %arg9} | |
flow.return | |
} | |
%16 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%4] | |
%17 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%3] | |
%18 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%3] | |
%19 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%4] | |
%20 = tensor.empty(%18, %19) : tensor<?x?xi8> | |
%21 = linalg.fill ins(%c0_i8 : i8) outs(%20 : tensor<?x?xi8>) -> tensor<?x?xi8> | |
%inserted_slice_1 = tensor.insert_slice %5 into %21[0, 0] [%3, %4] [1, 1] : tensor<?x?xi8> into tensor<?x?xi8> | |
%expanded_2 = tensor.expand_shape %inserted_slice_1 [[0, 1, 2, 3], [4, 5, 6, 7]] output_shape [%17, 2, 4, 8, %16, 4, 2, 16] : tensor<?x?xi8> into tensor<?x2x4x8x?x4x2x16xi8> | |
%c0_3 = arith.constant 0 : index | |
%dim_4 = tensor.dim %expanded_2, %c0_3 : tensor<?x2x4x8x?x4x2x16xi8> | |
%c4 = arith.constant 4 : index | |
%dim_5 = tensor.dim %expanded_2, %c4 : tensor<?x2x4x8x?x4x2x16xi8> | |
%22 = flow.dispatch.workgroups(%16, %17, %expanded_2, %dim_4, %dim_5, %16, %17) : (index, index, tensor<?x2x4x8x?x4x2x16xi8>{%dim_4, %dim_5}, index, index, index, index) -> tensor<?x?x4x2x4x16x2x8xi8>{%16, %17} = | |
(%arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<readonly:tensor<?x2x4x8x?x4x2x16xi8>>, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<?x?x4x2x4x16x2x8xi8>>) { | |
%33 = flow.dispatch.tensor.load %arg5, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%arg6, 2, 4, 8, %arg7, 4, 2, 16], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x2x4x8x?x4x2x16xi8>>{%arg6, %arg7} -> tensor<?x2x4x8x?x4x2x16xi8> | |
%34 = tensor.empty(%arg8, %arg9) : tensor<?x?x4x2x4x16x2x8xi8> | |
%35 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d1, d5, d6, d7, d0, d2, d3, d4)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d6, d4, d5, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%33 : tensor<?x2x4x8x?x4x2x16xi8>) outs(%34 : tensor<?x?x4x2x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x4x2x4x16x2x8xi8> | |
flow.dispatch.tensor.store %35, %arg10, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%arg8, %arg9, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x4x2x4x16x2x8xi8> -> !flow.dispatch.tensor<writeonly:tensor<?x?x4x2x4x16x2x8xi8>>{%arg8, %arg9} | |
flow.return | |
} | |
%23 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%6] | |
%24 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%7] | |
%25 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%6] | |
%26 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%7] | |
%27 = tensor.empty(%25, %26) : tensor<?x?xi32> | |
%28 = linalg.fill ins(%c0_i32 : i32) outs(%27 : tensor<?x?xi32>) -> tensor<?x?xi32> | |
%inserted_slice_6 = tensor.insert_slice %8 into %28[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> into tensor<?x?xi32> | |
%expanded_7 = tensor.expand_shape %inserted_slice_6 [[0, 1, 2, 3], [4, 5, 6, 7]] output_shape [%23, 8, 4, 4, %24, 4, 2, 16] : tensor<?x?xi32> into tensor<?x8x4x4x?x4x2x16xi32> | |
%c0_8 = arith.constant 0 : index | |
%dim_9 = tensor.dim %expanded_7, %c0_8 : tensor<?x8x4x4x?x4x2x16xi32> | |
%c4_10 = arith.constant 4 : index | |
%dim_11 = tensor.dim %expanded_7, %c4_10 : tensor<?x8x4x4x?x4x2x16xi32> | |
%29 = flow.dispatch.workgroups(%23, %24, %expanded_7, %dim_9, %dim_11, %23, %24) : (index, index, tensor<?x8x4x4x?x4x2x16xi32>{%dim_9, %dim_11}, index, index, index, index) -> tensor<?x?x8x4x2x4x16x4xi32>{%23, %24} = | |
(%arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<readonly:tensor<?x8x4x4x?x4x2x16xi32>>, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x2x4x16x4xi32>>) { | |
%33 = flow.dispatch.tensor.load %arg5, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%arg6, 8, 4, 4, %arg7, 4, 2, 16], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x8x4x4x?x4x2x16xi32>>{%arg6, %arg7} -> tensor<?x8x4x4x?x4x2x16xi32> | |
%34 = tensor.empty(%arg8, %arg9) : tensor<?x?x8x4x2x4x16x4xi32> | |
%35 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d2, d3, d4, d1, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%33 : tensor<?x8x4x4x?x4x2x16xi32>) outs(%34 : tensor<?x?x8x4x2x4x16x4xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x2x4x16x4xi32> | |
flow.dispatch.tensor.store %35, %arg10, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%arg8, %arg9, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x2x4x16x4xi32>>{%arg8, %arg9} | |
flow.return | |
} | |
%30 = flow.dispatch.workgroups(%15, %22, %29, %9, %10, %16, %17, %23, %24) : (tensor<?x?x8x4x16x2x8xi8>{%9, %10}, tensor<?x?x4x2x4x16x2x8xi8>{%16, %17}, tensor<?x?x8x4x2x4x16x4xi32>{%23, %24}, index, index, index, index, index, index) -> %29{%23, %24} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>, %arg4: !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>, %arg5: !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: index, %arg11: index) { | |
%33 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%arg6, %arg7, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%arg6, %arg7} -> tensor<?x?x8x4x16x2x8xi8> | |
%34 = flow.dispatch.tensor.load %arg4, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%arg8, %arg9, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%arg8, %arg9} -> tensor<?x?x4x2x4x16x2x8xi8> | |
%35 = flow.dispatch.tensor.load %arg5, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%arg10, %arg11, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%arg10, %arg11} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%36 = iree_gpu.multi_mma %33, %34, %35 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
flow.dispatch.tensor.store %36, %arg5, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%arg10, %arg11, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%arg10, %arg11} | |
flow.return | |
} | |
%31 = flow.dispatch.workgroups(%23, %24, %30, %23, %24, %23, %24) : (index, index, tensor<?x?x8x4x2x4x16x4xi32>{%23, %24}, index, index, index, index) -> tensor<?x8x4x4x?x4x2x16xi32>{%23, %24} = | |
(%arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<readonly:tensor<?x?x8x4x2x4x16x4xi32>>, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<?x8x4x4x?x4x2x16xi32>>) { | |
%33 = flow.dispatch.tensor.load %arg5, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%arg8, %arg9, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x2x4x16x4xi32>>{%arg8, %arg9} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%34 = tensor.empty(%arg8, %arg9) : tensor<?x8x4x4x?x4x2x16xi32> | |
%35 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d2, d3, d4, d1, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%33 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%34 : tensor<?x8x4x4x?x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x8x4x4x?x4x2x16xi32> | |
flow.dispatch.tensor.store %35, %arg10, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%arg8, 8, 4, 4, %arg9, 4, 2, 16], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x8x4x4x?x4x2x16xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x8x4x4x?x4x2x16xi32>>{%arg8, %arg9} | |
flow.return | |
} | |
%collapsed = tensor.collapse_shape %31 [[0, 1, 2, 3], [4, 5, 6, 7]] : tensor<?x8x4x4x?x4x2x16xi32> into tensor<?x?xi32> | |
%extracted_slice = tensor.extract_slice %collapsed[0, 0] [%6, %7] [1, 1] : tensor<?x?xi32> to tensor<?x?xi32> | |
%32 = hal.tensor.export %extracted_slice "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %32 : !hal.buffer_view | |
} | |
// -----// IR Dump After ConvertTensorToFlowPass (iree-dispatch-creation-convert-tensor-to-flow) //----- // | |
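// The tensor ops left outside the dispatches are lowered to flow ops:
// linalg.fill becomes flow.tensor.splat, tensor.expand_shape and
// tensor.collapse_shape become flow.tensor.reshape, and the padding
// tensor.insert_slice / final tensor.extract_slice each turn into a small
// copy dispatch (the insert writes into the splatted tensor via a readwrite
// binding). The reshape extents are recomputed with `floordiv` affine maps
// over the padded sizes; the canonicalizer below folds these back to the
// original `ceildiv` forms.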
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%0] | |
%10 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%1] | |
%11 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%0] | |
%12 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%1] | |
%13 = flow.tensor.splat %c0_i8 : tensor<?x?xi8>{%11, %12} | |
%14 = flow.dispatch.workgroups(%2, %13, %0, %1, %0, %1, %11, %12) : (tensor<?x?xi8>{%0, %1}, tensor<?x?xi8>{%11, %12}, index, index, index, index, index, index) -> %13{%11, %12} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xi8>>, %arg4: !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: index) { | |
%54 = flow.dispatch.tensor.load %arg3, offsets = [0, 0], sizes = [%arg7, %arg8], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%arg7, %arg8} -> tensor<?x?xi8> | |
flow.dispatch.tensor.store %54, %arg4, offsets = [0, 0], sizes = [%arg7, %arg8], strides = [1, 1] : tensor<?x?xi8> -> !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>{%arg9, %arg10} | |
flow.return | |
} | |
%15 = affine.apply affine_map<()[s0] -> (s0 floordiv 128)>()[%11] | |
%16 = affine.apply affine_map<()[s0] -> (s0 floordiv 64)>()[%12] | |
%17 = affine.apply affine_map<()[s0] -> (s0 floordiv 128)>()[%11] | |
%18 = affine.apply affine_map<()[s0] -> (s0 floordiv 64)>()[%12] | |
%19 = flow.tensor.reshape %14 : tensor<?x?xi8>{%11, %12} -> tensor<?x8x16x?x2x4x8xi8>{%17, %18} | |
%20 = flow.dispatch.workgroups(%9, %10, %19, %15, %16, %9, %10) : (index, index, tensor<?x8x16x?x2x4x8xi8>{%15, %16}, index, index, index, index) -> tensor<?x?x8x4x16x2x8xi8>{%9, %10} = | |
(%arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<readonly:tensor<?x8x16x?x2x4x8xi8>>, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x16x2x8xi8>>) { | |
%54 = flow.dispatch.tensor.load %arg5, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%arg6, 8, 16, %arg7, 2, 4, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x8x16x?x2x4x8xi8>>{%arg6, %arg7} -> tensor<?x8x16x?x2x4x8xi8> | |
%55 = tensor.empty(%arg8, %arg9) : tensor<?x?x8x4x16x2x8xi8> | |
%56 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d2, d3, d1, d4, d5, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d5, d3, d4, d6)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%54 : tensor<?x8x16x?x2x4x8xi8>) outs(%55 : tensor<?x?x8x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x8x4x16x2x8xi8> | |
flow.dispatch.tensor.store %56, %arg10, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%arg8, %arg9, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x16x2x8xi8> -> !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x16x2x8xi8>>{%arg8, %arg9} | |
flow.return | |
} | |
%21 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%4] | |
%22 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%3] | |
%23 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%3] | |
%24 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%4] | |
%25 = flow.tensor.splat %c0_i8 : tensor<?x?xi8>{%23, %24} | |
%26 = flow.dispatch.workgroups(%5, %25, %3, %4, %3, %4, %23, %24) : (tensor<?x?xi8>{%3, %4}, tensor<?x?xi8>{%23, %24}, index, index, index, index, index, index) -> %25{%23, %24} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xi8>>, %arg4: !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: index) { | |
%54 = flow.dispatch.tensor.load %arg3, offsets = [0, 0], sizes = [%arg7, %arg8], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%arg7, %arg8} -> tensor<?x?xi8> | |
flow.dispatch.tensor.store %54, %arg4, offsets = [0, 0], sizes = [%arg7, %arg8], strides = [1, 1] : tensor<?x?xi8> -> !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>{%arg9, %arg10} | |
flow.return | |
} | |
%27 = affine.apply affine_map<()[s0] -> (s0 floordiv 64)>()[%23] | |
%28 = affine.apply affine_map<()[s0] -> (s0 floordiv 128)>()[%24] | |
%29 = affine.apply affine_map<()[s0] -> (s0 floordiv 64)>()[%23] | |
%30 = affine.apply affine_map<()[s0] -> (s0 floordiv 128)>()[%24] | |
%31 = flow.tensor.reshape %26 : tensor<?x?xi8>{%23, %24} -> tensor<?x2x4x8x?x4x2x16xi8>{%29, %30} | |
%32 = flow.dispatch.workgroups(%21, %22, %31, %27, %28, %21, %22) : (index, index, tensor<?x2x4x8x?x4x2x16xi8>{%27, %28}, index, index, index, index) -> tensor<?x?x4x2x4x16x2x8xi8>{%21, %22} = | |
(%arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<readonly:tensor<?x2x4x8x?x4x2x16xi8>>, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<?x?x4x2x4x16x2x8xi8>>) { | |
%54 = flow.dispatch.tensor.load %arg5, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%arg6, 2, 4, 8, %arg7, 4, 2, 16], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x2x4x8x?x4x2x16xi8>>{%arg6, %arg7} -> tensor<?x2x4x8x?x4x2x16xi8> | |
%55 = tensor.empty(%arg8, %arg9) : tensor<?x?x4x2x4x16x2x8xi8> | |
%56 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d1, d5, d6, d7, d0, d2, d3, d4)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d6, d4, d5, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%54 : tensor<?x2x4x8x?x4x2x16xi8>) outs(%55 : tensor<?x?x4x2x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x4x2x4x16x2x8xi8> | |
flow.dispatch.tensor.store %56, %arg10, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%arg8, %arg9, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x4x2x4x16x2x8xi8> -> !flow.dispatch.tensor<writeonly:tensor<?x?x4x2x4x16x2x8xi8>>{%arg8, %arg9} | |
flow.return | |
} | |
%33 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%6] | |
%34 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%7] | |
%35 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%6] | |
%36 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%7] | |
%37 = flow.tensor.splat %c0_i32 : tensor<?x?xi32>{%35, %36} | |
%38 = flow.dispatch.workgroups(%8, %37, %6, %7, %6, %7, %35, %36) : (tensor<?x?xi32>{%6, %7}, tensor<?x?xi32>{%35, %36}, index, index, index, index, index, index) -> %37{%35, %36} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xi32>>, %arg4: !flow.dispatch.tensor<readwrite:tensor<?x?xi32>>, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: index) { | |
%54 = flow.dispatch.tensor.load %arg3, offsets = [0, 0], sizes = [%arg7, %arg8], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%arg7, %arg8} -> tensor<?x?xi32> | |
flow.dispatch.tensor.store %54, %arg4, offsets = [0, 0], sizes = [%arg7, %arg8], strides = [1, 1] : tensor<?x?xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?xi32>>{%arg9, %arg10} | |
flow.return | |
} | |
%39 = affine.apply affine_map<()[s0] -> (s0 floordiv 128)>()[%35] | |
%40 = affine.apply affine_map<()[s0] -> (s0 floordiv 128)>()[%36] | |
%41 = affine.apply affine_map<()[s0] -> (s0 floordiv 128)>()[%35] | |
%42 = affine.apply affine_map<()[s0] -> (s0 floordiv 128)>()[%36] | |
%43 = flow.tensor.reshape %38 : tensor<?x?xi32>{%35, %36} -> tensor<?x8x4x4x?x4x2x16xi32>{%41, %42} | |
%44 = flow.dispatch.workgroups(%33, %34, %43, %39, %40, %33, %34) : (index, index, tensor<?x8x4x4x?x4x2x16xi32>{%39, %40}, index, index, index, index) -> tensor<?x?x8x4x2x4x16x4xi32>{%33, %34} = | |
(%arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<readonly:tensor<?x8x4x4x?x4x2x16xi32>>, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x2x4x16x4xi32>>) { | |
%54 = flow.dispatch.tensor.load %arg5, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%arg6, 8, 4, 4, %arg7, 4, 2, 16], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x8x4x4x?x4x2x16xi32>>{%arg6, %arg7} -> tensor<?x8x4x4x?x4x2x16xi32> | |
%55 = tensor.empty(%arg8, %arg9) : tensor<?x?x8x4x2x4x16x4xi32> | |
%56 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d2, d3, d4, d1, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%54 : tensor<?x8x4x4x?x4x2x16xi32>) outs(%55 : tensor<?x?x8x4x2x4x16x4xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x2x4x16x4xi32> | |
flow.dispatch.tensor.store %56, %arg10, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%arg8, %arg9, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x2x4x16x4xi32>>{%arg8, %arg9} | |
flow.return | |
} | |
%45 = flow.dispatch.workgroups(%20, %32, %44, %9, %10, %21, %22, %33, %34) : (tensor<?x?x8x4x16x2x8xi8>{%9, %10}, tensor<?x?x4x2x4x16x2x8xi8>{%21, %22}, tensor<?x?x8x4x2x4x16x4xi32>{%33, %34}, index, index, index, index, index, index) -> %44{%33, %34} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>, %arg4: !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>, %arg5: !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: index, %arg11: index) { | |
%54 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%arg6, %arg7, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%arg6, %arg7} -> tensor<?x?x8x4x16x2x8xi8> | |
%55 = flow.dispatch.tensor.load %arg4, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%arg8, %arg9, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%arg8, %arg9} -> tensor<?x?x4x2x4x16x2x8xi8> | |
%56 = flow.dispatch.tensor.load %arg5, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%arg10, %arg11, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%arg10, %arg11} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%57 = iree_gpu.multi_mma %54, %55, %56 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
flow.dispatch.tensor.store %57, %arg5, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%arg10, %arg11, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%arg10, %arg11} | |
flow.return | |
} | |
%46 = flow.dispatch.workgroups(%33, %34, %45, %33, %34, %33, %34) : (index, index, tensor<?x?x8x4x2x4x16x4xi32>{%33, %34}, index, index, index, index) -> tensor<?x8x4x4x?x4x2x16xi32>{%33, %34} = | |
(%arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<readonly:tensor<?x?x8x4x2x4x16x4xi32>>, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<?x8x4x4x?x4x2x16xi32>>) { | |
%54 = flow.dispatch.tensor.load %arg5, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%arg8, %arg9, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x2x4x16x4xi32>>{%arg8, %arg9} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%55 = tensor.empty(%arg8, %arg9) : tensor<?x8x4x4x?x4x2x16xi32> | |
%56 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d2, d3, d4, d1, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%54 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%55 : tensor<?x8x4x4x?x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x8x4x4x?x4x2x16xi32> | |
flow.dispatch.tensor.store %56, %arg10, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%arg8, 8, 4, 4, %arg9, 4, 2, 16], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x8x4x4x?x4x2x16xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x8x4x4x?x4x2x16xi32>>{%arg8, %arg9} | |
flow.return | |
} | |
%47 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%33] | |
%48 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%34] | |
%49 = flow.tensor.reshape %46 : tensor<?x8x4x4x?x4x2x16xi32>{%33, %34} -> tensor<?x?xi32>{%47, %48} | |
%50 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%33] | |
%51 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%34] | |
%52 = flow.dispatch.workgroups(%49, %6, %7, %50, %51, %6, %7) : (tensor<?x?xi32>{%50, %51}, index, index, index, index, index, index) -> tensor<?x?xi32>{%6, %7} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xi32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>) { | |
%54 = flow.dispatch.tensor.load %arg3, offsets = [0, 0], sizes = [%arg8, %arg9], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%arg6, %arg7} -> tensor<?x?xi32> | |
flow.dispatch.tensor.store %54, %arg10, offsets = [0, 0], sizes = [%arg8, %arg9], strides = [1, 1] : tensor<?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>{%arg8, %arg9} | |
flow.return | |
} | |
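// NOTE: this final dispatch drops the padding: it loads only the leading
// %6 x %7 block of the 128-aligned result and stores it to the output.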
%53 = hal.tensor.export %52 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %53 : !hal.buffer_view | |
} | |
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
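// NOTE: observable effects of canonicalization on this function, comparing with
// the dump above: affine maps are composed -- the two-step "s0 ceildiv 128"
// followed by "s0 * 128" becomes the single map "(s0 ceildiv 128) * 128"
// applied directly to the imported dims -- and duplicate dispatch operands are
// deduplicated, e.g. %46 previously captured (%33, %34) three times over and
// now takes just (%45, %33, %34).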
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%0] | |
%10 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%1] | |
%11 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%0] | |
%12 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%1] | |
%13 = flow.tensor.splat %c0_i8 : tensor<?x?xi8>{%11, %12} | |
%14 = flow.dispatch.workgroups(%2, %13, %0, %1, %11, %12) : (tensor<?x?xi8>{%0, %1}, tensor<?x?xi8>{%11, %12}, index, index, index, index) -> %13{%11, %12} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xi8>>, %arg4: !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { | |
%54 = flow.dispatch.tensor.load %arg3, offsets = [0, 0], sizes = [%arg5, %arg6], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%arg5, %arg6} -> tensor<?x?xi8> | |
flow.dispatch.tensor.store %54, %arg4, offsets = [0, 0], sizes = [%arg5, %arg6], strides = [1, 1] : tensor<?x?xi8> -> !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>{%arg7, %arg8} | |
flow.return | |
} | |
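// NOTE: %13/%14 implement zero-padding of input0 as data movement: splat an i8
// zero tensor at the aligned sizes {%11 = ceil(M/128)*128, %12 = ceil(K/64)*64},
// then copy the original %0 x %1 input into its top-left corner through a
// readwrite dispatch.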
%15 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%0] | |
%16 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%1] | |
%17 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%0] | |
%18 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%1] | |
%19 = flow.tensor.reshape %14 : tensor<?x?xi8>{%11, %12} -> tensor<?x8x16x?x2x4x8xi8>{%17, %18} | |
%20 = flow.dispatch.workgroups(%19, %15, %16, %9, %10) : (tensor<?x8x16x?x2x4x8xi8>{%15, %16}, index, index, index, index) -> tensor<?x?x8x4x16x2x8xi8>{%9, %10} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x8x16x?x2x4x8xi8>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x16x2x8xi8>>) { | |
%54 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%arg4, 8, 16, %arg5, 2, 4, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x8x16x?x2x4x8xi8>>{%arg4, %arg5} -> tensor<?x8x16x?x2x4x8xi8> | |
%55 = tensor.empty(%arg6, %arg7) : tensor<?x?x8x4x16x2x8xi8> | |
%56 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d2, d3, d1, d4, d5, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d5, d3, d4, d6)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%54 : tensor<?x8x16x?x2x4x8xi8>) outs(%55 : tensor<?x?x8x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x8x4x16x2x8xi8> | |
flow.dispatch.tensor.store %56, %arg8, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%arg6, %arg7, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x16x2x8xi8> -> !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x16x2x8xi8>>{%arg6, %arg7} | |
flow.return | |
} | |
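// NOTE: %19/%20 look like a tensor pack decomposed into reshape + transpose:
// the flow.tensor.reshape splits the padded LHS into 8x16 tile factors along M
// and 2x4x8 along K, and the permuting linalg.generic lays the tiles out in the
// ?x?x8x4x16x2x8 order the multi_mma expects for its LHS.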
%21 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%4] | |
%22 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%3] | |
%23 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%3] | |
%24 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%4] | |
%25 = flow.tensor.splat %c0_i8 : tensor<?x?xi8>{%23, %24} | |
%26 = flow.dispatch.workgroups(%5, %25, %3, %4, %23, %24) : (tensor<?x?xi8>{%3, %4}, tensor<?x?xi8>{%23, %24}, index, index, index, index) -> %25{%23, %24} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xi8>>, %arg4: !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { | |
%54 = flow.dispatch.tensor.load %arg3, offsets = [0, 0], sizes = [%arg5, %arg6], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%arg5, %arg6} -> tensor<?x?xi8> | |
flow.dispatch.tensor.store %54, %arg4, offsets = [0, 0], sizes = [%arg5, %arg6], strides = [1, 1] : tensor<?x?xi8> -> !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>{%arg7, %arg8} | |
flow.return | |
} | |
%27 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%3] | |
%28 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%4] | |
%29 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%3] | |
%30 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%4] | |
%31 = flow.tensor.reshape %26 : tensor<?x?xi8>{%23, %24} -> tensor<?x2x4x8x?x4x2x16xi8>{%29, %30} | |
%32 = flow.dispatch.workgroups(%31, %27, %28, %21, %22) : (tensor<?x2x4x8x?x4x2x16xi8>{%27, %28}, index, index, index, index) -> tensor<?x?x4x2x4x16x2x8xi8>{%21, %22} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x2x4x8x?x4x2x16xi8>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<?x?x4x2x4x16x2x8xi8>>) { | |
%54 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%arg4, 2, 4, 8, %arg5, 4, 2, 16], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x2x4x8x?x4x2x16xi8>>{%arg4, %arg5} -> tensor<?x2x4x8x?x4x2x16xi8> | |
%55 = tensor.empty(%arg6, %arg7) : tensor<?x?x4x2x4x16x2x8xi8> | |
%56 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d1, d5, d6, d7, d0, d2, d3, d4)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d6, d4, d5, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%54 : tensor<?x2x4x8x?x4x2x16xi8>) outs(%55 : tensor<?x?x4x2x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x4x2x4x16x2x8xi8> | |
flow.dispatch.tensor.store %56, %arg8, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%arg6, %arg7, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x4x2x4x16x2x8xi8> -> !flow.dispatch.tensor<writeonly:tensor<?x?x4x2x4x16x2x8xi8>>{%arg6, %arg7} | |
flow.return | |
} | |
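// NOTE: the same reshape + transpose packing is applied to the RHS here, with
// 2x4x8 tile factors along K and 4x2x16 along N (a 64x128 tile), and is
// repeated once more below (%43/%44) for the 128x128 i32 accumulator.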
%33 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%6] | |
%34 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%7] | |
%35 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%6] | |
%36 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%7] | |
%37 = flow.tensor.splat %c0_i32 : tensor<?x?xi32>{%35, %36} | |
%38 = flow.dispatch.workgroups(%8, %37, %6, %7, %35, %36) : (tensor<?x?xi32>{%6, %7}, tensor<?x?xi32>{%35, %36}, index, index, index, index) -> %37{%35, %36} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xi32>>, %arg4: !flow.dispatch.tensor<readwrite:tensor<?x?xi32>>, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { | |
%54 = flow.dispatch.tensor.load %arg3, offsets = [0, 0], sizes = [%arg5, %arg6], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%arg5, %arg6} -> tensor<?x?xi32> | |
flow.dispatch.tensor.store %54, %arg4, offsets = [0, 0], sizes = [%arg5, %arg6], strides = [1, 1] : tensor<?x?xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?xi32>>{%arg7, %arg8} | |
flow.return | |
} | |
%39 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%6] | |
%40 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%7] | |
%41 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%6] | |
%42 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%7] | |
%43 = flow.tensor.reshape %38 : tensor<?x?xi32>{%35, %36} -> tensor<?x8x4x4x?x4x2x16xi32>{%41, %42} | |
%44 = flow.dispatch.workgroups(%43, %39, %40, %33, %34) : (tensor<?x8x4x4x?x4x2x16xi32>{%39, %40}, index, index, index, index) -> tensor<?x?x8x4x2x4x16x4xi32>{%33, %34} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x8x4x4x?x4x2x16xi32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x2x4x16x4xi32>>) { | |
%54 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%arg4, 8, 4, 4, %arg5, 4, 2, 16], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x8x4x4x?x4x2x16xi32>>{%arg4, %arg5} -> tensor<?x8x4x4x?x4x2x16xi32> | |
%55 = tensor.empty(%arg6, %arg7) : tensor<?x?x8x4x2x4x16x4xi32> | |
%56 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d2, d3, d4, d1, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%54 : tensor<?x8x4x4x?x4x2x16xi32>) outs(%55 : tensor<?x?x8x4x2x4x16x4xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x2x4x16x4xi32> | |
flow.dispatch.tensor.store %56, %arg8, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%arg6, %arg7, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x2x4x16x4xi32>>{%arg6, %arg7} | |
flow.return | |
} | |
%45 = flow.dispatch.workgroups(%20, %32, %44, %9, %10, %21, %22, %33, %34) : (tensor<?x?x8x4x16x2x8xi8>{%9, %10}, tensor<?x?x4x2x4x16x2x8xi8>{%21, %22}, tensor<?x?x8x4x2x4x16x4xi32>{%33, %34}, index, index, index, index, index, index) -> %44{%33, %34} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>, %arg4: !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>, %arg5: !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: index, %arg11: index) { | |
%54 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%arg6, %arg7, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%arg6, %arg7} -> tensor<?x?x8x4x16x2x8xi8> | |
%55 = flow.dispatch.tensor.load %arg4, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%arg8, %arg9, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%arg8, %arg9} -> tensor<?x?x4x2x4x16x2x8xi8> | |
%56 = flow.dispatch.tensor.load %arg5, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%arg10, %arg11, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%arg10, %arg11} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%57 = iree_gpu.multi_mma %54, %55, %56 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
flow.dispatch.tensor.store %57, %arg5, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%arg10, %arg11, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%arg10, %arg11} | |
flow.return | |
} | |
%46 = flow.dispatch.workgroups(%45, %33, %34) : (tensor<?x?x8x4x2x4x16x4xi32>{%33, %34}, index, index) -> tensor<?x8x4x4x?x4x2x16xi32>{%33, %34} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?x8x4x2x4x16x4xi32>>, %arg4: index, %arg5: index, %arg6: !flow.dispatch.tensor<writeonly:tensor<?x8x4x4x?x4x2x16xi32>>) { | |
%54 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%arg4, %arg5, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x2x4x16x4xi32>>{%arg4, %arg5} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%55 = tensor.empty(%arg4, %arg5) : tensor<?x8x4x4x?x4x2x16xi32> | |
%56 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d2, d3, d4, d1, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%54 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%55 : tensor<?x8x4x4x?x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x8x4x4x?x4x2x16xi32> | |
flow.dispatch.tensor.store %56, %arg6, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%arg4, 8, 4, 4, %arg5, 4, 2, 16], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x8x4x4x?x4x2x16xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x8x4x4x?x4x2x16xi32>>{%arg4, %arg5} | |
flow.return | |
} | |
%47 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%6] | |
%48 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%7] | |
%49 = flow.tensor.reshape %46 : tensor<?x8x4x4x?x4x2x16xi32>{%33, %34} -> tensor<?x?xi32>{%47, %48} | |
%50 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%6] | |
%51 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%7] | |
%52 = flow.dispatch.workgroups(%49, %50, %51, %6, %7) : (tensor<?x?xi32>{%50, %51}, index, index, index, index) -> tensor<?x?xi32>{%6, %7} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xi32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>) { | |
%54 = flow.dispatch.tensor.load %arg3, offsets = [0, 0], sizes = [%arg6, %arg7], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%arg4, %arg5} -> tensor<?x?xi32> | |
flow.dispatch.tensor.store %54, %arg8, offsets = [0, 0], sizes = [%arg6, %arg7], strides = [1, 1] : tensor<?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>{%arg6, %arg7} | |
flow.return | |
} | |
%53 = hal.tensor.export %52 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %53 : !hal.buffer_view | |
} | |
// -----// IR Dump After MaterializeDefaultWorkgroupCountRegionPass (iree-dispatch-creation-materialize-default-workgroup-count-region) //----- // | |
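// NOTE: this pass gives every flow.dispatch.workgroups op an explicit workload
// list (the [...] before the operand list) and a trailing count region mapping
// that workload to an (x, y, z) workgroup grid; captured index operands are
// annotated inside each region with flow.dispatch.workload.ordinal so they can
// be matched back to the workload.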
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%0] | |
%10 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%1] | |
%11 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%0] | |
%12 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%1] | |
%13 = flow.tensor.splat %c0_i8 : tensor<?x?xi8>{%11, %12} | |
%14 = flow.dispatch.workgroups[%0, %1, %11, %12](%2, %13, %0, %1, %11, %12) : (tensor<?x?xi8>{%0, %1}, tensor<?x?xi8>{%11, %12}, index, index, index, index) -> %13{%11, %12} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xi8>>, %arg4: !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { | |
%54 = flow.dispatch.workload.ordinal %arg5, 0 : index | |
%55 = flow.dispatch.workload.ordinal %arg6, 1 : index | |
%56 = flow.dispatch.workload.ordinal %arg7, 2 : index | |
%57 = flow.dispatch.workload.ordinal %arg8, 3 : index | |
%58 = flow.dispatch.tensor.load %arg3, offsets = [0, 0], sizes = [%54, %55], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%54, %55} -> tensor<?x?xi8> | |
flow.dispatch.tensor.store %58, %arg4, offsets = [0, 0], sizes = [%54, %55], strides = [1, 1] : tensor<?x?xi8> -> !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>{%56, %57} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
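// NOTE: flow.dispatch.workgroup_count_from_slice appears to be a placeholder at
// this phase; the concrete grid arithmetic is expected to be filled in later,
// once codegen has picked tile sizes for this dispatch.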
%15 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%0] | |
%16 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%1] | |
%17 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%0] | |
%18 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%1] | |
%19 = flow.tensor.reshape %14 : tensor<?x?xi8>{%11, %12} -> tensor<?x8x16x?x2x4x8xi8>{%17, %18} | |
%20 = flow.dispatch.workgroups[%15, %16, %9, %10](%19, %15, %16, %9, %10) : (tensor<?x8x16x?x2x4x8xi8>{%15, %16}, index, index, index, index) -> tensor<?x?x8x4x16x2x8xi8>{%9, %10} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x8x16x?x2x4x8xi8>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x16x2x8xi8>>) { | |
%54 = flow.dispatch.workload.ordinal %arg4, 0 : index | |
%55 = flow.dispatch.workload.ordinal %arg5, 1 : index | |
%56 = flow.dispatch.workload.ordinal %arg6, 2 : index | |
%57 = flow.dispatch.workload.ordinal %arg7, 3 : index | |
%58 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%54, 8, 16, %55, 2, 4, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x8x16x?x2x4x8xi8>>{%54, %55} -> tensor<?x8x16x?x2x4x8xi8> | |
%59 = tensor.empty(%56, %57) : tensor<?x?x8x4x16x2x8xi8> | |
%60 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d2, d3, d1, d4, d5, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d5, d3, d4, d6)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%58 : tensor<?x8x16x?x2x4x8xi8>) outs(%59 : tensor<?x?x8x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x8x4x16x2x8xi8> | |
flow.dispatch.tensor.store %60, %arg8, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%56, %57, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x16x2x8xi8> -> !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x16x2x8xi8>>{%56, %57} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%21 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%4] | |
%22 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%3] | |
%23 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%3] | |
%24 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%4] | |
%25 = flow.tensor.splat %c0_i8 : tensor<?x?xi8>{%23, %24} | |
%26 = flow.dispatch.workgroups[%3, %4, %23, %24](%5, %25, %3, %4, %23, %24) : (tensor<?x?xi8>{%3, %4}, tensor<?x?xi8>{%23, %24}, index, index, index, index) -> %25{%23, %24} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xi8>>, %arg4: !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { | |
%54 = flow.dispatch.workload.ordinal %arg5, 0 : index | |
%55 = flow.dispatch.workload.ordinal %arg6, 1 : index | |
%56 = flow.dispatch.workload.ordinal %arg7, 2 : index | |
%57 = flow.dispatch.workload.ordinal %arg8, 3 : index | |
%58 = flow.dispatch.tensor.load %arg3, offsets = [0, 0], sizes = [%54, %55], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%54, %55} -> tensor<?x?xi8> | |
flow.dispatch.tensor.store %58, %arg4, offsets = [0, 0], sizes = [%54, %55], strides = [1, 1] : tensor<?x?xi8> -> !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>{%56, %57} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%27 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%3] | |
%28 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%4] | |
%29 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%3] | |
%30 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%4] | |
%31 = flow.tensor.reshape %26 : tensor<?x?xi8>{%23, %24} -> tensor<?x2x4x8x?x4x2x16xi8>{%29, %30} | |
%32 = flow.dispatch.workgroups[%27, %28, %21, %22](%31, %27, %28, %21, %22) : (tensor<?x2x4x8x?x4x2x16xi8>{%27, %28}, index, index, index, index) -> tensor<?x?x4x2x4x16x2x8xi8>{%21, %22} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x2x4x8x?x4x2x16xi8>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<?x?x4x2x4x16x2x8xi8>>) { | |
%54 = flow.dispatch.workload.ordinal %arg4, 0 : index | |
%55 = flow.dispatch.workload.ordinal %arg5, 1 : index | |
%56 = flow.dispatch.workload.ordinal %arg6, 2 : index | |
%57 = flow.dispatch.workload.ordinal %arg7, 3 : index | |
%58 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%54, 2, 4, 8, %55, 4, 2, 16], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x2x4x8x?x4x2x16xi8>>{%54, %55} -> tensor<?x2x4x8x?x4x2x16xi8> | |
%59 = tensor.empty(%56, %57) : tensor<?x?x4x2x4x16x2x8xi8> | |
%60 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d1, d5, d6, d7, d0, d2, d3, d4)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d6, d4, d5, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%58 : tensor<?x2x4x8x?x4x2x16xi8>) outs(%59 : tensor<?x?x4x2x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x4x2x4x16x2x8xi8> | |
flow.dispatch.tensor.store %60, %arg8, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%56, %57, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x4x2x4x16x2x8xi8> -> !flow.dispatch.tensor<writeonly:tensor<?x?x4x2x4x16x2x8xi8>>{%56, %57} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%33 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%6] | |
%34 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%7] | |
%35 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%6] | |
%36 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%7] | |
%37 = flow.tensor.splat %c0_i32 : tensor<?x?xi32>{%35, %36} | |
%38 = flow.dispatch.workgroups[%6, %7, %35, %36](%8, %37, %6, %7, %35, %36) : (tensor<?x?xi32>{%6, %7}, tensor<?x?xi32>{%35, %36}, index, index, index, index) -> %37{%35, %36} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xi32>>, %arg4: !flow.dispatch.tensor<readwrite:tensor<?x?xi32>>, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { | |
%54 = flow.dispatch.workload.ordinal %arg5, 0 : index | |
%55 = flow.dispatch.workload.ordinal %arg6, 1 : index | |
%56 = flow.dispatch.workload.ordinal %arg7, 2 : index | |
%57 = flow.dispatch.workload.ordinal %arg8, 3 : index | |
%58 = flow.dispatch.tensor.load %arg3, offsets = [0, 0], sizes = [%54, %55], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%54, %55} -> tensor<?x?xi32> | |
flow.dispatch.tensor.store %58, %arg4, offsets = [0, 0], sizes = [%54, %55], strides = [1, 1] : tensor<?x?xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?xi32>>{%56, %57} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%39 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%6] | |
%40 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%7] | |
%41 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%6] | |
%42 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%7] | |
%43 = flow.tensor.reshape %38 : tensor<?x?xi32>{%35, %36} -> tensor<?x8x4x4x?x4x2x16xi32>{%41, %42} | |
%44 = flow.dispatch.workgroups[%39, %40, %33, %34](%43, %39, %40, %33, %34) : (tensor<?x8x4x4x?x4x2x16xi32>{%39, %40}, index, index, index, index) -> tensor<?x?x8x4x2x4x16x4xi32>{%33, %34} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x8x4x4x?x4x2x16xi32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x2x4x16x4xi32>>) { | |
%54 = flow.dispatch.workload.ordinal %arg4, 0 : index | |
%55 = flow.dispatch.workload.ordinal %arg5, 1 : index | |
%56 = flow.dispatch.workload.ordinal %arg6, 2 : index | |
%57 = flow.dispatch.workload.ordinal %arg7, 3 : index | |
%58 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%54, 8, 4, 4, %55, 4, 2, 16], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x8x4x4x?x4x2x16xi32>>{%54, %55} -> tensor<?x8x4x4x?x4x2x16xi32> | |
%59 = tensor.empty(%56, %57) : tensor<?x?x8x4x2x4x16x4xi32> | |
%60 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d2, d3, d4, d1, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%58 : tensor<?x8x4x4x?x4x2x16xi32>) outs(%59 : tensor<?x?x8x4x2x4x16x4xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x2x4x16x4xi32> | |
flow.dispatch.tensor.store %60, %arg8, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%56, %57, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x2x4x16x4xi32>>{%56, %57} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%45 = flow.dispatch.workgroups[%9, %10, %21, %22, %33, %34](%20, %32, %44, %9, %10, %21, %22, %33, %34) : (tensor<?x?x8x4x16x2x8xi8>{%9, %10}, tensor<?x?x4x2x4x16x2x8xi8>{%21, %22}, tensor<?x?x8x4x2x4x16x4xi32>{%33, %34}, index, index, index, index, index, index) -> %44{%33, %34} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>, %arg4: !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>, %arg5: !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: index, %arg11: index) { | |
%54 = flow.dispatch.workload.ordinal %arg6, 0 : index | |
%55 = flow.dispatch.workload.ordinal %arg7, 1 : index | |
%56 = flow.dispatch.workload.ordinal %arg8, 2 : index | |
%57 = flow.dispatch.workload.ordinal %arg9, 3 : index | |
%58 = flow.dispatch.workload.ordinal %arg10, 4 : index | |
%59 = flow.dispatch.workload.ordinal %arg11, 5 : index | |
%60 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%54, %55, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%54, %55} -> tensor<?x?x8x4x16x2x8xi8> | |
%61 = flow.dispatch.tensor.load %arg4, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%56, %57, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%56, %57} -> tensor<?x?x4x2x4x16x2x8xi8> | |
%62 = flow.dispatch.tensor.load %arg5, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%58, %59, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%58, %59} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%63 = iree_gpu.multi_mma %60, %61, %62 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
flow.dispatch.tensor.store %63, %arg5, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%58, %59, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%58, %59} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6, %arg7, %arg8 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%46 = flow.dispatch.workgroups[%33, %34](%45, %33, %34) : (tensor<?x?x8x4x2x4x16x4xi32>{%33, %34}, index, index) -> tensor<?x8x4x4x?x4x2x16xi32>{%33, %34} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?x8x4x2x4x16x4xi32>>, %arg4: index, %arg5: index, %arg6: !flow.dispatch.tensor<writeonly:tensor<?x8x4x4x?x4x2x16xi32>>) { | |
%54 = flow.dispatch.workload.ordinal %arg4, 0 : index | |
%55 = flow.dispatch.workload.ordinal %arg5, 1 : index | |
%56 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%54, %55, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x2x4x16x4xi32>>{%54, %55} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%57 = tensor.empty(%54, %55) : tensor<?x8x4x4x?x4x2x16xi32> | |
%58 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d2, d3, d4, d1, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%56 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%57 : tensor<?x8x4x4x?x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x8x4x4x?x4x2x16xi32> | |
flow.dispatch.tensor.store %58, %arg6, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%54, 8, 4, 4, %55, 4, 2, 16], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x8x4x4x?x4x2x16xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x8x4x4x?x4x2x16xi32>>{%54, %55} | |
flow.return | |
} count(%arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%47 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%6] | |
%48 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%7] | |
%49 = flow.tensor.reshape %46 : tensor<?x8x4x4x?x4x2x16xi32>{%33, %34} -> tensor<?x?xi32>{%47, %48} | |
%50 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%6] | |
%51 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%7] | |
%52 = flow.dispatch.workgroups[%50, %51, %6, %7](%49, %50, %51, %6, %7) : (tensor<?x?xi32>{%50, %51}, index, index, index, index) -> tensor<?x?xi32>{%6, %7} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xi32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>) { | |
%54 = flow.dispatch.workload.ordinal %arg4, 0 : index | |
%55 = flow.dispatch.workload.ordinal %arg5, 1 : index | |
%56 = flow.dispatch.workload.ordinal %arg6, 2 : index | |
%57 = flow.dispatch.workload.ordinal %arg7, 3 : index | |
%58 = flow.dispatch.tensor.load %arg3, offsets = [0, 0], sizes = [%56, %57], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%54, %55} -> tensor<?x?xi32> | |
flow.dispatch.tensor.store %58, %arg8, offsets = [0, 0], sizes = [%56, %57], strides = [1, 1] : tensor<?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>{%56, %57} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%53 = hal.tensor.export %52 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %53 : !hal.buffer_view | |
} | |
// -----// IR Dump After VerifyInputLegalityPass (iree-verify-input-legality) //----- // | |
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>, ukernels = "none"}> | |
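// NOTE: key fields of the target above: arch "gfx942", subgroup size 64, at
// most 1024 threads and 65536 bytes of workgroup memory per workgroup, and an
// mma list that includes MFMA_I32_16x16x32_I8 -- the intrinsic selected by the
// data_tiled_mma_layout on the multi_mma ops in this module; ukernels = "none",
// so no microkernel library is linked in.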
#map = affine_map<()[s0] -> (s0 ceildiv 128)> | |
#map1 = affine_map<()[s0] -> (s0 ceildiv 64)> | |
#map2 = affine_map<()[s0] -> ((s0 ceildiv 128) * 128)> | |
#map3 = affine_map<()[s0] -> ((s0 ceildiv 64) * 64)> | |
#map4 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d2, d3, d1, d4, d5, d6)> | |
#map5 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d5, d3, d4, d6)> | |
#map6 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d1, d5, d6, d7, d0, d2, d3, d4)> | |
#map7 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d6, d4, d5, d7)> | |
#map8 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d2, d3, d4, d1, d5, d6, d7)> | |
#map9 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)> | |
#map10 = affine_map<(d0, d1, d2) -> (d0, d2)> | |
#map11 = affine_map<(d0, d1, d2) -> (d1, d2)> | |
#map12 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
#device_target_hip = #hal.device.target<"hip", {legacy_sync}, [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%9 = affine.apply #map()[%0] | |
%10 = affine.apply #map1()[%1] | |
%11 = affine.apply #map2()[%0] | |
%12 = affine.apply #map3()[%1] | |
%13 = flow.tensor.splat %c0_i8 : tensor<?x?xi8>{%11, %12} | |
%14 = flow.dispatch.workgroups[%0, %1, %11, %12](%2, %13, %0, %1, %11, %12) : (tensor<?x?xi8>{%0, %1}, tensor<?x?xi8>{%11, %12}, index, index, index, index) -> %13{%11, %12} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xi8>>, %arg4: !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { | |
%54 = flow.dispatch.workload.ordinal %arg5, 0 : index | |
%55 = flow.dispatch.workload.ordinal %arg6, 1 : index | |
%56 = flow.dispatch.workload.ordinal %arg7, 2 : index | |
%57 = flow.dispatch.workload.ordinal %arg8, 3 : index | |
%58 = flow.dispatch.tensor.load %arg3, offsets = [0, 0], sizes = [%54, %55], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%54, %55} -> tensor<?x?xi8> | |
flow.dispatch.tensor.store %58, %arg4, offsets = [0, 0], sizes = [%54, %55], strides = [1, 1] : tensor<?x?xi8> -> !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>{%56, %57} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%15 = affine.apply #map()[%0] | |
%16 = affine.apply #map1()[%1] | |
%17 = affine.apply #map()[%0] | |
%18 = affine.apply #map1()[%1] | |
%19 = flow.tensor.reshape %14 : tensor<?x?xi8>{%11, %12} -> tensor<?x8x16x?x2x4x8xi8>{%17, %18} | |
%20 = flow.dispatch.workgroups[%15, %16, %9, %10](%19, %15, %16, %9, %10) : (tensor<?x8x16x?x2x4x8xi8>{%15, %16}, index, index, index, index) -> tensor<?x?x8x4x16x2x8xi8>{%9, %10} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x8x16x?x2x4x8xi8>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x16x2x8xi8>>) { | |
%54 = flow.dispatch.workload.ordinal %arg4, 0 : index | |
%55 = flow.dispatch.workload.ordinal %arg5, 1 : index | |
%56 = flow.dispatch.workload.ordinal %arg6, 2 : index | |
%57 = flow.dispatch.workload.ordinal %arg7, 3 : index | |
%58 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%54, 8, 16, %55, 2, 4, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x8x16x?x2x4x8xi8>>{%54, %55} -> tensor<?x8x16x?x2x4x8xi8> | |
%59 = tensor.empty(%56, %57) : tensor<?x?x8x4x16x2x8xi8> | |
%60 = linalg.generic {indexing_maps = [#map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%58 : tensor<?x8x16x?x2x4x8xi8>) outs(%59 : tensor<?x?x8x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x8x4x16x2x8xi8> | |
flow.dispatch.tensor.store %60, %arg8, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%56, %57, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x16x2x8xi8> -> !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x16x2x8xi8>>{%56, %57} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%21 = affine.apply #map()[%4] | |
%22 = affine.apply #map1()[%3] | |
%23 = affine.apply #map3()[%3] | |
%24 = affine.apply #map2()[%4] | |
%25 = flow.tensor.splat %c0_i8 : tensor<?x?xi8>{%23, %24} | |
%26 = flow.dispatch.workgroups[%3, %4, %23, %24](%5, %25, %3, %4, %23, %24) : (tensor<?x?xi8>{%3, %4}, tensor<?x?xi8>{%23, %24}, index, index, index, index) -> %25{%23, %24} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xi8>>, %arg4: !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { | |
%54 = flow.dispatch.workload.ordinal %arg5, 0 : index | |
%55 = flow.dispatch.workload.ordinal %arg6, 1 : index | |
%56 = flow.dispatch.workload.ordinal %arg7, 2 : index | |
%57 = flow.dispatch.workload.ordinal %arg8, 3 : index | |
%58 = flow.dispatch.tensor.load %arg3, offsets = [0, 0], sizes = [%54, %55], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%54, %55} -> tensor<?x?xi8> | |
flow.dispatch.tensor.store %58, %arg4, offsets = [0, 0], sizes = [%54, %55], strides = [1, 1] : tensor<?x?xi8> -> !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>{%56, %57} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%27 = affine.apply #map1()[%3] | |
%28 = affine.apply #map()[%4] | |
%29 = affine.apply #map1()[%3] | |
%30 = affine.apply #map()[%4] | |
%31 = flow.tensor.reshape %26 : tensor<?x?xi8>{%23, %24} -> tensor<?x2x4x8x?x4x2x16xi8>{%29, %30} | |
%32 = flow.dispatch.workgroups[%27, %28, %21, %22](%31, %27, %28, %21, %22) : (tensor<?x2x4x8x?x4x2x16xi8>{%27, %28}, index, index, index, index) -> tensor<?x?x4x2x4x16x2x8xi8>{%21, %22} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x2x4x8x?x4x2x16xi8>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<?x?x4x2x4x16x2x8xi8>>) { | |
%54 = flow.dispatch.workload.ordinal %arg4, 0 : index | |
%55 = flow.dispatch.workload.ordinal %arg5, 1 : index | |
%56 = flow.dispatch.workload.ordinal %arg6, 2 : index | |
%57 = flow.dispatch.workload.ordinal %arg7, 3 : index | |
%58 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%54, 2, 4, 8, %55, 4, 2, 16], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x2x4x8x?x4x2x16xi8>>{%54, %55} -> tensor<?x2x4x8x?x4x2x16xi8> | |
%59 = tensor.empty(%56, %57) : tensor<?x?x4x2x4x16x2x8xi8> | |
%60 = linalg.generic {indexing_maps = [#map6, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%58 : tensor<?x2x4x8x?x4x2x16xi8>) outs(%59 : tensor<?x?x4x2x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x4x2x4x16x2x8xi8> | |
flow.dispatch.tensor.store %60, %arg8, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%56, %57, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x4x2x4x16x2x8xi8> -> !flow.dispatch.tensor<writeonly:tensor<?x?x4x2x4x16x2x8xi8>>{%56, %57} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%33 = affine.apply #map()[%6] | |
%34 = affine.apply #map()[%7] | |
%35 = affine.apply #map2()[%6] | |
%36 = affine.apply #map2()[%7] | |
%37 = flow.tensor.splat %c0_i32 : tensor<?x?xi32>{%35, %36} | |
%38 = flow.dispatch.workgroups[%6, %7, %35, %36](%8, %37, %6, %7, %35, %36) : (tensor<?x?xi32>{%6, %7}, tensor<?x?xi32>{%35, %36}, index, index, index, index) -> %37{%35, %36} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xi32>>, %arg4: !flow.dispatch.tensor<readwrite:tensor<?x?xi32>>, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { | |
%54 = flow.dispatch.workload.ordinal %arg5, 0 : index | |
%55 = flow.dispatch.workload.ordinal %arg6, 1 : index | |
%56 = flow.dispatch.workload.ordinal %arg7, 2 : index | |
%57 = flow.dispatch.workload.ordinal %arg8, 3 : index | |
%58 = flow.dispatch.tensor.load %arg3, offsets = [0, 0], sizes = [%54, %55], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%54, %55} -> tensor<?x?xi32> | |
flow.dispatch.tensor.store %58, %arg4, offsets = [0, 0], sizes = [%54, %55], strides = [1, 1] : tensor<?x?xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?xi32>>{%56, %57} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%39 = affine.apply #map()[%6] | |
%40 = affine.apply #map()[%7] | |
%41 = affine.apply #map()[%6] | |
%42 = affine.apply #map()[%7] | |
%43 = flow.tensor.reshape %38 : tensor<?x?xi32>{%35, %36} -> tensor<?x8x4x4x?x4x2x16xi32>{%41, %42} | |
%44 = flow.dispatch.workgroups[%39, %40, %33, %34](%43, %39, %40, %33, %34) : (tensor<?x8x4x4x?x4x2x16xi32>{%39, %40}, index, index, index, index) -> tensor<?x?x8x4x2x4x16x4xi32>{%33, %34} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x8x4x4x?x4x2x16xi32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x2x4x16x4xi32>>) { | |
%54 = flow.dispatch.workload.ordinal %arg4, 0 : index | |
%55 = flow.dispatch.workload.ordinal %arg5, 1 : index | |
%56 = flow.dispatch.workload.ordinal %arg6, 2 : index | |
%57 = flow.dispatch.workload.ordinal %arg7, 3 : index | |
%58 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%54, 8, 4, 4, %55, 4, 2, 16], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x8x4x4x?x4x2x16xi32>>{%54, %55} -> tensor<?x8x4x4x?x4x2x16xi32> | |
%59 = tensor.empty(%56, %57) : tensor<?x?x8x4x2x4x16x4xi32> | |
%60 = linalg.generic {indexing_maps = [#map8, #map9], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%58 : tensor<?x8x4x4x?x4x2x16xi32>) outs(%59 : tensor<?x?x8x4x2x4x16x4xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x2x4x16x4xi32> | |
flow.dispatch.tensor.store %60, %arg8, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%56, %57, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x2x4x16x4xi32>>{%56, %57} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%45 = flow.dispatch.workgroups[%9, %10, %21, %22, %33, %34](%20, %32, %44, %9, %10, %21, %22, %33, %34) : (tensor<?x?x8x4x16x2x8xi8>{%9, %10}, tensor<?x?x4x2x4x16x2x8xi8>{%21, %22}, tensor<?x?x8x4x2x4x16x4xi32>{%33, %34}, index, index, index, index, index, index) -> %44{%33, %34} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>, %arg4: !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>, %arg5: !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: index, %arg11: index) { | |
%54 = flow.dispatch.workload.ordinal %arg6, 0 : index | |
%55 = flow.dispatch.workload.ordinal %arg7, 1 : index | |
%56 = flow.dispatch.workload.ordinal %arg8, 2 : index | |
%57 = flow.dispatch.workload.ordinal %arg9, 3 : index | |
%58 = flow.dispatch.workload.ordinal %arg10, 4 : index | |
%59 = flow.dispatch.workload.ordinal %arg11, 5 : index | |
%60 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%54, %55, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%54, %55} -> tensor<?x?x8x4x16x2x8xi8> | |
%61 = flow.dispatch.tensor.load %arg4, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%56, %57, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%56, %57} -> tensor<?x?x4x2x4x16x2x8xi8> | |
%62 = flow.dispatch.tensor.load %arg5, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%58, %59, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%58, %59} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%63 = iree_gpu.multi_mma %60, %61, %62 {indexing_maps = [#map10, #map11, #map12], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
flow.dispatch.tensor.store %63, %arg5, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%58, %59, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%58, %59} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6, %arg7, %arg8 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%46 = flow.dispatch.workgroups[%33, %34](%45, %33, %34) : (tensor<?x?x8x4x2x4x16x4xi32>{%33, %34}, index, index) -> tensor<?x8x4x4x?x4x2x16xi32>{%33, %34} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?x8x4x2x4x16x4xi32>>, %arg4: index, %arg5: index, %arg6: !flow.dispatch.tensor<writeonly:tensor<?x8x4x4x?x4x2x16xi32>>) { | |
%54 = flow.dispatch.workload.ordinal %arg4, 0 : index | |
%55 = flow.dispatch.workload.ordinal %arg5, 1 : index | |
%56 = flow.dispatch.tensor.load %arg3, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%54, %55, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x2x4x16x4xi32>>{%54, %55} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%57 = tensor.empty(%54, %55) : tensor<?x8x4x4x?x4x2x16xi32> | |
%58 = linalg.generic {indexing_maps = [#map9, #map8], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%56 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%57 : tensor<?x8x4x4x?x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x8x4x4x?x4x2x16xi32> | |
flow.dispatch.tensor.store %58, %arg6, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%54, 8, 4, 4, %55, 4, 2, 16], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x8x4x4x?x4x2x16xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x8x4x4x?x4x2x16xi32>>{%54, %55} | |
flow.return | |
} count(%arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%47 = affine.apply #map2()[%6] | |
%48 = affine.apply #map2()[%7] | |
%49 = flow.tensor.reshape %46 : tensor<?x8x4x4x?x4x2x16xi32>{%33, %34} -> tensor<?x?xi32>{%47, %48} | |
%50 = affine.apply #map2()[%6] | |
%51 = affine.apply #map2()[%7] | |
%52 = flow.dispatch.workgroups[%50, %51, %6, %7](%49, %50, %51, %6, %7) : (tensor<?x?xi32>{%50, %51}, index, index, index, index) -> tensor<?x?xi32>{%6, %7} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xi32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>) { | |
%54 = flow.dispatch.workload.ordinal %arg4, 0 : index | |
%55 = flow.dispatch.workload.ordinal %arg5, 1 : index | |
%56 = flow.dispatch.workload.ordinal %arg6, 2 : index | |
%57 = flow.dispatch.workload.ordinal %arg7, 3 : index | |
%58 = flow.dispatch.tensor.load %arg3, offsets = [0, 0], sizes = [%56, %57], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%54, %55} -> tensor<?x?xi32> | |
flow.dispatch.tensor.store %58, %arg8, offsets = [0, 0], sizes = [%56, %57], strides = [1, 1] : tensor<?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>{%56, %57} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%53 = hal.tensor.export %52 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %53 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After CaptureDynamicDimsPass (iree-flow-capture-dynamic-dims) //----- // | |
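// NOTE: from here on, each dispatch region opens with flow.dispatch.tie_shape
// ops that bind every dynamically shaped !flow.dispatch.tensor argument to the
// index arguments carrying its runtime dims; the loads and stores below now go
// through the tied values (%54, %55) instead of the raw block arguments.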
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%0] | |
%10 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%1] | |
%11 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%0] | |
%12 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%1] | |
%13 = flow.tensor.splat %c0_i8 : tensor<?x?xi8>{%11, %12} | |
%14 = flow.dispatch.workgroups[%0, %1, %11, %12](%2, %13, %0, %1, %11, %12) : (tensor<?x?xi8>{%0, %1}, tensor<?x?xi8>{%11, %12}, index, index, index, index) -> %13{%11, %12} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xi8>>, %arg4: !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { | |
%54 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%arg5, %arg6} | |
%55 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>{%arg7, %arg8} | |
%56 = flow.dispatch.workload.ordinal %arg5, 0 : index | |
%57 = flow.dispatch.workload.ordinal %arg6, 1 : index | |
%58 = flow.dispatch.workload.ordinal %arg7, 2 : index | |
%59 = flow.dispatch.workload.ordinal %arg8, 3 : index | |
%60 = flow.dispatch.tensor.load %54, offsets = [0, 0], sizes = [%56, %57], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%56, %57} -> tensor<?x?xi8> | |
flow.dispatch.tensor.store %60, %55, offsets = [0, 0], sizes = [%56, %57], strides = [1, 1] : tensor<?x?xi8> -> !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>{%58, %59} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%15 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%0] | |
%16 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%1] | |
%17 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%0] | |
%18 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%1] | |
%19 = flow.tensor.reshape %14 : tensor<?x?xi8>{%11, %12} -> tensor<?x8x16x?x2x4x8xi8>{%17, %18} | |
%20 = flow.dispatch.workgroups[%15, %16, %9, %10](%19, %15, %16, %9, %10, %17, %18) : (tensor<?x8x16x?x2x4x8xi8>{%15, %16}, index, index, index, index, index, index) -> tensor<?x?x8x4x16x2x8xi8>{%9, %10} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x8x16x?x2x4x8xi8>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x16x2x8xi8>>) { | |
%54 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x8x16x?x2x4x8xi8>>{%arg8, %arg9} | |
%55 = flow.dispatch.tie_shape %arg10 : !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x16x2x8xi8>>{%arg6, %arg7} | |
%56 = flow.dispatch.workload.ordinal %arg4, 0 : index | |
%57 = flow.dispatch.workload.ordinal %arg5, 1 : index | |
%58 = flow.dispatch.workload.ordinal %arg6, 2 : index | |
%59 = flow.dispatch.workload.ordinal %arg7, 3 : index | |
%60 = flow.dispatch.tensor.load %54, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%56, 8, 16, %57, 2, 4, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x8x16x?x2x4x8xi8>>{%56, %57} -> tensor<?x8x16x?x2x4x8xi8> | |
%61 = tensor.empty(%58, %59) : tensor<?x?x8x4x16x2x8xi8> | |
%62 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d2, d3, d1, d4, d5, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d5, d3, d4, d6)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%60 : tensor<?x8x16x?x2x4x8xi8>) outs(%61 : tensor<?x?x8x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x8x4x16x2x8xi8> | |
flow.dispatch.tensor.store %62, %55, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%58, %59, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x16x2x8xi8> -> !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x16x2x8xi8>>{%58, %59} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
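// Same padding step for the RHS: %23/%24 round its K and N sizes up to multiples of 64
// and 128, and the dispatch below copies input1 into the zero splat %25.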
%21 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%4] | |
%22 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%3] | |
%23 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%3] | |
%24 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%4] | |
%25 = flow.tensor.splat %c0_i8 : tensor<?x?xi8>{%23, %24} | |
%26 = flow.dispatch.workgroups[%3, %4, %23, %24](%5, %25, %3, %4, %23, %24) : (tensor<?x?xi8>{%3, %4}, tensor<?x?xi8>{%23, %24}, index, index, index, index) -> %25{%23, %24} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xi8>>, %arg4: !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { | |
%54 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%arg5, %arg6} | |
%55 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>{%arg7, %arg8} | |
%56 = flow.dispatch.workload.ordinal %arg5, 0 : index | |
%57 = flow.dispatch.workload.ordinal %arg6, 1 : index | |
%58 = flow.dispatch.workload.ordinal %arg7, 2 : index | |
%59 = flow.dispatch.workload.ordinal %arg8, 3 : index | |
%60 = flow.dispatch.tensor.load %54, offsets = [0, 0], sizes = [%56, %57], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%56, %57} -> tensor<?x?xi8> | |
flow.dispatch.tensor.store %60, %55, offsets = [0, 0], sizes = [%56, %57], strides = [1, 1] : tensor<?x?xi8> -> !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>{%58, %59} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
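// Pack of the padded RHS: %31 splits it into 2x4x8 (K) by 4x2x16 (N) tiles, and the
// dispatch below transposes them into the ?x?x4x2x4x16x2x8 layout. Note the swapped
// outer dims: the packed RHS is indexed N-major as {%21, %22}.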
%27 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%3] | |
%28 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%4] | |
%29 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%3] | |
%30 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%4] | |
%31 = flow.tensor.reshape %26 : tensor<?x?xi8>{%23, %24} -> tensor<?x2x4x8x?x4x2x16xi8>{%29, %30} | |
%32 = flow.dispatch.workgroups[%27, %28, %21, %22](%31, %27, %28, %21, %22, %29, %30) : (tensor<?x2x4x8x?x4x2x16xi8>{%27, %28}, index, index, index, index, index, index) -> tensor<?x?x4x2x4x16x2x8xi8>{%21, %22} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x2x4x8x?x4x2x16xi8>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<?x?x4x2x4x16x2x8xi8>>) { | |
%54 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x2x4x8x?x4x2x16xi8>>{%arg8, %arg9} | |
%55 = flow.dispatch.tie_shape %arg10 : !flow.dispatch.tensor<writeonly:tensor<?x?x4x2x4x16x2x8xi8>>{%arg6, %arg7} | |
%56 = flow.dispatch.workload.ordinal %arg4, 0 : index | |
%57 = flow.dispatch.workload.ordinal %arg5, 1 : index | |
%58 = flow.dispatch.workload.ordinal %arg6, 2 : index | |
%59 = flow.dispatch.workload.ordinal %arg7, 3 : index | |
%60 = flow.dispatch.tensor.load %54, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%56, 2, 4, 8, %57, 4, 2, 16], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x2x4x8x?x4x2x16xi8>>{%56, %57} -> tensor<?x2x4x8x?x4x2x16xi8> | |
%61 = tensor.empty(%58, %59) : tensor<?x?x4x2x4x16x2x8xi8> | |
%62 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d1, d5, d6, d7, d0, d2, d3, d4)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d6, d4, d5, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%60 : tensor<?x2x4x8x?x4x2x16xi8>) outs(%61 : tensor<?x?x4x2x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x4x2x4x16x2x8xi8> | |
flow.dispatch.tensor.store %62, %55, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%58, %59, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x4x2x4x16x2x8xi8> -> !flow.dispatch.tensor<writeonly:tensor<?x?x4x2x4x16x2x8xi8>>{%58, %59} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
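// The i32 accumulator gets the same treatment: both of its dimensions are rounded up to
// multiples of 128 (%35/%36), and input2 is copied into the zero splat %37.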
%33 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%6] | |
%34 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%7] | |
%35 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%6] | |
%36 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%7] | |
%37 = flow.tensor.splat %c0_i32 : tensor<?x?xi32>{%35, %36} | |
%38 = flow.dispatch.workgroups[%6, %7, %35, %36](%8, %37, %6, %7, %35, %36) : (tensor<?x?xi32>{%6, %7}, tensor<?x?xi32>{%35, %36}, index, index, index, index) -> %37{%35, %36} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xi32>>, %arg4: !flow.dispatch.tensor<readwrite:tensor<?x?xi32>>, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { | |
%54 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%arg5, %arg6} | |
%55 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<readwrite:tensor<?x?xi32>>{%arg7, %arg8} | |
%56 = flow.dispatch.workload.ordinal %arg5, 0 : index | |
%57 = flow.dispatch.workload.ordinal %arg6, 1 : index | |
%58 = flow.dispatch.workload.ordinal %arg7, 2 : index | |
%59 = flow.dispatch.workload.ordinal %arg8, 3 : index | |
%60 = flow.dispatch.tensor.load %54, offsets = [0, 0], sizes = [%56, %57], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%56, %57} -> tensor<?x?xi32> | |
flow.dispatch.tensor.store %60, %55, offsets = [0, 0], sizes = [%56, %57], strides = [1, 1] : tensor<?x?xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?xi32>>{%58, %59} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
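// Accumulator pack: 8x4x4 (M) by 4x2x16 (N) tiles, transposed into the
// ?x?x8x4x2x4x16x4xi32 layout that multi_mma accumulates into in place.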
%39 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%6] | |
%40 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%7] | |
%41 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%6] | |
%42 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%7] | |
%43 = flow.tensor.reshape %38 : tensor<?x?xi32>{%35, %36} -> tensor<?x8x4x4x?x4x2x16xi32>{%41, %42} | |
%44 = flow.dispatch.workgroups[%39, %40, %33, %34](%43, %39, %40, %33, %34, %41, %42) : (tensor<?x8x4x4x?x4x2x16xi32>{%39, %40}, index, index, index, index, index, index) -> tensor<?x?x8x4x2x4x16x4xi32>{%33, %34} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x8x4x4x?x4x2x16xi32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x2x4x16x4xi32>>) { | |
%54 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x8x4x4x?x4x2x16xi32>>{%arg8, %arg9} | |
%55 = flow.dispatch.tie_shape %arg10 : !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x2x4x16x4xi32>>{%arg6, %arg7} | |
%56 = flow.dispatch.workload.ordinal %arg4, 0 : index | |
%57 = flow.dispatch.workload.ordinal %arg5, 1 : index | |
%58 = flow.dispatch.workload.ordinal %arg6, 2 : index | |
%59 = flow.dispatch.workload.ordinal %arg7, 3 : index | |
%60 = flow.dispatch.tensor.load %54, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%56, 8, 4, 4, %57, 4, 2, 16], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x8x4x4x?x4x2x16xi32>>{%56, %57} -> tensor<?x8x4x4x?x4x2x16xi32> | |
%61 = tensor.empty(%58, %59) : tensor<?x?x8x4x2x4x16x4xi32> | |
%62 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d2, d3, d4, d1, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%60 : tensor<?x8x4x4x?x4x2x16xi32>) outs(%61 : tensor<?x?x8x4x2x4x16x4xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x2x4x16x4xi32> | |
flow.dispatch.tensor.store %62, %55, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%58, %59, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x2x4x16x4xi32>>{%58, %59} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
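// The actual matmul: iree_gpu.multi_mma below runs the MFMA_I32_16x16x32_I8 intrinsic
// over the outer tile dims (parallel M, parallel N, reduction K) with unroll_m = 8,
// unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2, reading and writing the packed
// accumulator through a readwrite binding.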
%45 = flow.dispatch.workgroups[%9, %10, %21, %22, %33, %34](%20, %32, %44, %9, %10, %21, %22, %33, %34) : (tensor<?x?x8x4x16x2x8xi8>{%9, %10}, tensor<?x?x4x2x4x16x2x8xi8>{%21, %22}, tensor<?x?x8x4x2x4x16x4xi32>{%33, %34}, index, index, index, index, index, index) -> %44{%33, %34} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>, %arg4: !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>, %arg5: !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: index, %arg11: index) { | |
%54 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%arg6, %arg7} | |
%55 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%arg8, %arg9} | |
%56 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%arg10, %arg11} | |
%57 = flow.dispatch.workload.ordinal %arg6, 0 : index | |
%58 = flow.dispatch.workload.ordinal %arg7, 1 : index | |
%59 = flow.dispatch.workload.ordinal %arg8, 2 : index | |
%60 = flow.dispatch.workload.ordinal %arg9, 3 : index | |
%61 = flow.dispatch.workload.ordinal %arg10, 4 : index | |
%62 = flow.dispatch.workload.ordinal %arg11, 5 : index | |
%63 = flow.dispatch.tensor.load %54, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%57, %58, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%57, %58} -> tensor<?x?x8x4x16x2x8xi8> | |
%64 = flow.dispatch.tensor.load %55, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%59, %60, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%59, %60} -> tensor<?x?x4x2x4x16x2x8xi8> | |
%65 = flow.dispatch.tensor.load %56, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%61, %62, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%61, %62} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%66 = iree_gpu.multi_mma %63, %64, %65 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
flow.dispatch.tensor.store %66, %56, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%61, %62, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%61, %62} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6, %arg7, %arg8 | |
flow.return %x, %y, %z : index, index, index | |
} | |
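// Unpack: the inverse transpose of the accumulator pack, restoring the
// ?x8x4x4x?x4x2x16 tile order before the tensor is reshaped back to 2-D.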
%46 = flow.dispatch.workgroups[%33, %34](%45, %33, %34) : (tensor<?x?x8x4x2x4x16x4xi32>{%33, %34}, index, index) -> tensor<?x8x4x4x?x4x2x16xi32>{%33, %34} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?x8x4x2x4x16x4xi32>>, %arg4: index, %arg5: index, %arg6: !flow.dispatch.tensor<writeonly:tensor<?x8x4x4x?x4x2x16xi32>>) { | |
%54 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x2x4x16x4xi32>>{%arg4, %arg5} | |
%55 = flow.dispatch.tie_shape %arg6 : !flow.dispatch.tensor<writeonly:tensor<?x8x4x4x?x4x2x16xi32>>{%arg4, %arg5} | |
%56 = flow.dispatch.workload.ordinal %arg4, 0 : index | |
%57 = flow.dispatch.workload.ordinal %arg5, 1 : index | |
%58 = flow.dispatch.tensor.load %54, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%56, %57, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x2x4x16x4xi32>>{%56, %57} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%59 = tensor.empty(%56, %57) : tensor<?x8x4x4x?x4x2x16xi32> | |
%60 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d2, d3, d4, d1, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%58 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%59 : tensor<?x8x4x4x?x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x8x4x4x?x4x2x16xi32> | |
flow.dispatch.tensor.store %60, %55, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%56, 8, 4, 4, %57, 4, 2, 16], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x8x4x4x?x4x2x16xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x8x4x4x?x4x2x16xi32>>{%56, %57} | |
flow.return | |
} count(%arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4 | |
flow.return %x, %y, %z : index, index, index | |
} | |
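// Finally, %49 flattens the tiles back to the padded ?x? shape and the dispatch below
// extracts the top-left %6 x %7 slice, discarding the zero padding.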
%47 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%6] | |
%48 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%7] | |
%49 = flow.tensor.reshape %46 : tensor<?x8x4x4x?x4x2x16xi32>{%33, %34} -> tensor<?x?xi32>{%47, %48} | |
%50 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%6] | |
%51 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%7] | |
%52 = flow.dispatch.workgroups[%50, %51, %6, %7](%49, %50, %51, %6, %7, %47, %48) : (tensor<?x?xi32>{%50, %51}, index, index, index, index, index, index) -> tensor<?x?xi32>{%6, %7} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xi32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>) { | |
%54 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%arg8, %arg9} | |
%55 = flow.dispatch.tie_shape %arg10 : !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>{%arg6, %arg7} | |
%56 = flow.dispatch.workload.ordinal %arg4, 0 : index | |
%57 = flow.dispatch.workload.ordinal %arg5, 1 : index | |
%58 = flow.dispatch.workload.ordinal %arg6, 2 : index | |
%59 = flow.dispatch.workload.ordinal %arg7, 3 : index | |
%60 = flow.dispatch.tensor.load %54, offsets = [0, 0], sizes = [%58, %59], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%56, %57} -> tensor<?x?xi32> | |
flow.dispatch.tensor.store %60, %55, offsets = [0, 0], sizes = [%58, %59], strides = [1, 1] : tensor<?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>{%58, %59} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%53 = hal.tensor.export %52 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %53 : !hal.buffer_view | |
} | |
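// The canonicalizer run below leaves the dispatch structure intact; it mainly hoists
// flow.dispatch.workload.ordinal ops ahead of the tie_shape ops that use them and drops
// now-dead captured index operands (compare the operand list of the final copy dispatch).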
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- // | |
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%0] | |
%10 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%1] | |
%11 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%0] | |
%12 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%1] | |
%13 = flow.tensor.splat %c0_i8 : tensor<?x?xi8>{%11, %12} | |
%14 = flow.dispatch.workgroups[%0, %1, %11, %12](%2, %13, %0, %1, %11, %12) : (tensor<?x?xi8>{%0, %1}, tensor<?x?xi8>{%11, %12}, index, index, index, index) -> %13{%11, %12} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xi8>>, %arg4: !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { | |
%54 = flow.dispatch.workload.ordinal %arg5, 0 : index | |
%55 = flow.dispatch.workload.ordinal %arg6, 1 : index | |
%56 = flow.dispatch.workload.ordinal %arg7, 2 : index | |
%57 = flow.dispatch.workload.ordinal %arg8, 3 : index | |
%58 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%54, %55} | |
%59 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>{%56, %57} | |
%60 = flow.dispatch.tensor.load %58, offsets = [0, 0], sizes = [%54, %55], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%54, %55} -> tensor<?x?xi8> | |
flow.dispatch.tensor.store %60, %59, offsets = [0, 0], sizes = [%54, %55], strides = [1, 1] : tensor<?x?xi8> -> !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>{%56, %57} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%15 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%0] | |
%16 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%1] | |
%17 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%0] | |
%18 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%1] | |
%19 = flow.tensor.reshape %14 : tensor<?x?xi8>{%11, %12} -> tensor<?x8x16x?x2x4x8xi8>{%17, %18} | |
%20 = flow.dispatch.workgroups[%15, %16, %9, %10](%19, %15, %16, %9, %10, %17, %18) : (tensor<?x8x16x?x2x4x8xi8>{%15, %16}, index, index, index, index, index, index) -> tensor<?x?x8x4x16x2x8xi8>{%9, %10} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x8x16x?x2x4x8xi8>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x16x2x8xi8>>) { | |
%54 = flow.dispatch.workload.ordinal %arg6, 2 : index | |
%55 = flow.dispatch.workload.ordinal %arg7, 3 : index | |
%56 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x8x16x?x2x4x8xi8>>{%arg8, %arg9} | |
%57 = flow.dispatch.tie_shape %arg10 : !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x16x2x8xi8>>{%54, %55} | |
%58 = flow.dispatch.workload.ordinal %arg4, 0 : index | |
%59 = flow.dispatch.workload.ordinal %arg5, 1 : index | |
%60 = flow.dispatch.tensor.load %56, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%58, 8, 16, %59, 2, 4, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x8x16x?x2x4x8xi8>>{%arg8, %arg9} -> tensor<?x8x16x?x2x4x8xi8> | |
%61 = tensor.empty(%54, %55) : tensor<?x?x8x4x16x2x8xi8> | |
%62 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d2, d3, d1, d4, d5, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d5, d3, d4, d6)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%60 : tensor<?x8x16x?x2x4x8xi8>) outs(%61 : tensor<?x?x8x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x8x4x16x2x8xi8> | |
flow.dispatch.tensor.store %62, %57, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%54, %55, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x16x2x8xi8> -> !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x16x2x8xi8>>{%54, %55} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%21 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%4] | |
%22 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%3] | |
%23 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%3] | |
%24 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%4] | |
%25 = flow.tensor.splat %c0_i8 : tensor<?x?xi8>{%23, %24} | |
%26 = flow.dispatch.workgroups[%3, %4, %23, %24](%5, %25, %3, %4, %23, %24) : (tensor<?x?xi8>{%3, %4}, tensor<?x?xi8>{%23, %24}, index, index, index, index) -> %25{%23, %24} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xi8>>, %arg4: !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { | |
%54 = flow.dispatch.workload.ordinal %arg5, 0 : index | |
%55 = flow.dispatch.workload.ordinal %arg6, 1 : index | |
%56 = flow.dispatch.workload.ordinal %arg7, 2 : index | |
%57 = flow.dispatch.workload.ordinal %arg8, 3 : index | |
%58 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%54, %55} | |
%59 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>{%56, %57} | |
%60 = flow.dispatch.tensor.load %58, offsets = [0, 0], sizes = [%54, %55], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%54, %55} -> tensor<?x?xi8> | |
flow.dispatch.tensor.store %60, %59, offsets = [0, 0], sizes = [%54, %55], strides = [1, 1] : tensor<?x?xi8> -> !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>{%56, %57} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%27 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%3] | |
%28 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%4] | |
%29 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%3] | |
%30 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%4] | |
%31 = flow.tensor.reshape %26 : tensor<?x?xi8>{%23, %24} -> tensor<?x2x4x8x?x4x2x16xi8>{%29, %30} | |
%32 = flow.dispatch.workgroups[%27, %28, %21, %22](%31, %27, %28, %21, %22, %29, %30) : (tensor<?x2x4x8x?x4x2x16xi8>{%27, %28}, index, index, index, index, index, index) -> tensor<?x?x4x2x4x16x2x8xi8>{%21, %22} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x2x4x8x?x4x2x16xi8>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<?x?x4x2x4x16x2x8xi8>>) { | |
%54 = flow.dispatch.workload.ordinal %arg6, 2 : index | |
%55 = flow.dispatch.workload.ordinal %arg7, 3 : index | |
%56 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x2x4x8x?x4x2x16xi8>>{%arg8, %arg9} | |
%57 = flow.dispatch.tie_shape %arg10 : !flow.dispatch.tensor<writeonly:tensor<?x?x4x2x4x16x2x8xi8>>{%54, %55} | |
%58 = flow.dispatch.workload.ordinal %arg4, 0 : index | |
%59 = flow.dispatch.workload.ordinal %arg5, 1 : index | |
%60 = flow.dispatch.tensor.load %56, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%58, 2, 4, 8, %59, 4, 2, 16], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x2x4x8x?x4x2x16xi8>>{%arg8, %arg9} -> tensor<?x2x4x8x?x4x2x16xi8> | |
%61 = tensor.empty(%54, %55) : tensor<?x?x4x2x4x16x2x8xi8> | |
%62 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d1, d5, d6, d7, d0, d2, d3, d4)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d6, d4, d5, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%60 : tensor<?x2x4x8x?x4x2x16xi8>) outs(%61 : tensor<?x?x4x2x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x4x2x4x16x2x8xi8> | |
flow.dispatch.tensor.store %62, %57, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%54, %55, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x4x2x4x16x2x8xi8> -> !flow.dispatch.tensor<writeonly:tensor<?x?x4x2x4x16x2x8xi8>>{%54, %55} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%33 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%6] | |
%34 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%7] | |
%35 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%6] | |
%36 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%7] | |
%37 = flow.tensor.splat %c0_i32 : tensor<?x?xi32>{%35, %36} | |
%38 = flow.dispatch.workgroups[%6, %7, %35, %36](%8, %37, %6, %7, %35, %36) : (tensor<?x?xi32>{%6, %7}, tensor<?x?xi32>{%35, %36}, index, index, index, index) -> %37{%35, %36} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xi32>>, %arg4: !flow.dispatch.tensor<readwrite:tensor<?x?xi32>>, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { | |
%54 = flow.dispatch.workload.ordinal %arg5, 0 : index | |
%55 = flow.dispatch.workload.ordinal %arg6, 1 : index | |
%56 = flow.dispatch.workload.ordinal %arg7, 2 : index | |
%57 = flow.dispatch.workload.ordinal %arg8, 3 : index | |
%58 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%54, %55} | |
%59 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<readwrite:tensor<?x?xi32>>{%56, %57} | |
%60 = flow.dispatch.tensor.load %58, offsets = [0, 0], sizes = [%54, %55], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%54, %55} -> tensor<?x?xi32> | |
flow.dispatch.tensor.store %60, %59, offsets = [0, 0], sizes = [%54, %55], strides = [1, 1] : tensor<?x?xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?xi32>>{%56, %57} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%39 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%6] | |
%40 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%7] | |
%41 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%6] | |
%42 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%7] | |
%43 = flow.tensor.reshape %38 : tensor<?x?xi32>{%35, %36} -> tensor<?x8x4x4x?x4x2x16xi32>{%41, %42} | |
%44 = flow.dispatch.workgroups[%39, %40, %33, %34](%43, %39, %40, %33, %34, %41, %42) : (tensor<?x8x4x4x?x4x2x16xi32>{%39, %40}, index, index, index, index, index, index) -> tensor<?x?x8x4x2x4x16x4xi32>{%33, %34} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x8x4x4x?x4x2x16xi32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x2x4x16x4xi32>>) { | |
%54 = flow.dispatch.workload.ordinal %arg6, 2 : index | |
%55 = flow.dispatch.workload.ordinal %arg7, 3 : index | |
%56 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x8x4x4x?x4x2x16xi32>>{%arg8, %arg9} | |
%57 = flow.dispatch.tie_shape %arg10 : !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x2x4x16x4xi32>>{%54, %55} | |
%58 = flow.dispatch.workload.ordinal %arg4, 0 : index | |
%59 = flow.dispatch.workload.ordinal %arg5, 1 : index | |
%60 = flow.dispatch.tensor.load %56, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%58, 8, 4, 4, %59, 4, 2, 16], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x8x4x4x?x4x2x16xi32>>{%arg8, %arg9} -> tensor<?x8x4x4x?x4x2x16xi32> | |
%61 = tensor.empty(%54, %55) : tensor<?x?x8x4x2x4x16x4xi32> | |
%62 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d2, d3, d4, d1, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%60 : tensor<?x8x4x4x?x4x2x16xi32>) outs(%61 : tensor<?x?x8x4x2x4x16x4xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x2x4x16x4xi32> | |
flow.dispatch.tensor.store %62, %57, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%54, %55, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x2x4x16x4xi32>>{%54, %55} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%45 = flow.dispatch.workgroups[%9, %10, %21, %22, %33, %34](%20, %32, %44, %9, %10, %21, %22, %33, %34) : (tensor<?x?x8x4x16x2x8xi8>{%9, %10}, tensor<?x?x4x2x4x16x2x8xi8>{%21, %22}, tensor<?x?x8x4x2x4x16x4xi32>{%33, %34}, index, index, index, index, index, index) -> %44{%33, %34} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>, %arg4: !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>, %arg5: !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: index, %arg11: index) { | |
%54 = flow.dispatch.workload.ordinal %arg6, 0 : index | |
%55 = flow.dispatch.workload.ordinal %arg7, 1 : index | |
%56 = flow.dispatch.workload.ordinal %arg8, 2 : index | |
%57 = flow.dispatch.workload.ordinal %arg9, 3 : index | |
%58 = flow.dispatch.workload.ordinal %arg10, 4 : index | |
%59 = flow.dispatch.workload.ordinal %arg11, 5 : index | |
%60 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%54, %55} | |
%61 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%56, %57} | |
%62 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%58, %59} | |
%63 = flow.dispatch.tensor.load %60, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%54, %55, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%54, %55} -> tensor<?x?x8x4x16x2x8xi8> | |
%64 = flow.dispatch.tensor.load %61, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%56, %57, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%56, %57} -> tensor<?x?x4x2x4x16x2x8xi8> | |
%65 = flow.dispatch.tensor.load %62, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%58, %59, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%58, %59} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%66 = iree_gpu.multi_mma %63, %64, %65 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
flow.dispatch.tensor.store %66, %62, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%58, %59, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%58, %59} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6, %arg7, %arg8 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%46 = flow.dispatch.workgroups[%33, %34](%45, %33, %34) : (tensor<?x?x8x4x2x4x16x4xi32>{%33, %34}, index, index) -> tensor<?x8x4x4x?x4x2x16xi32>{%33, %34} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?x8x4x2x4x16x4xi32>>, %arg4: index, %arg5: index, %arg6: !flow.dispatch.tensor<writeonly:tensor<?x8x4x4x?x4x2x16xi32>>) { | |
%54 = flow.dispatch.workload.ordinal %arg4, 0 : index | |
%55 = flow.dispatch.workload.ordinal %arg5, 1 : index | |
%56 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x2x4x16x4xi32>>{%54, %55} | |
%57 = flow.dispatch.tie_shape %arg6 : !flow.dispatch.tensor<writeonly:tensor<?x8x4x4x?x4x2x16xi32>>{%54, %55} | |
%58 = flow.dispatch.tensor.load %56, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%54, %55, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x2x4x16x4xi32>>{%54, %55} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%59 = tensor.empty(%54, %55) : tensor<?x8x4x4x?x4x2x16xi32> | |
%60 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d2, d3, d4, d1, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%58 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%59 : tensor<?x8x4x4x?x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x8x4x4x?x4x2x16xi32> | |
flow.dispatch.tensor.store %60, %57, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%54, 8, 4, 4, %55, 4, 2, 16], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x8x4x4x?x4x2x16xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x8x4x4x?x4x2x16xi32>>{%54, %55} | |
flow.return | |
} count(%arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%47 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%6] | |
%48 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%7] | |
%49 = flow.tensor.reshape %46 : tensor<?x8x4x4x?x4x2x16xi32>{%33, %34} -> tensor<?x?xi32>{%47, %48} | |
%50 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%6] | |
%51 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%7] | |
%52 = flow.dispatch.workgroups[%50, %51, %6, %7](%49, %6, %7, %47, %48) : (tensor<?x?xi32>{%50, %51}, index, index, index, index) -> tensor<?x?xi32>{%6, %7} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xi32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>) { | |
%54 = flow.dispatch.workload.ordinal %arg4, 2 : index | |
%55 = flow.dispatch.workload.ordinal %arg5, 3 : index | |
%56 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%arg6, %arg7} | |
%57 = flow.dispatch.tie_shape %arg8 : !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>{%54, %55} | |
%58 = flow.dispatch.tensor.load %56, offsets = [0, 0], sizes = [%54, %55], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%arg6, %arg7} -> tensor<?x?xi32> | |
flow.dispatch.tensor.store %58, %57, offsets = [0, 0], sizes = [%54, %55], strides = [1, 1] : tensor<?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>{%54, %55} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%53 = hal.tensor.export %52 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %53 : !hal.buffer_view | |
} | |
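// CSE below folds the duplicated affine.apply chains, so each padded size and tile
// count is computed once (e.g. the reshape of %14 now reuses %9/%10 directly) and the
// local value numbering inside the dispatches compacts accordingly.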
// -----// IR Dump After CSE (cse) //----- // | |
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%0] | |
%10 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%1] | |
%11 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%0] | |
%12 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%1] | |
%13 = flow.tensor.splat %c0_i8 : tensor<?x?xi8>{%11, %12} | |
%14 = flow.dispatch.workgroups[%0, %1, %11, %12](%2, %13, %0, %1, %11, %12) : (tensor<?x?xi8>{%0, %1}, tensor<?x?xi8>{%11, %12}, index, index, index, index) -> %13{%11, %12} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xi8>>, %arg4: !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { | |
%38 = flow.dispatch.workload.ordinal %arg5, 0 : index | |
%39 = flow.dispatch.workload.ordinal %arg6, 1 : index | |
%40 = flow.dispatch.workload.ordinal %arg7, 2 : index | |
%41 = flow.dispatch.workload.ordinal %arg8, 3 : index | |
%42 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%38, %39} | |
%43 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>{%40, %41} | |
%44 = flow.dispatch.tensor.load %42, offsets = [0, 0], sizes = [%38, %39], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%38, %39} -> tensor<?x?xi8> | |
flow.dispatch.tensor.store %44, %43, offsets = [0, 0], sizes = [%38, %39], strides = [1, 1] : tensor<?x?xi8> -> !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>{%40, %41} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%15 = flow.tensor.reshape %14 : tensor<?x?xi8>{%11, %12} -> tensor<?x8x16x?x2x4x8xi8>{%9, %10} | |
%16 = flow.dispatch.workgroups[%9, %10, %9, %10](%15, %9, %10, %9, %10, %9, %10) : (tensor<?x8x16x?x2x4x8xi8>{%9, %10}, index, index, index, index, index, index) -> tensor<?x?x8x4x16x2x8xi8>{%9, %10} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x8x16x?x2x4x8xi8>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x16x2x8xi8>>) { | |
%38 = flow.dispatch.workload.ordinal %arg6, 2 : index | |
%39 = flow.dispatch.workload.ordinal %arg7, 3 : index | |
%40 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x8x16x?x2x4x8xi8>>{%arg8, %arg9} | |
%41 = flow.dispatch.tie_shape %arg10 : !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x16x2x8xi8>>{%38, %39} | |
%42 = flow.dispatch.workload.ordinal %arg4, 0 : index | |
%43 = flow.dispatch.workload.ordinal %arg5, 1 : index | |
%44 = flow.dispatch.tensor.load %40, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%42, 8, 16, %43, 2, 4, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x8x16x?x2x4x8xi8>>{%arg8, %arg9} -> tensor<?x8x16x?x2x4x8xi8> | |
%45 = tensor.empty(%38, %39) : tensor<?x?x8x4x16x2x8xi8> | |
%46 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d2, d3, d1, d4, d5, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d5, d3, d4, d6)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%44 : tensor<?x8x16x?x2x4x8xi8>) outs(%45 : tensor<?x?x8x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x8x4x16x2x8xi8> | |
flow.dispatch.tensor.store %46, %41, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%38, %39, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x16x2x8xi8> -> !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x16x2x8xi8>>{%38, %39} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%17 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%4] | |
%18 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%3] | |
%19 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%3] | |
%20 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%4] | |
%21 = flow.tensor.splat %c0_i8 : tensor<?x?xi8>{%19, %20} | |
%22 = flow.dispatch.workgroups[%3, %4, %19, %20](%5, %21, %3, %4, %19, %20) : (tensor<?x?xi8>{%3, %4}, tensor<?x?xi8>{%19, %20}, index, index, index, index) -> %21{%19, %20} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xi8>>, %arg4: !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { | |
%38 = flow.dispatch.workload.ordinal %arg5, 0 : index | |
%39 = flow.dispatch.workload.ordinal %arg6, 1 : index | |
%40 = flow.dispatch.workload.ordinal %arg7, 2 : index | |
%41 = flow.dispatch.workload.ordinal %arg8, 3 : index | |
%42 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%38, %39} | |
%43 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>{%40, %41} | |
%44 = flow.dispatch.tensor.load %42, offsets = [0, 0], sizes = [%38, %39], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%38, %39} -> tensor<?x?xi8> | |
flow.dispatch.tensor.store %44, %43, offsets = [0, 0], sizes = [%38, %39], strides = [1, 1] : tensor<?x?xi8> -> !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>{%40, %41} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%23 = flow.tensor.reshape %22 : tensor<?x?xi8>{%19, %20} -> tensor<?x2x4x8x?x4x2x16xi8>{%18, %17} | |
%24 = flow.dispatch.workgroups[%18, %17, %17, %18](%23, %18, %17, %17, %18, %18, %17) : (tensor<?x2x4x8x?x4x2x16xi8>{%18, %17}, index, index, index, index, index, index) -> tensor<?x?x4x2x4x16x2x8xi8>{%17, %18} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x2x4x8x?x4x2x16xi8>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<?x?x4x2x4x16x2x8xi8>>) { | |
%38 = flow.dispatch.workload.ordinal %arg6, 2 : index | |
%39 = flow.dispatch.workload.ordinal %arg7, 3 : index | |
%40 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x2x4x8x?x4x2x16xi8>>{%arg8, %arg9} | |
%41 = flow.dispatch.tie_shape %arg10 : !flow.dispatch.tensor<writeonly:tensor<?x?x4x2x4x16x2x8xi8>>{%38, %39} | |
%42 = flow.dispatch.workload.ordinal %arg4, 0 : index | |
%43 = flow.dispatch.workload.ordinal %arg5, 1 : index | |
%44 = flow.dispatch.tensor.load %40, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%42, 2, 4, 8, %43, 4, 2, 16], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x2x4x8x?x4x2x16xi8>>{%arg8, %arg9} -> tensor<?x2x4x8x?x4x2x16xi8> | |
%45 = tensor.empty(%38, %39) : tensor<?x?x4x2x4x16x2x8xi8> | |
%46 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d1, d5, d6, d7, d0, d2, d3, d4)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d6, d4, d5, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%44 : tensor<?x2x4x8x?x4x2x16xi8>) outs(%45 : tensor<?x?x4x2x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x4x2x4x16x2x8xi8> | |
flow.dispatch.tensor.store %46, %41, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%38, %39, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x4x2x4x16x2x8xi8> -> !flow.dispatch.tensor<writeonly:tensor<?x?x4x2x4x16x2x8xi8>>{%38, %39} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%25 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%6] | |
%26 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%7] | |
%27 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%6] | |
%28 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%7] | |
%29 = flow.tensor.splat %c0_i32 : tensor<?x?xi32>{%27, %28} | |
%30 = flow.dispatch.workgroups[%6, %7, %27, %28](%8, %29, %6, %7, %27, %28) : (tensor<?x?xi32>{%6, %7}, tensor<?x?xi32>{%27, %28}, index, index, index, index) -> %29{%27, %28} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xi32>>, %arg4: !flow.dispatch.tensor<readwrite:tensor<?x?xi32>>, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { | |
%38 = flow.dispatch.workload.ordinal %arg5, 0 : index | |
%39 = flow.dispatch.workload.ordinal %arg6, 1 : index | |
%40 = flow.dispatch.workload.ordinal %arg7, 2 : index | |
%41 = flow.dispatch.workload.ordinal %arg8, 3 : index | |
%42 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%38, %39} | |
%43 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<readwrite:tensor<?x?xi32>>{%40, %41} | |
%44 = flow.dispatch.tensor.load %42, offsets = [0, 0], sizes = [%38, %39], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%38, %39} -> tensor<?x?xi32> | |
flow.dispatch.tensor.store %44, %43, offsets = [0, 0], sizes = [%38, %39], strides = [1, 1] : tensor<?x?xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?xi32>>{%40, %41} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%31 = flow.tensor.reshape %30 : tensor<?x?xi32>{%27, %28} -> tensor<?x8x4x4x?x4x2x16xi32>{%25, %26} | |
%32 = flow.dispatch.workgroups[%25, %26, %25, %26](%31, %25, %26, %25, %26, %25, %26) : (tensor<?x8x4x4x?x4x2x16xi32>{%25, %26}, index, index, index, index, index, index) -> tensor<?x?x8x4x2x4x16x4xi32>{%25, %26} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x8x4x4x?x4x2x16xi32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x2x4x16x4xi32>>) { | |
%38 = flow.dispatch.workload.ordinal %arg6, 2 : index | |
%39 = flow.dispatch.workload.ordinal %arg7, 3 : index | |
%40 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x8x4x4x?x4x2x16xi32>>{%arg8, %arg9} | |
%41 = flow.dispatch.tie_shape %arg10 : !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x2x4x16x4xi32>>{%38, %39} | |
%42 = flow.dispatch.workload.ordinal %arg4, 0 : index | |
%43 = flow.dispatch.workload.ordinal %arg5, 1 : index | |
%44 = flow.dispatch.tensor.load %40, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%42, 8, 4, 4, %43, 4, 2, 16], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x8x4x4x?x4x2x16xi32>>{%arg8, %arg9} -> tensor<?x8x4x4x?x4x2x16xi32> | |
%45 = tensor.empty(%38, %39) : tensor<?x?x8x4x2x4x16x4xi32> | |
%46 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d2, d3, d4, d1, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%44 : tensor<?x8x4x4x?x4x2x16xi32>) outs(%45 : tensor<?x?x8x4x2x4x16x4xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x2x4x16x4xi32> | |
flow.dispatch.tensor.store %46, %41, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%38, %39, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x2x4x16x4xi32>>{%38, %39} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%33 = flow.dispatch.workgroups[%9, %10, %17, %18, %25, %26](%16, %24, %32, %9, %10, %17, %18, %25, %26) : (tensor<?x?x8x4x16x2x8xi8>{%9, %10}, tensor<?x?x4x2x4x16x2x8xi8>{%17, %18}, tensor<?x?x8x4x2x4x16x4xi32>{%25, %26}, index, index, index, index, index, index) -> %32{%25, %26} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>, %arg4: !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>, %arg5: !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: index, %arg11: index) { | |
%38 = flow.dispatch.workload.ordinal %arg6, 0 : index | |
%39 = flow.dispatch.workload.ordinal %arg7, 1 : index | |
%40 = flow.dispatch.workload.ordinal %arg8, 2 : index | |
%41 = flow.dispatch.workload.ordinal %arg9, 3 : index | |
%42 = flow.dispatch.workload.ordinal %arg10, 4 : index | |
%43 = flow.dispatch.workload.ordinal %arg11, 5 : index | |
%44 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%38, %39} | |
%45 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%40, %41} | |
%46 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%42, %43} | |
%47 = flow.dispatch.tensor.load %44, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%38, %39, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%38, %39} -> tensor<?x?x8x4x16x2x8xi8> | |
%48 = flow.dispatch.tensor.load %45, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%40, %41, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%40, %41} -> tensor<?x?x4x2x4x16x2x8xi8> | |
%49 = flow.dispatch.tensor.load %46, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%42, %43, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%42, %43} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%50 = iree_gpu.multi_mma %47, %48, %49 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
flow.dispatch.tensor.store %50, %46, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%42, %43, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%42, %43} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6, %arg7, %arg8 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%34 = flow.dispatch.workgroups[%25, %26](%33, %25, %26) : (tensor<?x?x8x4x2x4x16x4xi32>{%25, %26}, index, index) -> tensor<?x8x4x4x?x4x2x16xi32>{%25, %26} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?x8x4x2x4x16x4xi32>>, %arg4: index, %arg5: index, %arg6: !flow.dispatch.tensor<writeonly:tensor<?x8x4x4x?x4x2x16xi32>>) { | |
%38 = flow.dispatch.workload.ordinal %arg4, 0 : index | |
%39 = flow.dispatch.workload.ordinal %arg5, 1 : index | |
%40 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x2x4x16x4xi32>>{%38, %39} | |
%41 = flow.dispatch.tie_shape %arg6 : !flow.dispatch.tensor<writeonly:tensor<?x8x4x4x?x4x2x16xi32>>{%38, %39} | |
%42 = flow.dispatch.tensor.load %40, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%38, %39, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x2x4x16x4xi32>>{%38, %39} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%43 = tensor.empty(%38, %39) : tensor<?x8x4x4x?x4x2x16xi32> | |
%44 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d2, d3, d4, d1, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%42 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%43 : tensor<?x8x4x4x?x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x8x4x4x?x4x2x16xi32> | |
flow.dispatch.tensor.store %44, %41, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%38, 8, 4, 4, %39, 4, 2, 16], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x8x4x4x?x4x2x16xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x8x4x4x?x4x2x16xi32>>{%38, %39} | |
flow.return | |
} count(%arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%35 = flow.tensor.reshape %34 : tensor<?x8x4x4x?x4x2x16xi32>{%25, %26} -> tensor<?x?xi32>{%27, %28} | |
%36 = flow.dispatch.workgroups[%27, %28, %6, %7](%35, %6, %7, %27, %28) : (tensor<?x?xi32>{%27, %28}, index, index, index, index) -> tensor<?x?xi32>{%6, %7} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xi32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>) { | |
%38 = flow.dispatch.workload.ordinal %arg4, 2 : index | |
%39 = flow.dispatch.workload.ordinal %arg5, 3 : index | |
%40 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%arg6, %arg7} | |
%41 = flow.dispatch.tie_shape %arg8 : !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>{%38, %39} | |
%42 = flow.dispatch.tensor.load %40, offsets = [0, 0], sizes = [%38, %39], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%arg6, %arg7} -> tensor<?x?xi32> | |
flow.dispatch.tensor.store %42, %41, offsets = [0, 0], sizes = [%38, %39], strides = [1, 1] : tensor<?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>{%38, %39} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%37 = hal.tensor.export %36 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %37 : !hal.buffer_view | |
} | |
// -----// IR Dump After InitializeEmptyTensorsPass (iree-flow-initialize-empty-tensors) //----- // | |
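// NOTE: iree-flow-initialize-empty-tensors gives defined contents to any
// tensor.empty values that escape a dispatch region; every tensor.empty in
// this module is already local to a dispatch, so the function below appears
// unchanged from the previous dump.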
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%0] | |
%10 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%1] | |
%11 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%0] | |
%12 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%1] | |
%13 = flow.tensor.splat %c0_i8 : tensor<?x?xi8>{%11, %12} | |
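    // Pad the LHS up to the data-tiling granularity: %11 = ceil(M/128)*128 rows,
    // %12 = ceil(K/64)*64 columns. The tile sizes follow from the
    // MFMA_I32_16x16x32_I8 layout used below: M0 = unroll_m * 16 = 128 and
    // K0 = unroll_k * 32 = 64. %13 is a zero splat; the dispatch below copies
    // the original %0 x %1 payload into its top-left corner, realizing the
    // padding without a tensor.pad op.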
%14 = flow.dispatch.workgroups[%0, %1, %11, %12](%2, %13, %0, %1, %11, %12) : (tensor<?x?xi8>{%0, %1}, tensor<?x?xi8>{%11, %12}, index, index, index, index) -> %13{%11, %12} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xi8>>, %arg4: !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { | |
%38 = flow.dispatch.workload.ordinal %arg5, 0 : index | |
%39 = flow.dispatch.workload.ordinal %arg6, 1 : index | |
%40 = flow.dispatch.workload.ordinal %arg7, 2 : index | |
%41 = flow.dispatch.workload.ordinal %arg8, 3 : index | |
%42 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%38, %39} | |
%43 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>{%40, %41} | |
%44 = flow.dispatch.tensor.load %42, offsets = [0, 0], sizes = [%38, %39], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%38, %39} -> tensor<?x?xi8> | |
flow.dispatch.tensor.store %44, %43, offsets = [0, 0], sizes = [%38, %39], strides = [1, 1] : tensor<?x?xi8> -> !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>{%40, %41} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%15 = flow.tensor.reshape %14 : tensor<?x?xi8>{%11, %12} -> tensor<?x8x16x?x2x4x8xi8>{%9, %10} | |
%16 = flow.dispatch.workgroups[%9, %10, %9, %10](%15, %9, %10, %9, %10, %9, %10) : (tensor<?x8x16x?x2x4x8xi8>{%9, %10}, index, index, index, index, index, index) -> tensor<?x?x8x4x16x2x8xi8>{%9, %10} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x8x16x?x2x4x8xi8>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x16x2x8xi8>>) { | |
%38 = flow.dispatch.workload.ordinal %arg6, 2 : index | |
%39 = flow.dispatch.workload.ordinal %arg7, 3 : index | |
%40 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x8x16x?x2x4x8xi8>>{%arg8, %arg9} | |
%41 = flow.dispatch.tie_shape %arg10 : !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x16x2x8xi8>>{%38, %39} | |
%42 = flow.dispatch.workload.ordinal %arg4, 0 : index | |
%43 = flow.dispatch.workload.ordinal %arg5, 1 : index | |
%44 = flow.dispatch.tensor.load %40, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%42, 8, 16, %43, 2, 4, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x8x16x?x2x4x8xi8>>{%arg8, %arg9} -> tensor<?x8x16x?x2x4x8xi8> | |
%45 = tensor.empty(%38, %39) : tensor<?x?x8x4x16x2x8xi8> | |
%46 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d2, d3, d1, d4, d5, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d5, d3, d4, d6)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%44 : tensor<?x8x16x?x2x4x8xi8>) outs(%45 : tensor<?x?x8x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x8x4x16x2x8xi8> | |
flow.dispatch.tensor.store %46, %41, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%38, %39, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x16x2x8xi8> -> !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x16x2x8xi8>>{%38, %39} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
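    // %15/%16 pack the padded LHS: the reshape splits rows into 128-wide M
    // tiles (8x16) and columns into 64-wide K tiles (2x4x8), and the
    // linalg.generic transposes the inner dims into the ?x?x8x4x16x2x8 layout
    // that the data-tiled multi_mma below expects for its LHS operand.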
%17 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%4] | |
%18 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%3] | |
%19 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>()[%3] | |
%20 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%4] | |
%21 = flow.tensor.splat %c0_i8 : tensor<?x?xi8>{%19, %20} | |
%22 = flow.dispatch.workgroups[%3, %4, %19, %20](%5, %21, %3, %4, %19, %20) : (tensor<?x?xi8>{%3, %4}, tensor<?x?xi8>{%19, %20}, index, index, index, index) -> %21{%19, %20} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xi8>>, %arg4: !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { | |
%38 = flow.dispatch.workload.ordinal %arg5, 0 : index | |
%39 = flow.dispatch.workload.ordinal %arg6, 1 : index | |
%40 = flow.dispatch.workload.ordinal %arg7, 2 : index | |
%41 = flow.dispatch.workload.ordinal %arg8, 3 : index | |
%42 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%38, %39} | |
%43 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>{%40, %41} | |
%44 = flow.dispatch.tensor.load %42, offsets = [0, 0], sizes = [%38, %39], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%38, %39} -> tensor<?x?xi8> | |
flow.dispatch.tensor.store %44, %43, offsets = [0, 0], sizes = [%38, %39], strides = [1, 1] : tensor<?x?xi8> -> !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>{%40, %41} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%23 = flow.tensor.reshape %22 : tensor<?x?xi8>{%19, %20} -> tensor<?x2x4x8x?x4x2x16xi8>{%18, %17} | |
%24 = flow.dispatch.workgroups[%18, %17, %17, %18](%23, %18, %17, %17, %18, %18, %17) : (tensor<?x2x4x8x?x4x2x16xi8>{%18, %17}, index, index, index, index, index, index) -> tensor<?x?x4x2x4x16x2x8xi8>{%17, %18} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x2x4x8x?x4x2x16xi8>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<?x?x4x2x4x16x2x8xi8>>) { | |
%38 = flow.dispatch.workload.ordinal %arg6, 2 : index | |
%39 = flow.dispatch.workload.ordinal %arg7, 3 : index | |
%40 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x2x4x8x?x4x2x16xi8>>{%arg8, %arg9} | |
%41 = flow.dispatch.tie_shape %arg10 : !flow.dispatch.tensor<writeonly:tensor<?x?x4x2x4x16x2x8xi8>>{%38, %39} | |
%42 = flow.dispatch.workload.ordinal %arg4, 0 : index | |
%43 = flow.dispatch.workload.ordinal %arg5, 1 : index | |
%44 = flow.dispatch.tensor.load %40, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%42, 2, 4, 8, %43, 4, 2, 16], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x2x4x8x?x4x2x16xi8>>{%arg8, %arg9} -> tensor<?x2x4x8x?x4x2x16xi8> | |
%45 = tensor.empty(%38, %39) : tensor<?x?x4x2x4x16x2x8xi8> | |
%46 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d1, d5, d6, d7, d0, d2, d3, d4)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d6, d4, d5, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%44 : tensor<?x2x4x8x?x4x2x16xi8>) outs(%45 : tensor<?x?x4x2x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x4x2x4x16x2x8xi8> | |
flow.dispatch.tensor.store %46, %41, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%38, %39, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x4x2x4x16x2x8xi8> -> !flow.dispatch.tensor<writeonly:tensor<?x?x4x2x4x16x2x8xi8>>{%38, %39} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
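    // Same treatment for the RHS (stored K x N): pad to %19 = ceil(K/64)*64 by
    // %20 = ceil(N/128)*128 (N0 = unroll_n_to_subgroups * unroll_n * 16 = 128),
    // then reshape into 64x128 K/N tiles and swizzle into ?x?x4x2x4x16x2x8.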
%25 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%6] | |
%26 = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%7] | |
%27 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%6] | |
%28 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 128) * 128)>()[%7] | |
%29 = flow.tensor.splat %c0_i32 : tensor<?x?xi32>{%27, %28} | |
%30 = flow.dispatch.workgroups[%6, %7, %27, %28](%8, %29, %6, %7, %27, %28) : (tensor<?x?xi32>{%6, %7}, tensor<?x?xi32>{%27, %28}, index, index, index, index) -> %29{%27, %28} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xi32>>, %arg4: !flow.dispatch.tensor<readwrite:tensor<?x?xi32>>, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { | |
%38 = flow.dispatch.workload.ordinal %arg5, 0 : index | |
%39 = flow.dispatch.workload.ordinal %arg6, 1 : index | |
%40 = flow.dispatch.workload.ordinal %arg7, 2 : index | |
%41 = flow.dispatch.workload.ordinal %arg8, 3 : index | |
%42 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%38, %39} | |
%43 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<readwrite:tensor<?x?xi32>>{%40, %41} | |
%44 = flow.dispatch.tensor.load %42, offsets = [0, 0], sizes = [%38, %39], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%38, %39} -> tensor<?x?xi32> | |
flow.dispatch.tensor.store %44, %43, offsets = [0, 0], sizes = [%38, %39], strides = [1, 1] : tensor<?x?xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?xi32>>{%40, %41} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%31 = flow.tensor.reshape %30 : tensor<?x?xi32>{%27, %28} -> tensor<?x8x4x4x?x4x2x16xi32>{%25, %26} | |
%32 = flow.dispatch.workgroups[%25, %26, %25, %26](%31, %25, %26, %25, %26, %25, %26) : (tensor<?x8x4x4x?x4x2x16xi32>{%25, %26}, index, index, index, index, index, index) -> tensor<?x?x8x4x2x4x16x4xi32>{%25, %26} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x8x4x4x?x4x2x16xi32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x2x4x16x4xi32>>) { | |
%38 = flow.dispatch.workload.ordinal %arg6, 2 : index | |
%39 = flow.dispatch.workload.ordinal %arg7, 3 : index | |
%40 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x8x4x4x?x4x2x16xi32>>{%arg8, %arg9} | |
%41 = flow.dispatch.tie_shape %arg10 : !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x2x4x16x4xi32>>{%38, %39} | |
%42 = flow.dispatch.workload.ordinal %arg4, 0 : index | |
%43 = flow.dispatch.workload.ordinal %arg5, 1 : index | |
%44 = flow.dispatch.tensor.load %40, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%42, 8, 4, 4, %43, 4, 2, 16], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x8x4x4x?x4x2x16xi32>>{%arg8, %arg9} -> tensor<?x8x4x4x?x4x2x16xi32> | |
%45 = tensor.empty(%38, %39) : tensor<?x?x8x4x2x4x16x4xi32> | |
%46 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d2, d3, d4, d1, d5, d6, d7)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%44 : tensor<?x8x4x4x?x4x2x16xi32>) outs(%45 : tensor<?x?x8x4x2x4x16x4xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x2x4x16x4xi32> | |
flow.dispatch.tensor.store %46, %41, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%38, %39, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x2x4x16x4xi32>>{%38, %39} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
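    // Accumulator: pad M and N to multiples of 128 (%27, %28), reshape into
    // 128x128 tiles (8x4x4 by 4x2x16), and swizzle into the ?x?x8x4x2x4x16x4
    // i32 layout produced by the multi_mma.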
%33 = flow.dispatch.workgroups[%9, %10, %17, %18, %25, %26](%16, %24, %32, %9, %10, %17, %18, %25, %26) : (tensor<?x?x8x4x16x2x8xi8>{%9, %10}, tensor<?x?x4x2x4x16x2x8xi8>{%17, %18}, tensor<?x?x8x4x2x4x16x4xi32>{%25, %26}, index, index, index, index, index, index) -> %32{%25, %26} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>, %arg4: !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>, %arg5: !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: index, %arg11: index) { | |
%38 = flow.dispatch.workload.ordinal %arg6, 0 : index | |
%39 = flow.dispatch.workload.ordinal %arg7, 1 : index | |
%40 = flow.dispatch.workload.ordinal %arg8, 2 : index | |
%41 = flow.dispatch.workload.ordinal %arg9, 3 : index | |
%42 = flow.dispatch.workload.ordinal %arg10, 4 : index | |
%43 = flow.dispatch.workload.ordinal %arg11, 5 : index | |
%44 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%38, %39} | |
%45 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%40, %41} | |
%46 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%42, %43} | |
%47 = flow.dispatch.tensor.load %44, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%38, %39, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%38, %39} -> tensor<?x?x8x4x16x2x8xi8> | |
%48 = flow.dispatch.tensor.load %45, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%40, %41, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%40, %41} -> tensor<?x?x4x2x4x16x2x8xi8> | |
%49 = flow.dispatch.tensor.load %46, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%42, %43, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%42, %43} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%50 = iree_gpu.multi_mma %47, %48, %49 {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
flow.dispatch.tensor.store %50, %46, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%42, %43, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%42, %43} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6, %arg7, %arg8 | |
flow.return %x, %y, %z : index, index, index | |
} | |
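    // %33 is the matmul proper: iree_gpu.multi_mma contracts the outer tile
    // dims as an ordinary matmul ((d0, d2) x (d1, d2) -> (d0, d1)) while each
    // inner 128x128x64 tile is computed with the MFMA_I32_16x16x32_I8
    // intrinsic, unrolled 8x in M, 2x in K, and 2x in N across 4 subgroups.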
%34 = flow.dispatch.workgroups[%25, %26](%33, %25, %26) : (tensor<?x?x8x4x2x4x16x4xi32>{%25, %26}, index, index) -> tensor<?x8x4x4x?x4x2x16xi32>{%25, %26} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?x8x4x2x4x16x4xi32>>, %arg4: index, %arg5: index, %arg6: !flow.dispatch.tensor<writeonly:tensor<?x8x4x4x?x4x2x16xi32>>) { | |
%38 = flow.dispatch.workload.ordinal %arg4, 0 : index | |
%39 = flow.dispatch.workload.ordinal %arg5, 1 : index | |
%40 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x2x4x16x4xi32>>{%38, %39} | |
%41 = flow.dispatch.tie_shape %arg6 : !flow.dispatch.tensor<writeonly:tensor<?x8x4x4x?x4x2x16xi32>>{%38, %39} | |
%42 = flow.dispatch.tensor.load %40, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%38, %39, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x2x4x16x4xi32>>{%38, %39} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%43 = tensor.empty(%38, %39) : tensor<?x8x4x4x?x4x2x16xi32> | |
%44 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d2, d3, d4, d1, d5, d6, d7)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%42 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%43 : tensor<?x8x4x4x?x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x8x4x4x?x4x2x16xi32> | |
flow.dispatch.tensor.store %44, %41, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%38, 8, 4, 4, %39, 4, 2, 16], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x8x4x4x?x4x2x16xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x8x4x4x?x4x2x16xi32>>{%38, %39} | |
flow.return | |
} count(%arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4 | |
flow.return %x, %y, %z : index, index, index | |
} | |
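    // Epilogue: %34 is the inverse of the accumulator swizzle (%32's indexing
    // maps swapped), the reshape below flattens the tiles back into the padded
    // %27 x %28 matrix, and the final dispatch extracts the original %6 x %7
    // result.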
%35 = flow.tensor.reshape %34 : tensor<?x8x4x4x?x4x2x16xi32>{%25, %26} -> tensor<?x?xi32>{%27, %28} | |
%36 = flow.dispatch.workgroups[%27, %28, %6, %7](%35, %6, %7, %27, %28) : (tensor<?x?xi32>{%27, %28}, index, index, index, index) -> tensor<?x?xi32>{%6, %7} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xi32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>) { | |
%38 = flow.dispatch.workload.ordinal %arg4, 2 : index | |
%39 = flow.dispatch.workload.ordinal %arg5, 3 : index | |
%40 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%arg6, %arg7} | |
%41 = flow.dispatch.tie_shape %arg8 : !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>{%38, %39} | |
%42 = flow.dispatch.tensor.load %40, offsets = [0, 0], sizes = [%38, %39], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%arg6, %arg7} -> tensor<?x?xi32> | |
flow.dispatch.tensor.store %42, %41, offsets = [0, 0], sizes = [%38, %39], strides = [1, 1] : tensor<?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>{%38, %39} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%37 = hal.tensor.export %36 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %37 : !hal.buffer_view | |
} | |
// -----// IR Dump After OutlineDispatchExternsPass (iree-flow-outline-dispatch-externs) //----- // | |
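// NOTE: iree-flow-outline-dispatch-externs outlines hand-authored
// flow.dispatch.extern ops into hal.executables; this module has none, so the
// dispatch logic is unchanged. The dump now shows the enclosing module with
// its attribute aliases (#map*, #executable_target_rocm_hsaco_fb,
// #device_target_hip).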
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>, ukernels = "none"}> | |
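// Target summary: gfx942 (CDNA3 / MI300 class), 64-lane subgroups, 64 KiB of
// workgroup memory, and an MFMA list that includes the MFMA_I32_16x16x32_I8
// intrinsic the data-tiled matmul relies on; #device_target_hip below binds
// this executable target to the HIP driver (with legacy_sync semantics).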
#map = affine_map<()[s0] -> (s0 ceildiv 128)> | |
#map1 = affine_map<()[s0] -> (s0 ceildiv 64)> | |
#map2 = affine_map<()[s0] -> ((s0 ceildiv 128) * 128)> | |
#map3 = affine_map<()[s0] -> ((s0 ceildiv 64) * 64)> | |
#map4 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d2, d3, d1, d4, d5, d6)> | |
#map5 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d5, d3, d4, d6)> | |
#map6 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d1, d5, d6, d7, d0, d2, d3, d4)> | |
#map7 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d6, d4, d5, d7)> | |
#map8 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d2, d3, d4, d1, d5, d6, d7)> | |
#map9 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)> | |
#map10 = affine_map<(d0, d1, d2) -> (d0, d2)> | |
#map11 = affine_map<(d0, d1, d2) -> (d1, d2)> | |
#map12 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
#device_target_hip = #hal.device.target<"hip", {legacy_sync}, [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%9 = affine.apply #map()[%0] | |
%10 = affine.apply #map1()[%1] | |
%11 = affine.apply #map2()[%0] | |
%12 = affine.apply #map3()[%1] | |
%13 = flow.tensor.splat %c0_i8 : tensor<?x?xi8>{%11, %12} | |
%14 = flow.dispatch.workgroups[%0, %1, %11, %12](%2, %13, %0, %1, %11, %12) : (tensor<?x?xi8>{%0, %1}, tensor<?x?xi8>{%11, %12}, index, index, index, index) -> %13{%11, %12} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xi8>>, %arg4: !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { | |
%38 = flow.dispatch.workload.ordinal %arg5, 0 : index | |
%39 = flow.dispatch.workload.ordinal %arg6, 1 : index | |
%40 = flow.dispatch.workload.ordinal %arg7, 2 : index | |
%41 = flow.dispatch.workload.ordinal %arg8, 3 : index | |
%42 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%38, %39} | |
%43 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>{%40, %41} | |
%44 = flow.dispatch.tensor.load %42, offsets = [0, 0], sizes = [%38, %39], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%38, %39} -> tensor<?x?xi8> | |
flow.dispatch.tensor.store %44, %43, offsets = [0, 0], sizes = [%38, %39], strides = [1, 1] : tensor<?x?xi8> -> !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>{%40, %41} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%15 = flow.tensor.reshape %14 : tensor<?x?xi8>{%11, %12} -> tensor<?x8x16x?x2x4x8xi8>{%9, %10} | |
%16 = flow.dispatch.workgroups[%9, %10, %9, %10](%15, %9, %10, %9, %10, %9, %10) : (tensor<?x8x16x?x2x4x8xi8>{%9, %10}, index, index, index, index, index, index) -> tensor<?x?x8x4x16x2x8xi8>{%9, %10} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x8x16x?x2x4x8xi8>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x16x2x8xi8>>) { | |
%38 = flow.dispatch.workload.ordinal %arg6, 2 : index | |
%39 = flow.dispatch.workload.ordinal %arg7, 3 : index | |
%40 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x8x16x?x2x4x8xi8>>{%arg8, %arg9} | |
%41 = flow.dispatch.tie_shape %arg10 : !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x16x2x8xi8>>{%38, %39} | |
%42 = flow.dispatch.workload.ordinal %arg4, 0 : index | |
%43 = flow.dispatch.workload.ordinal %arg5, 1 : index | |
%44 = flow.dispatch.tensor.load %40, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%42, 8, 16, %43, 2, 4, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x8x16x?x2x4x8xi8>>{%arg8, %arg9} -> tensor<?x8x16x?x2x4x8xi8> | |
%45 = tensor.empty(%38, %39) : tensor<?x?x8x4x16x2x8xi8> | |
%46 = linalg.generic {indexing_maps = [#map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%44 : tensor<?x8x16x?x2x4x8xi8>) outs(%45 : tensor<?x?x8x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x8x4x16x2x8xi8> | |
flow.dispatch.tensor.store %46, %41, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%38, %39, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x16x2x8xi8> -> !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x16x2x8xi8>>{%38, %39} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%17 = affine.apply #map()[%4] | |
%18 = affine.apply #map1()[%3] | |
%19 = affine.apply #map3()[%3] | |
%20 = affine.apply #map2()[%4] | |
%21 = flow.tensor.splat %c0_i8 : tensor<?x?xi8>{%19, %20} | |
%22 = flow.dispatch.workgroups[%3, %4, %19, %20](%5, %21, %3, %4, %19, %20) : (tensor<?x?xi8>{%3, %4}, tensor<?x?xi8>{%19, %20}, index, index, index, index) -> %21{%19, %20} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xi8>>, %arg4: !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { | |
%38 = flow.dispatch.workload.ordinal %arg5, 0 : index | |
%39 = flow.dispatch.workload.ordinal %arg6, 1 : index | |
%40 = flow.dispatch.workload.ordinal %arg7, 2 : index | |
%41 = flow.dispatch.workload.ordinal %arg8, 3 : index | |
%42 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%38, %39} | |
%43 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>{%40, %41} | |
%44 = flow.dispatch.tensor.load %42, offsets = [0, 0], sizes = [%38, %39], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%38, %39} -> tensor<?x?xi8> | |
flow.dispatch.tensor.store %44, %43, offsets = [0, 0], sizes = [%38, %39], strides = [1, 1] : tensor<?x?xi8> -> !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>{%40, %41} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%23 = flow.tensor.reshape %22 : tensor<?x?xi8>{%19, %20} -> tensor<?x2x4x8x?x4x2x16xi8>{%18, %17} | |
%24 = flow.dispatch.workgroups[%18, %17, %17, %18](%23, %18, %17, %17, %18, %18, %17) : (tensor<?x2x4x8x?x4x2x16xi8>{%18, %17}, index, index, index, index, index, index) -> tensor<?x?x4x2x4x16x2x8xi8>{%17, %18} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x2x4x8x?x4x2x16xi8>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<?x?x4x2x4x16x2x8xi8>>) { | |
%38 = flow.dispatch.workload.ordinal %arg6, 2 : index | |
%39 = flow.dispatch.workload.ordinal %arg7, 3 : index | |
%40 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x2x4x8x?x4x2x16xi8>>{%arg8, %arg9} | |
%41 = flow.dispatch.tie_shape %arg10 : !flow.dispatch.tensor<writeonly:tensor<?x?x4x2x4x16x2x8xi8>>{%38, %39} | |
%42 = flow.dispatch.workload.ordinal %arg4, 0 : index | |
%43 = flow.dispatch.workload.ordinal %arg5, 1 : index | |
%44 = flow.dispatch.tensor.load %40, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%42, 2, 4, 8, %43, 4, 2, 16], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x2x4x8x?x4x2x16xi8>>{%arg8, %arg9} -> tensor<?x2x4x8x?x4x2x16xi8> | |
%45 = tensor.empty(%38, %39) : tensor<?x?x4x2x4x16x2x8xi8> | |
%46 = linalg.generic {indexing_maps = [#map6, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%44 : tensor<?x2x4x8x?x4x2x16xi8>) outs(%45 : tensor<?x?x4x2x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x4x2x4x16x2x8xi8> | |
flow.dispatch.tensor.store %46, %41, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%38, %39, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x4x2x4x16x2x8xi8> -> !flow.dispatch.tensor<writeonly:tensor<?x?x4x2x4x16x2x8xi8>>{%38, %39} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%25 = affine.apply #map()[%6] | |
%26 = affine.apply #map()[%7] | |
%27 = affine.apply #map2()[%6] | |
%28 = affine.apply #map2()[%7] | |
%29 = flow.tensor.splat %c0_i32 : tensor<?x?xi32>{%27, %28} | |
%30 = flow.dispatch.workgroups[%6, %7, %27, %28](%8, %29, %6, %7, %27, %28) : (tensor<?x?xi32>{%6, %7}, tensor<?x?xi32>{%27, %28}, index, index, index, index) -> %29{%27, %28} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xi32>>, %arg4: !flow.dispatch.tensor<readwrite:tensor<?x?xi32>>, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { | |
%38 = flow.dispatch.workload.ordinal %arg5, 0 : index | |
%39 = flow.dispatch.workload.ordinal %arg6, 1 : index | |
%40 = flow.dispatch.workload.ordinal %arg7, 2 : index | |
%41 = flow.dispatch.workload.ordinal %arg8, 3 : index | |
%42 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%38, %39} | |
%43 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<readwrite:tensor<?x?xi32>>{%40, %41} | |
%44 = flow.dispatch.tensor.load %42, offsets = [0, 0], sizes = [%38, %39], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%38, %39} -> tensor<?x?xi32> | |
flow.dispatch.tensor.store %44, %43, offsets = [0, 0], sizes = [%38, %39], strides = [1, 1] : tensor<?x?xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?xi32>>{%40, %41} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%31 = flow.tensor.reshape %30 : tensor<?x?xi32>{%27, %28} -> tensor<?x8x4x4x?x4x2x16xi32>{%25, %26} | |
%32 = flow.dispatch.workgroups[%25, %26, %25, %26](%31, %25, %26, %25, %26, %25, %26) : (tensor<?x8x4x4x?x4x2x16xi32>{%25, %26}, index, index, index, index, index, index) -> tensor<?x?x8x4x2x4x16x4xi32>{%25, %26} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x8x4x4x?x4x2x16xi32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x2x4x16x4xi32>>) { | |
%38 = flow.dispatch.workload.ordinal %arg6, 2 : index | |
%39 = flow.dispatch.workload.ordinal %arg7, 3 : index | |
%40 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x8x4x4x?x4x2x16xi32>>{%arg8, %arg9} | |
%41 = flow.dispatch.tie_shape %arg10 : !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x2x4x16x4xi32>>{%38, %39} | |
%42 = flow.dispatch.workload.ordinal %arg4, 0 : index | |
%43 = flow.dispatch.workload.ordinal %arg5, 1 : index | |
%44 = flow.dispatch.tensor.load %40, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%42, 8, 4, 4, %43, 4, 2, 16], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x8x4x4x?x4x2x16xi32>>{%arg8, %arg9} -> tensor<?x8x4x4x?x4x2x16xi32> | |
%45 = tensor.empty(%38, %39) : tensor<?x?x8x4x2x4x16x4xi32> | |
%46 = linalg.generic {indexing_maps = [#map8, #map9], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%44 : tensor<?x8x4x4x?x4x2x16xi32>) outs(%45 : tensor<?x?x8x4x2x4x16x4xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x2x4x16x4xi32> | |
flow.dispatch.tensor.store %46, %41, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%38, %39, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x2x4x16x4xi32>>{%38, %39} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%33 = flow.dispatch.workgroups[%9, %10, %17, %18, %25, %26](%16, %24, %32, %9, %10, %17, %18, %25, %26) : (tensor<?x?x8x4x16x2x8xi8>{%9, %10}, tensor<?x?x4x2x4x16x2x8xi8>{%17, %18}, tensor<?x?x8x4x2x4x16x4xi32>{%25, %26}, index, index, index, index, index, index) -> %32{%25, %26} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>, %arg4: !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>, %arg5: !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>, %arg6: index, %arg7: index, %arg8: index, %arg9: index, %arg10: index, %arg11: index) { | |
%38 = flow.dispatch.workload.ordinal %arg6, 0 : index | |
%39 = flow.dispatch.workload.ordinal %arg7, 1 : index | |
%40 = flow.dispatch.workload.ordinal %arg8, 2 : index | |
%41 = flow.dispatch.workload.ordinal %arg9, 3 : index | |
%42 = flow.dispatch.workload.ordinal %arg10, 4 : index | |
%43 = flow.dispatch.workload.ordinal %arg11, 5 : index | |
%44 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%38, %39} | |
%45 = flow.dispatch.tie_shape %arg4 : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%40, %41} | |
%46 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%42, %43} | |
%47 = flow.dispatch.tensor.load %44, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%38, %39, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%38, %39} -> tensor<?x?x8x4x16x2x8xi8> | |
%48 = flow.dispatch.tensor.load %45, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%40, %41, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%40, %41} -> tensor<?x?x4x2x4x16x2x8xi8> | |
%49 = flow.dispatch.tensor.load %46, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%42, %43, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%42, %43} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%50 = iree_gpu.multi_mma %47, %48, %49 {indexing_maps = [#map10, #map11, #map12], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
flow.dispatch.tensor.store %50, %46, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%42, %43, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%42, %43} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6, %arg7, %arg8 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%34 = flow.dispatch.workgroups[%25, %26](%33, %25, %26) : (tensor<?x?x8x4x2x4x16x4xi32>{%25, %26}, index, index) -> tensor<?x8x4x4x?x4x2x16xi32>{%25, %26} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?x8x4x2x4x16x4xi32>>, %arg4: index, %arg5: index, %arg6: !flow.dispatch.tensor<writeonly:tensor<?x8x4x4x?x4x2x16xi32>>) { | |
%38 = flow.dispatch.workload.ordinal %arg4, 0 : index | |
%39 = flow.dispatch.workload.ordinal %arg5, 1 : index | |
%40 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x2x4x16x4xi32>>{%38, %39} | |
%41 = flow.dispatch.tie_shape %arg6 : !flow.dispatch.tensor<writeonly:tensor<?x8x4x4x?x4x2x16xi32>>{%38, %39} | |
%42 = flow.dispatch.tensor.load %40, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%38, %39, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x2x4x16x4xi32>>{%38, %39} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%43 = tensor.empty(%38, %39) : tensor<?x8x4x4x?x4x2x16xi32> | |
%44 = linalg.generic {indexing_maps = [#map9, #map8], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%42 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%43 : tensor<?x8x4x4x?x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x8x4x4x?x4x2x16xi32> | |
flow.dispatch.tensor.store %44, %41, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%38, 8, 4, 4, %39, 4, 2, 16], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x8x4x4x?x4x2x16xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x8x4x4x?x4x2x16xi32>>{%38, %39} | |
flow.return | |
} count(%arg3: index, %arg4: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%35 = flow.tensor.reshape %34 : tensor<?x8x4x4x?x4x2x16xi32>{%25, %26} -> tensor<?x?xi32>{%27, %28} | |
%36 = flow.dispatch.workgroups[%27, %28, %6, %7](%35, %6, %7, %27, %28) : (tensor<?x?xi32>{%27, %28}, index, index, index, index) -> tensor<?x?xi32>{%6, %7} = | |
(%arg3: !flow.dispatch.tensor<readonly:tensor<?x?xi32>>, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>) { | |
%38 = flow.dispatch.workload.ordinal %arg4, 2 : index | |
%39 = flow.dispatch.workload.ordinal %arg5, 3 : index | |
%40 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%arg6, %arg7} | |
%41 = flow.dispatch.tie_shape %arg8 : !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>{%38, %39} | |
%42 = flow.dispatch.tensor.load %40, offsets = [0, 0], sizes = [%38, %39], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%arg6, %arg7} -> tensor<?x?xi32> | |
flow.dispatch.tensor.store %42, %41, offsets = [0, 0], sizes = [%38, %39], strides = [1, 1] : tensor<?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>{%38, %39} | |
flow.return | |
} count(%arg3: index, %arg4: index, %arg5: index, %arg6: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg3, %arg4, %arg5, %arg6 | |
flow.return %x, %y, %z : index, index, index | |
} | |
%37 = hal.tensor.export %36 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %37 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After OutlineDispatchRegionsPass (iree-flow-outline-dispatch-regions) //----- // | |
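// NOTE: iree-flow-outline-dispatch-regions moves each inline
// flow.dispatch.workgroups body into a private flow.executable (an export
// carrying the workgroup-count function plus the body as a func.func) and
// rewrites the call sites into flow.dispatch ops. The nine executables below
// map one-to-one onto the dispatches above.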
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d2, d3, d1, d4, d5, d6)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d5, d3, d4, d6)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d1, d5, d6, d7, d0, d2, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d6, d4, d5, d7)> | |
#map4 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d2, d3, d4, d1, d5, d6, d7)> | |
#map5 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)> | |
#map6 = affine_map<(d0, d1, d2) -> (d0, d2)> | |
#map7 = affine_map<(d0, d1, d2) -> (d1, d2)> | |
#map8 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
#map9 = affine_map<()[s0] -> (s0 ceildiv 128)> | |
#map10 = affine_map<()[s0] -> (s0 ceildiv 64)> | |
#map11 = affine_map<()[s0] -> ((s0 ceildiv 128) * 128)> | |
#map12 = affine_map<()[s0] -> ((s0 ceildiv 64) * 64)> | |
#device_target_hip = #hal.device.target<"hip", {legacy_sync}, [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
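  // foo_dispatch_0: copies the unpadded ?x?xi8 LHS into its zero-splat padded buffer.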
flow.executable private @foo_dispatch_0 { | |
flow.executable.export public @foo_dispatch_0 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @foo_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xi8>>, %arg1: !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>, %arg2: index, %arg3: index, %arg4: index, %arg5: index) { | |
%0 = flow.dispatch.workload.ordinal %arg2, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg3, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg4, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg5, 3 : index | |
%4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%0, %1} | |
%5 = flow.dispatch.tie_shape %arg1 : !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>{%2, %3} | |
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%0, %1} -> tensor<?x?xi8> | |
flow.dispatch.tensor.store %6, %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : tensor<?x?xi8> -> !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>{%2, %3} | |
return | |
} | |
} | |
} | |
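  // foo_dispatch_1: packs/swizzles the padded LHS into the ?x?x8x4x16x2x8 MFMA operand layout.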
flow.executable private @foo_dispatch_1 { | |
flow.executable.export public @foo_dispatch_1 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @foo_dispatch_1(%arg0: !flow.dispatch.tensor<readonly:tensor<?x8x16x?x2x4x8xi8>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x16x2x8xi8>>) { | |
%0 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%1 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%2 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x8x16x?x2x4x8xi8>>{%arg5, %arg6} | |
%3 = flow.dispatch.tie_shape %arg7 : !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x16x2x8xi8>>{%0, %1} | |
%4 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%5 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%4, 8, 16, %5, 2, 4, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x8x16x?x2x4x8xi8>>{%arg5, %arg6} -> tensor<?x8x16x?x2x4x8xi8> | |
%7 = tensor.empty(%0, %1) : tensor<?x?x8x4x16x2x8xi8> | |
%8 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%6 : tensor<?x8x16x?x2x4x8xi8>) outs(%7 : tensor<?x?x8x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x8x4x16x2x8xi8> | |
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%0, %1, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x16x2x8xi8> -> !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x16x2x8xi8>>{%0, %1} | |
return | |
} | |
} | |
} | |
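  // foo_dispatch_2: RHS pad copy (i8).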
flow.executable private @foo_dispatch_2 { | |
flow.executable.export public @foo_dispatch_2 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @foo_dispatch_2(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xi8>>, %arg1: !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>, %arg2: index, %arg3: index, %arg4: index, %arg5: index) { | |
%0 = flow.dispatch.workload.ordinal %arg2, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg3, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg4, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg5, 3 : index | |
%4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%0, %1} | |
%5 = flow.dispatch.tie_shape %arg1 : !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>{%2, %3} | |
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%0, %1} -> tensor<?x?xi8> | |
flow.dispatch.tensor.store %6, %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : tensor<?x?xi8> -> !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>{%2, %3} | |
return | |
} | |
} | |
} | |
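  // foo_dispatch_3: packs/swizzles the padded RHS into ?x?x4x2x4x16x2x8.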
flow.executable private @foo_dispatch_3 { | |
flow.executable.export public @foo_dispatch_3 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @foo_dispatch_3(%arg0: !flow.dispatch.tensor<readonly:tensor<?x2x4x8x?x4x2x16xi8>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: !flow.dispatch.tensor<writeonly:tensor<?x?x4x2x4x16x2x8xi8>>) { | |
%0 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%1 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%2 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x2x4x8x?x4x2x16xi8>>{%arg5, %arg6} | |
%3 = flow.dispatch.tie_shape %arg7 : !flow.dispatch.tensor<writeonly:tensor<?x?x4x2x4x16x2x8xi8>>{%0, %1} | |
%4 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%5 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%4, 2, 4, 8, %5, 4, 2, 16], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x2x4x8x?x4x2x16xi8>>{%arg5, %arg6} -> tensor<?x2x4x8x?x4x2x16xi8> | |
%7 = tensor.empty(%0, %1) : tensor<?x?x4x2x4x16x2x8xi8> | |
%8 = linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%6 : tensor<?x2x4x8x?x4x2x16xi8>) outs(%7 : tensor<?x?x4x2x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x4x2x4x16x2x8xi8> | |
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%0, %1, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x4x2x4x16x2x8xi8> -> !flow.dispatch.tensor<writeonly:tensor<?x?x4x2x4x16x2x8xi8>>{%0, %1} | |
return | |
} | |
} | |
} | |
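  // foo_dispatch_4: accumulator pad copy (i32).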
flow.executable private @foo_dispatch_4 { | |
flow.executable.export public @foo_dispatch_4 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @foo_dispatch_4(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xi32>>, %arg1: !flow.dispatch.tensor<readwrite:tensor<?x?xi32>>, %arg2: index, %arg3: index, %arg4: index, %arg5: index) { | |
%0 = flow.dispatch.workload.ordinal %arg2, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg3, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg4, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg5, 3 : index | |
%4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} | |
%5 = flow.dispatch.tie_shape %arg1 : !flow.dispatch.tensor<readwrite:tensor<?x?xi32>>{%2, %3} | |
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} -> tensor<?x?xi32> | |
flow.dispatch.tensor.store %6, %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : tensor<?x?xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?xi32>>{%2, %3} | |
return | |
} | |
} | |
} | |
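  // foo_dispatch_5: packs/swizzles the padded accumulator into ?x?x8x4x2x4x16x4.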
flow.executable private @foo_dispatch_5 { | |
flow.executable.export public @foo_dispatch_5 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @foo_dispatch_5(%arg0: !flow.dispatch.tensor<readonly:tensor<?x8x4x4x?x4x2x16xi32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x2x4x16x4xi32>>) { | |
%0 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%1 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%2 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x8x4x4x?x4x2x16xi32>>{%arg5, %arg6} | |
%3 = flow.dispatch.tie_shape %arg7 : !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x2x4x16x4xi32>>{%0, %1} | |
%4 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%5 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%4, 8, 4, 4, %5, 4, 2, 16], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x8x4x4x?x4x2x16xi32>>{%arg5, %arg6} -> tensor<?x8x4x4x?x4x2x16xi32> | |
%7 = tensor.empty(%0, %1) : tensor<?x?x8x4x2x4x16x4xi32> | |
%8 = linalg.generic {indexing_maps = [#map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%6 : tensor<?x8x4x4x?x4x2x16xi32>) outs(%7 : tensor<?x?x8x4x2x4x16x4xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x2x4x16x4xi32> | |
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%0, %1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x2x4x16x4xi32>>{%0, %1} | |
return | |
} | |
} | |
} | |
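  // foo_dispatch_6: the data-tiled multi_mma matmul itself.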
flow.executable private @foo_dispatch_6 { | |
flow.executable.export public @foo_dispatch_6 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5 | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @foo_dispatch_6(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>, %arg1: !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>, %arg2: !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { | |
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg4, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg5, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg6, 3 : index | |
%4 = flow.dispatch.workload.ordinal %arg7, 4 : index | |
%5 = flow.dispatch.workload.ordinal %arg8, 5 : index | |
%6 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%0, %1} | |
%7 = flow.dispatch.tie_shape %arg1 : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%2, %3} | |
%8 = flow.dispatch.tie_shape %arg2 : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%4, %5} | |
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%0, %1, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%0, %1} -> tensor<?x?x8x4x16x2x8xi8> | |
%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%2, %3, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%2, %3} -> tensor<?x?x4x2x4x16x2x8xi8> | |
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%4, %5, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%4, %5} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%12 = iree_gpu.multi_mma %9, %10, %11 {indexing_maps = [#map6, #map7, #map8], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%4, %5, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%4, %5} | |
return | |
} | |
} | |
} | |
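  // foo_dispatch_7: un-swizzles the result back to ?x8x4x4x?x4x2x16.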
flow.executable private @foo_dispatch_7 { | |
flow.executable.export public @foo_dispatch_7 workgroups(%arg0: index, %arg1: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1 | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @foo_dispatch_7(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x8x4x2x4x16x4xi32>>, %arg1: index, %arg2: index, %arg3: !flow.dispatch.tensor<writeonly:tensor<?x8x4x4x?x4x2x16xi32>>) { | |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%2 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x2x4x16x4xi32>>{%0, %1} | |
%3 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<writeonly:tensor<?x8x4x4x?x4x2x16xi32>>{%0, %1} | |
%4 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%0, %1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x2x4x16x4xi32>>{%0, %1} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%5 = tensor.empty(%0, %1) : tensor<?x8x4x4x?x4x2x16xi32> | |
%6 = linalg.generic {indexing_maps = [#map5, #map4], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%4 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%5 : tensor<?x8x4x4x?x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x8x4x4x?x4x2x16xi32> | |
flow.dispatch.tensor.store %6, %3, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%0, 8, 4, 4, %1, 4, 2, 16], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x8x4x4x?x4x2x16xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x8x4x4x?x4x2x16xi32>>{%0, %1} | |
return | |
} | |
} | |
} | |
flow.executable private @foo_dispatch_8 { | |
flow.executable.export public @foo_dispatch_8 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @foo_dispatch_8(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xi32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>) { | |
%0 = flow.dispatch.workload.ordinal %arg1, 2 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 3 : index | |
%2 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%arg3, %arg4} | |
%3 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>{%0, %1} | |
%4 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%arg3, %arg4} -> tensor<?x?xi32> | |
flow.dispatch.tensor.store %4, %3, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : tensor<?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>{%0, %1} | |
return | |
} | |
} | |
} | |
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%9 = affine.apply #map9()[%0] | |
%10 = affine.apply #map10()[%1] | |
%11 = affine.apply #map11()[%0] | |
%12 = affine.apply #map12()[%1] | |
%13 = flow.tensor.splat %c0_i8 : tensor<?x?xi8>{%11, %12} | |
%14 = flow.dispatch @foo_dispatch_0::@foo_dispatch_0[%0, %1, %11, %12](%2, %13, %0, %1, %11, %12) : (tensor<?x?xi8>{%0, %1}, tensor<?x?xi8>{%11, %12}, index, index, index, index) -> %13{%11, %12} | |
%15 = flow.tensor.reshape %14 : tensor<?x?xi8>{%11, %12} -> tensor<?x8x16x?x2x4x8xi8>{%9, %10} | |
%16 = flow.dispatch @foo_dispatch_1::@foo_dispatch_1[%9, %10, %9, %10](%15, %9, %10, %9, %10, %9, %10) : (tensor<?x8x16x?x2x4x8xi8>{%9, %10}, index, index, index, index, index, index) -> tensor<?x?x8x4x16x2x8xi8>{%9, %10} | |
%17 = affine.apply #map9()[%4] | |
%18 = affine.apply #map10()[%3] | |
%19 = affine.apply #map12()[%3] | |
%20 = affine.apply #map11()[%4] | |
%21 = flow.tensor.splat %c0_i8 : tensor<?x?xi8>{%19, %20} | |
%22 = flow.dispatch @foo_dispatch_2::@foo_dispatch_2[%3, %4, %19, %20](%5, %21, %3, %4, %19, %20) : (tensor<?x?xi8>{%3, %4}, tensor<?x?xi8>{%19, %20}, index, index, index, index) -> %21{%19, %20} | |
%23 = flow.tensor.reshape %22 : tensor<?x?xi8>{%19, %20} -> tensor<?x2x4x8x?x4x2x16xi8>{%18, %17} | |
%24 = flow.dispatch @foo_dispatch_3::@foo_dispatch_3[%18, %17, %17, %18](%23, %18, %17, %17, %18, %18, %17) : (tensor<?x2x4x8x?x4x2x16xi8>{%18, %17}, index, index, index, index, index, index) -> tensor<?x?x4x2x4x16x2x8xi8>{%17, %18} | |
%25 = affine.apply #map9()[%6] | |
%26 = affine.apply #map9()[%7] | |
%27 = affine.apply #map11()[%6] | |
%28 = affine.apply #map11()[%7] | |
%29 = flow.tensor.splat %c0_i32 : tensor<?x?xi32>{%27, %28} | |
%30 = flow.dispatch @foo_dispatch_4::@foo_dispatch_4[%6, %7, %27, %28](%8, %29, %6, %7, %27, %28) : (tensor<?x?xi32>{%6, %7}, tensor<?x?xi32>{%27, %28}, index, index, index, index) -> %29{%27, %28} | |
%31 = flow.tensor.reshape %30 : tensor<?x?xi32>{%27, %28} -> tensor<?x8x4x4x?x4x2x16xi32>{%25, %26} | |
%32 = flow.dispatch @foo_dispatch_5::@foo_dispatch_5[%25, %26, %25, %26](%31, %25, %26, %25, %26, %25, %26) : (tensor<?x8x4x4x?x4x2x16xi32>{%25, %26}, index, index, index, index, index, index) -> tensor<?x?x8x4x2x4x16x4xi32>{%25, %26} | |
%33 = flow.dispatch @foo_dispatch_6::@foo_dispatch_6[%9, %10, %17, %18, %25, %26](%16, %24, %32, %9, %10, %17, %18, %25, %26) : (tensor<?x?x8x4x16x2x8xi8>{%9, %10}, tensor<?x?x4x2x4x16x2x8xi8>{%17, %18}, tensor<?x?x8x4x2x4x16x4xi32>{%25, %26}, index, index, index, index, index, index) -> %32{%25, %26} | |
%34 = flow.dispatch @foo_dispatch_7::@foo_dispatch_7[%25, %26](%33, %25, %26) : (tensor<?x?x8x4x2x4x16x4xi32>{%25, %26}, index, index) -> tensor<?x8x4x4x?x4x2x16xi32>{%25, %26} | |
%35 = flow.tensor.reshape %34 : tensor<?x8x4x4x?x4x2x16xi32>{%25, %26} -> tensor<?x?xi32>{%27, %28} | |
%36 = flow.dispatch @foo_dispatch_8::@foo_dispatch_8[%27, %28, %6, %7](%35, %6, %7, %27, %28) : (tensor<?x?xi32>{%27, %28}, index, index, index, index) -> tensor<?x?xi32>{%6, %7} | |
%37 = hal.tensor.export %36 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %37 : !hal.buffer_view | |
} | |
} | |
// -----// IR Dump After AnnotateDispatchesPass (iree-flow-annotate-dispatches) //----- // | |
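// Note: relative to the previous dump, this pass appears to change only the dispatch entry-point
// names (and the matching flow.dispatch call sites), appending a suffix derived from each dispatch
// body, e.g. @foo_dispatch_0 -> @foo_dispatch_0_slow_memcpy and
// @foo_dispatch_1 -> @foo_dispatch_1_transpose_DxDx8x16x2x4x8_i8.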
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>, ukernels = "none"}> | |
#map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d2, d3, d1, d4, d5, d6)> | |
#map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d5, d3, d4, d6)> | |
#map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d1, d5, d6, d7, d0, d2, d3, d4)> | |
#map3 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d6, d4, d5, d7)> | |
#map4 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d2, d3, d4, d1, d5, d6, d7)> | |
#map5 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d5, d6, d3, d7, d4)> | |
#map6 = affine_map<(d0, d1, d2) -> (d0, d2)> | |
#map7 = affine_map<(d0, d1, d2) -> (d1, d2)> | |
#map8 = affine_map<(d0, d1, d2) -> (d0, d1)> | |
#map9 = affine_map<()[s0] -> (s0 ceildiv 128)> | |
#map10 = affine_map<()[s0] -> (s0 ceildiv 64)> | |
#map11 = affine_map<()[s0] -> ((s0 ceildiv 128) * 128)> | |
#map12 = affine_map<()[s0] -> ((s0 ceildiv 64) * 64)> | |
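// #map9/#map10 count 128- and 64-element tiles (ceildiv), while #map11/#map12 round the same
// sizes up to the next multiple, giving the zero-padded buffer sizes. For example, s0 = 200
// yields #map9 = ceildiv(200, 128) = 2 tiles and #map11 = 2 * 128 = 256 padded elements.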
#device_target_hip = #hal.device.target<"hip", {legacy_sync}, [#executable_target_rocm_hsaco_fb]> : !hal.device | |
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} { | |
util.global private @__device_0 = #device_target_hip | |
flow.executable private @foo_dispatch_0 { | |
flow.executable.export public @foo_dispatch_0_slow_memcpy workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @foo_dispatch_0_slow_memcpy(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xi8>>, %arg1: !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>, %arg2: index, %arg3: index, %arg4: index, %arg5: index) { | |
%0 = flow.dispatch.workload.ordinal %arg2, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg3, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg4, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg5, 3 : index | |
%4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%0, %1} | |
%5 = flow.dispatch.tie_shape %arg1 : !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>{%2, %3} | |
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%0, %1} -> tensor<?x?xi8> | |
flow.dispatch.tensor.store %6, %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : tensor<?x?xi8> -> !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>{%2, %3} | |
return | |
} | |
} | |
} | |
flow.executable private @foo_dispatch_1 { | |
flow.executable.export public @foo_dispatch_1_transpose_DxDx8x16x2x4x8_i8 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @foo_dispatch_1_transpose_DxDx8x16x2x4x8_i8(%arg0: !flow.dispatch.tensor<readonly:tensor<?x8x16x?x2x4x8xi8>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x16x2x8xi8>>) { | |
%0 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%1 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%2 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x8x16x?x2x4x8xi8>>{%arg5, %arg6} | |
%3 = flow.dispatch.tie_shape %arg7 : !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x16x2x8xi8>>{%0, %1} | |
%4 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%5 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%4, 8, 16, %5, 2, 4, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x8x16x?x2x4x8xi8>>{%arg5, %arg6} -> tensor<?x8x16x?x2x4x8xi8> | |
%7 = tensor.empty(%0, %1) : tensor<?x?x8x4x16x2x8xi8> | |
%8 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%6 : tensor<?x8x16x?x2x4x8xi8>) outs(%7 : tensor<?x?x8x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x8x4x16x2x8xi8> | |
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%0, %1, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x16x2x8xi8> -> !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x16x2x8xi8>>{%0, %1} | |
return | |
} | |
} | |
} | |
flow.executable private @foo_dispatch_2 { | |
flow.executable.export public @foo_dispatch_2_slow_memcpy workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @foo_dispatch_2_slow_memcpy(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xi8>>, %arg1: !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>, %arg2: index, %arg3: index, %arg4: index, %arg5: index) { | |
%0 = flow.dispatch.workload.ordinal %arg2, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg3, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg4, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg5, 3 : index | |
%4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%0, %1} | |
%5 = flow.dispatch.tie_shape %arg1 : !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>{%2, %3} | |
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%0, %1} -> tensor<?x?xi8> | |
flow.dispatch.tensor.store %6, %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : tensor<?x?xi8> -> !flow.dispatch.tensor<readwrite:tensor<?x?xi8>>{%2, %3} | |
return | |
} | |
} | |
} | |
flow.executable private @foo_dispatch_3 { | |
flow.executable.export public @foo_dispatch_3_transpose_DxDx4x2x16x2x4x8_i8 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @foo_dispatch_3_transpose_DxDx4x2x16x2x4x8_i8(%arg0: !flow.dispatch.tensor<readonly:tensor<?x2x4x8x?x4x2x16xi8>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: !flow.dispatch.tensor<writeonly:tensor<?x?x4x2x4x16x2x8xi8>>) { | |
%0 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%1 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%2 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x2x4x8x?x4x2x16xi8>>{%arg5, %arg6} | |
%3 = flow.dispatch.tie_shape %arg7 : !flow.dispatch.tensor<writeonly:tensor<?x?x4x2x4x16x2x8xi8>>{%0, %1} | |
%4 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%5 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%4, 2, 4, 8, %5, 4, 2, 16], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x2x4x8x?x4x2x16xi8>>{%arg5, %arg6} -> tensor<?x2x4x8x?x4x2x16xi8> | |
%7 = tensor.empty(%0, %1) : tensor<?x?x4x2x4x16x2x8xi8> | |
%8 = linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%6 : tensor<?x2x4x8x?x4x2x16xi8>) outs(%7 : tensor<?x?x4x2x4x16x2x8xi8>) { | |
^bb0(%in: i8, %out: i8): | |
linalg.yield %in : i8 | |
} -> tensor<?x?x4x2x4x16x2x8xi8> | |
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%0, %1, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x4x2x4x16x2x8xi8> -> !flow.dispatch.tensor<writeonly:tensor<?x?x4x2x4x16x2x8xi8>>{%0, %1} | |
return | |
} | |
} | |
} | |
flow.executable private @foo_dispatch_4 { | |
flow.executable.export public @foo_dispatch_4_slow_memcpy workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @foo_dispatch_4_slow_memcpy(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xi32>>, %arg1: !flow.dispatch.tensor<readwrite:tensor<?x?xi32>>, %arg2: index, %arg3: index, %arg4: index, %arg5: index) { | |
%0 = flow.dispatch.workload.ordinal %arg2, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg3, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg4, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg5, 3 : index | |
%4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} | |
%5 = flow.dispatch.tie_shape %arg1 : !flow.dispatch.tensor<readwrite:tensor<?x?xi32>>{%2, %3} | |
%6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} -> tensor<?x?xi32> | |
flow.dispatch.tensor.store %6, %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : tensor<?x?xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?xi32>>{%2, %3} | |
return | |
} | |
} | |
} | |
flow.executable private @foo_dispatch_5 { | |
flow.executable.export public @foo_dispatch_5_transpose_DxDx8x4x4x4x2x16_i32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @foo_dispatch_5_transpose_DxDx8x4x4x4x2x16_i32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x8x4x4x?x4x2x16xi32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x2x4x16x4xi32>>) { | |
%0 = flow.dispatch.workload.ordinal %arg3, 2 : index | |
%1 = flow.dispatch.workload.ordinal %arg4, 3 : index | |
%2 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x8x4x4x?x4x2x16xi32>>{%arg5, %arg6} | |
%3 = flow.dispatch.tie_shape %arg7 : !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x2x4x16x4xi32>>{%0, %1} | |
%4 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%5 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%4, 8, 4, 4, %5, 4, 2, 16], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x8x4x4x?x4x2x16xi32>>{%arg5, %arg6} -> tensor<?x8x4x4x?x4x2x16xi32> | |
%7 = tensor.empty(%0, %1) : tensor<?x?x8x4x2x4x16x4xi32> | |
%8 = linalg.generic {indexing_maps = [#map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%6 : tensor<?x8x4x4x?x4x2x16xi32>) outs(%7 : tensor<?x?x8x4x2x4x16x4xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x?x8x4x2x4x16x4xi32> | |
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%0, %1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x8x4x2x4x16x4xi32>>{%0, %1} | |
return | |
} | |
} | |
} | |
flow.executable private @foo_dispatch_6 { | |
flow.executable.export public @foo_dispatch_6 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3, %arg4, %arg5 | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @foo_dispatch_6(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>, %arg1: !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>, %arg2: !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index, %arg8: index) { | |
%0 = flow.dispatch.workload.ordinal %arg3, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg4, 1 : index | |
%2 = flow.dispatch.workload.ordinal %arg5, 2 : index | |
%3 = flow.dispatch.workload.ordinal %arg6, 3 : index | |
%4 = flow.dispatch.workload.ordinal %arg7, 4 : index | |
%5 = flow.dispatch.workload.ordinal %arg8, 5 : index | |
%6 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%0, %1} | |
%7 = flow.dispatch.tie_shape %arg1 : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%2, %3} | |
%8 = flow.dispatch.tie_shape %arg2 : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%4, %5} | |
%9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [%0, %1, 8, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x16x2x8xi8>>{%0, %1} -> tensor<?x?x8x4x16x2x8xi8> | |
%10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%2, %3, 4, 2, 4, 16, 2, 8], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x4x2x4x16x2x8xi8>>{%2, %3} -> tensor<?x?x4x2x4x16x2x8xi8> | |
%11 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%4, %5, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%4, %5} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%12 = iree_gpu.multi_mma %9, %10, %11 {indexing_maps = [#map6, #map7, #map8], iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>], kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_I32_16x16x32_I8, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>} : tensor<?x?x8x4x16x2x8xi8>, tensor<?x?x4x2x4x16x2x8xi8> into tensor<?x?x8x4x2x4x16x4xi32> | |
flow.dispatch.tensor.store %12, %8, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%4, %5, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x?x8x4x2x4x16x4xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x8x4x2x4x16x4xi32>>{%4, %5} | |
return | |
} | |
} | |
} | |
flow.executable private @foo_dispatch_7 { | |
flow.executable.export public @foo_dispatch_7_transpose_DxDx8x4x4x4x2x16_i32 workgroups(%arg0: index, %arg1: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1 | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @foo_dispatch_7_transpose_DxDx8x4x4x4x2x16_i32(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?x8x4x2x4x16x4xi32>>, %arg1: index, %arg2: index, %arg3: !flow.dispatch.tensor<writeonly:tensor<?x8x4x4x?x4x2x16xi32>>) { | |
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index | |
%2 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x2x4x16x4xi32>>{%0, %1} | |
%3 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor<writeonly:tensor<?x8x4x4x?x4x2x16xi32>>{%0, %1} | |
%4 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%0, %1, 8, 4, 2, 4, 16, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4x2x4x16x4xi32>>{%0, %1} -> tensor<?x?x8x4x2x4x16x4xi32> | |
%5 = tensor.empty(%0, %1) : tensor<?x8x4x4x?x4x2x16xi32> | |
%6 = linalg.generic {indexing_maps = [#map5, #map4], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%4 : tensor<?x?x8x4x2x4x16x4xi32>) outs(%5 : tensor<?x8x4x4x?x4x2x16xi32>) { | |
^bb0(%in: i32, %out: i32): | |
linalg.yield %in : i32 | |
} -> tensor<?x8x4x4x?x4x2x16xi32> | |
flow.dispatch.tensor.store %6, %3, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [%0, 8, 4, 4, %1, 4, 2, 16], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<?x8x4x4x?x4x2x16xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x8x4x4x?x4x2x16xi32>>{%0, %1} | |
return | |
} | |
} | |
} | |
flow.executable private @foo_dispatch_8 { | |
flow.executable.export public @foo_dispatch_8_slow_memcpy workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) { | |
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3 | |
flow.return %x, %y, %z : index, index, index | |
} | |
builtin.module { | |
func.func @foo_dispatch_8_slow_memcpy(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xi32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>) { | |
%0 = flow.dispatch.workload.ordinal %arg1, 2 : index | |
%1 = flow.dispatch.workload.ordinal %arg2, 3 : index | |
%2 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%arg3, %arg4} | |
%3 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>{%0, %1} | |
%4 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%arg3, %arg4} -> tensor<?x?xi32> | |
flow.dispatch.tensor.store %4, %3, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : tensor<?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>{%0, %1} | |
return | |
} | |
} | |
} | |
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xi8>, %input1: tensor<?x?xi8>, %input2: tensor<?x?xi32>) -> (%output0: tensor<?x?xi32>)"}} { | |
%c0_i32 = arith.constant 0 : i32 | |
%c0_i8 = arith.constant 0 : i8 | |
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index | |
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index | |
%2 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<?x?xi8>{%0, %1} | |
%3 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index | |
%4 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index | |
%5 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<?x?xi8>{%3, %4} | |
%6 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[0] : index | |
%7 = hal.buffer_view.dim<%arg2 : !hal.buffer_view>[1] : index | |
%8 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<?x?xi32>{%6, %7} | |
%9 = affine.apply #map9()[%0] | |
%10 = affine.apply #map10()[%1] | |
%11 = affine.apply #map11()[%0] | |
%12 = affine.apply #map12()[%1] | |
%13 = flow.tensor.splat %c0_i8 : tensor<?x?xi8>{%11, %12} | |
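    // Padding pattern: %13 is a zero splat at the tile-aligned size %11 x %12; the
    // @foo_dispatch_0_slow_memcpy dispatch below copies the original %0 x %1 input into it,
    // and the following flow.tensor.reshape + transpose dispatch relayout the padded data into
    // the tiled layout consumed by the multi_mma in @foo_dispatch_6. The same
    // splat/copy/reshape/transpose sequence repeats for the RHS and accumulator operands.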
%14 = flow.dispatch @foo_dispatch_0::@foo_dispatch_0_slow_memcpy[%0, %1, %11, %12](%2, %13, %0, %1, %11, %12) : (tensor<?x?xi8>{%0, %1}, tensor<?x?xi8>{%11, %12}, index, index, index, index) -> %13{%11, %12} | |
%15 = flow.tensor.reshape %14 : tensor<?x?xi8>{%11, %12} -> tensor<?x8x16x?x2x4x8xi8>{%9, %10} | |
%16 = flow.dispatch @foo_dispatch_1::@foo_dispatch_1_transpose_DxDx8x16x2x4x8_i8[%9, %10, %9, %10](%15, %9, %10, %9, %10, %9, %10) : (tensor<?x8x16x?x2x4x8xi8>{%9, %10}, index, index, index, index, index, index) -> tensor<?x?x8x4x16x2x8xi8>{%9, %10} | |
%17 = affine.apply #map9()[%4] | |
%18 = affine.apply #map10()[%3] | |
%19 = affine.apply #map12()[%3] | |
%20 = affine.apply #map11()[%4] | |
%21 = flow.tensor.splat %c0_i8 : tensor<?x?xi8>{%19, %20} | |
%22 = flow.dispatch @foo_dispatch_2::@foo_dispatch_2_slow_memcpy[%3, %4, %19, %20](%5, %21, %3, %4, %19, %20) : (tensor<?x?xi8>{%3, %4}, tensor<?x?xi8>{%19, %20}, index, index, index, index) -> %21{%19, %20} | |
%23 = flow.tensor.reshape %22 : tensor<?x?xi8>{%19, %20} -> tensor<?x2x4x8x?x4x2x16xi8>{%18, %17} | |
%24 = flow.dispatch @foo_dispatch_3::@foo_dispatch_3_transpose_DxDx4x2x16x2x4x8_i8[%18, %17, %17, %18](%23, %18, %17, %17, %18, %18, %17) : (tensor<?x2x4x8x?x4x2x16xi8>{%18, %17}, index, index, index, index, index, index) -> tensor<?x?x4x2x4x16x2x8xi8>{%17, %18} | |
%25 = affine.apply #map9()[%6] | |
%26 = affine.apply #map9()[%7] | |
%27 = affine.apply #map11()[%6] | |
%28 = affine.apply #map11()[%7] | |
%29 = flow.tensor.splat %c0_i32 : tensor<?x?xi32>{%27, %28} | |
%30 = flow.dispatch @foo_dispatch_4::@foo_dispatch_4_slow_memcpy[%6, %7, %27, %28](%8, %29, %6, %7, %27, %28) : (tensor<?x?xi32>{%6, %7}, tensor<?x?xi32>{%27, %28}, index, index, index, index) -> %29{%27, %28} | |
%31 = flow.tensor.reshape %30 : tensor<?x?xi32>{%27, %28} -> tensor<?x8x4x4x?x4x2x16xi32>{%25, %26} | |
%32 = flow.dispatch @foo_dispatch_5::@foo_dispatch_5_transpose_DxDx8x4x4x4x2x16_i32[%25, %26, %25, %26](%31, %25, %26, %25, %26, %25, %26) : (tensor<?x8x4x4x?x4x2x16xi32>{%25, %26}, index, index, index, index, index, index) -> tensor<?x?x8x4x2x4x16x4xi32>{%25, %26} | |
%33 = flow.dispatch @foo_dispatch_6::@foo_dispatch_6[%9, %10, %17, %18, %25, %26](%16, %24, %32, %9, %10, %17, %18, %25, %26) : (tensor<?x?x8x4x16x2x8xi8>{%9, %10}, tensor<?x?x4x2x4x16x2x8xi8>{%17, %18}, tensor<?x?x8x4x2x4x16x4xi32>{%25, %26}, index, index, index, index, index, index) -> %32{%25, %26} | |
%34 = flow.dispatch @foo_dispatch_7::@foo_dispatch_7_transpose_DxDx8x4x4x4x2x16_i32[%25, %26](%33, %25, %26) : (tensor<?x?x8x4x2x4x16x4xi32>{%25, %26}, index, index) -> tensor<?x8x4x4x?x4x2x16xi32>{%25, %26} | |
%35 = flow.tensor.reshape %34 : tensor<?x8x4x4x?x4x2x16xi32>{%25, %26} -> tensor<?x?xi32>{%27, %28} | |
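    // Unpadding: after the inverse transpose and reshape, @foo_dispatch_8_slow_memcpy extracts
    // the original %6 x %7 result from the tile-aligned %27 x %28 padded tensor.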
%36 = flow.dispatch @foo_dispatch_8::@foo_dispatch_8_slow_memcpy[%27, %28, %6, %7](%35, %6, %7, %27, %28) : (tensor<?x?xi32>{%27, %28}, index, index, index, index) -> tensor<?x?xi32>{%6, %7} | |
%37 = hal.tensor.export %36 "output0" : tensor<?x?xi32>{%6, %7} -> !hal.buffer_view | |
util.return %37 : !hal.buffer_view | |
} | |
} |