It's in this branch / single commit: https://github.com/iree-org/iree/compare/main...bjacob:iree:gpu_matmul_benchmarks
experimental/gpu_matmul_benchmarks/benchmark.sh
experimental/gpu_matmul_benchmarks/matmul_f16.mlir
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp | |
index 56f5a927cc..3959baec1b 100644 | |
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp | |
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp | |
@@ -693,33 +693,22 @@ FailureOr<Value> MMAAttr::buildMmaOperation(OpBuilder &builder, Location loc, | |
case MMAIntrinsic::VMFMA_F32_16x16x32_F16: | |
case MMAIntrinsic::VMFMA_F32_32x32x16_F16: { | |
// Generate mfma's for K with unrolled kernels. | |
- const int64_t unrollKFactor = 2; | |
- auto [m, n, k] = getMNKShape(); |
commit 8bd7e0c21a5571928e3d918076dd598fc8d2f3b9 | |
Author: Benoit Jacob <[email protected]> | |
Date: Thu Nov 14 05:51:31 2024 -0800 | |
fix | |
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp | |
index a89f7fced8..9b661effb5 100644 | |
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp | |
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp |
commit f09ea33589816270efc12e9a5371da8b80cd6e2b | |
Author: Benoit Jacob <[email protected]> | |
Date: Thu Nov 14 02:11:18 2024 -0800 | |
more fixes | |
Signed-off-by: Benoit Jacob <[email protected]> | |
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp | |
index 4406de2e9f..a89f7fced8 100644 |
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp | |
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp | |
@@ -987,6 +987,29 @@ LogicalResult DataTiledMMAAttr::populateOperandOffsetsSizesStrides( | |
return d.kind == TileSwizzle::Dim::Kind::CrossThread; | |
}); | |
+ // Adjustment needed on RDNA3 where the same data is read by 2 threads and the | |
+ // intrinsic thread-grid is correspondingly 2x smaller than subgroup size. | |
+ // We can't recover that from the unrolled tileSwizzle, where these intrinsic | |
+ // level dimensions are mixed with expansion to multiple subgroups, so we have |
It's in this branch / single commit: https://github.com/iree-org/iree/compare/main...bjacob:iree:gpu_matmul_benchmarks
experimental/gpu_matmul_benchmarks/benchmark.sh
experimental/gpu_matmul_benchmarks/matmul_f16.mlir
tools/iree-compile /tmp/x/module_foo_dispatch_6.mlir --iree-hal-target-backends=rocm --iree-hip-target=gfx942 --iree-opt-data-tiling --iree-global-opt-experimental-rocm-data-tiling --iree-global-opt-enable-early-materialization=true -o /tmp/a.vmfb --compile-from=executable-sources -mlir-disable-threading -mlir-print-ir-after-all 2>/tmp/log
%1093 = extractelement <8 x i8> %785, i32 0 | |
%1094 = zext i8 %1093 to i64 | |
%1095 = shl i64 %1094, 0 | |
%1096 = or i64 0, %1095 | |
%1097 = extractelement <8 x i8> %785, i32 1 | |
%1098 = zext i8 %1097 to i64 | |
%1099 = shl i64 %1098, 8 | |
%1100 = or i64 %1096, %1099 | |
%1101 = extractelement <8 x i8> %785, i32 2 | |
%1102 = zext i8 %1101 to i64 |
.text | |
.amdgcn_target "amdgcn-amd-amdhsa--gfx942" | |
.amdhsa_code_object_version 5 | |
.globl foo_dispatch_6 | |
.p2align 8 | |
.type foo_dispatch_6,@function | |
foo_dispatch_6: | |
s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. | |
.fill 63, 4, 0xbf800000 ; s_nop 0 | |
s_load_dwordx2 s[20:21], s[0:1], 0x50 |
.text | |
.amdgcn_target "amdgcn-amd-amdhsa--gfx942" | |
.amdhsa_code_object_version 5 | |
.globl foo_dispatch_6 | |
.p2align 8 | |
.type foo_dispatch_6,@function | |
foo_dispatch_6: | |
s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. | |
.fill 63, 4, 0xbf800000 ; s_nop 0 | |
s_load_dwordx2 s[20:21], s[0:1], 0x50 |
// -----// IR Dump After AutoInputConversionPipelinePass (iree-auto-input-conversion) //----- // | |
module { | |
func.func @foo(%arg0: tensor<?x?xi8>, %arg1: tensor<?x?xi8>, %arg2: tensor<?x?xi32>) -> tensor<?x?xi32> { | |
%0 = linalg.matmul ins(%arg0, %arg1 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%arg2 : tensor<?x?xi32>) -> tensor<?x?xi32> | |
return %0 : tensor<?x?xi32> | |
} | |
} | |
// -----// IR Dump After IREEImportPublicPass (iree-import-public) //----- // |