Skip to content

Instantly share code, notes, and snippets.

diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp
index 56f5a927cc..3959baec1b 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp
@@ -693,33 +693,22 @@ FailureOr<Value> MMAAttr::buildMmaOperation(OpBuilder &builder, Location loc,
case MMAIntrinsic::VMFMA_F32_16x16x32_F16:
case MMAIntrinsic::VMFMA_F32_32x32x16_F16: {
// Generate mfma's for K with unrolled kernels.
- const int64_t unrollKFactor = 2;
- auto [m, n, k] = getMNKShape();
commit 8bd7e0c21a5571928e3d918076dd598fc8d2f3b9
Author: Benoit Jacob <[email protected]>
Date: Thu Nov 14 05:51:31 2024 -0800
fix
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp
index a89f7fced8..9b661effb5 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp
commit f09ea33589816270efc12e9a5371da8b80cd6e2b
Author: Benoit Jacob <[email protected]>
Date: Thu Nov 14 02:11:18 2024 -0800
more fixes
Signed-off-by: Benoit Jacob <[email protected]>
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp
index 4406de2e9f..a89f7fced8 100644
@bjacob
bjacob / a.diff
Last active November 4, 2024 17:14
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp
@@ -987,6 +987,29 @@ LogicalResult DataTiledMMAAttr::populateOperandOffsetsSizesStrides(
return d.kind == TileSwizzle::Dim::Kind::CrossThread;
});
+ // Adjustment needed on RDNA3 where the same data is read by 2 threads and the
+ // intrinsic thread-grid is correspondingly 2x smaller than subgroup size.
+ // We can't recover that from the unrolled tileSwizzle, where these intrinsic
+ // level dimensions are mixed with expansion to multiple subgroups, so we have
@bjacob
bjacob / README.md
Created October 9, 2024 18:19
Early benchmarking of GPU data-tiled matmuls
@bjacob
bjacob / README.md
Last active October 7, 2024 15:16
Bad codegen for `vector<8xi8>` operands to MFMA intrinsics
tools/iree-compile /tmp/x/module_foo_dispatch_6.mlir --iree-hal-target-backends=rocm --iree-hip-target=gfx942 --iree-opt-data-tiling --iree-global-opt-experimental-rocm-data-tiling --iree-global-opt-enable-early-materialization=true -o /tmp/a.vmfb --compile-from=executable-sources -mlir-disable-threading -mlir-print-ir-after-all 2>/tmp/log
%1093 = extractelement <8 x i8> %785, i32 0
%1094 = zext i8 %1093 to i64
%1095 = shl i64 %1094, 0
%1096 = or i64 0, %1095
%1097 = extractelement <8 x i8> %785, i32 1
%1098 = zext i8 %1097 to i64
%1099 = shl i64 %1098, 8
%1100 = or i64 %1096, %1099
%1101 = extractelement <8 x i8> %785, i32 2
%1102 = zext i8 %1101 to i64
.text
.amdgcn_target "amdgcn-amd-amdhsa--gfx942"
.amdhsa_code_object_version 5
.globl foo_dispatch_6
.p2align 8
.type foo_dispatch_6,@function
foo_dispatch_6:
s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
.fill 63, 4, 0xbf800000 ; s_nop 0
s_load_dwordx2 s[20:21], s[0:1], 0x50
.text
.amdgcn_target "amdgcn-amd-amdhsa--gfx942"
.amdhsa_code_object_version 5
.globl foo_dispatch_6
.p2align 8
.type foo_dispatch_6,@function
foo_dispatch_6:
s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
.fill 63, 4, 0xbf800000 ; s_nop 0
s_load_dwordx2 s[20:21], s[0:1], 0x50
// -----// IR Dump After AutoInputConversionPipelinePass (iree-auto-input-conversion) //----- //
// Top-level module containing a single function, @foo.
module {
// @foo takes three dynamically shaped 2-D tensors: %arg0 and %arg1 have i8
// elements, %arg2 has i32 elements. It returns a tensor<?x?xi32>.
func.func @foo(%arg0: tensor<?x?xi8>, %arg1: tensor<?x?xi8>, %arg2: tensor<?x?xi32>) -> tensor<?x?xi32> {
// linalg.matmul with the two i8 tensors as `ins` and the i32 tensor as `outs`;
// the result type matches the `outs` operand type.
%0 = linalg.matmul ins(%arg0, %arg1 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%arg2 : tensor<?x?xi32>) -> tensor<?x?xi32>
// The matmul result is the function's only return value.
return %0 : tensor<?x?xi32>
}
}
// -----// IR Dump After IREEImportPublicPass (iree-import-public) //----- //