commit f664a86e16fe10fdd4bdac54b106cc86cc2592e8
Author: Benoit Jacob <[email protected]>
Date: Tue Jun 10 09:04:02 2025 -0700

    lower-barriers

    Signed-off-by: Benoit Jacob <[email protected]>

diff --git a/compiler/plugins/target/ROCM/builtins/tuning/iree_default_tuning_spec_gfx942.mlir b/compiler/plugins/target/ROCM/builtins/tuning/iree_default_tuning_spec_gfx942.mlir
index 53a82acdd7..a2e32b9620 100644
--- a/compiler/plugins/target/ROCM/builtins/tuning/iree_default_tuning_spec_gfx942.mlir
+++ b/compiler/plugins/target/ROCM/builtins/tuning/iree_default_tuning_spec_gfx942.mlir
@@ -136,8 +136,8 @@ util.func private @pingpong_large(%lhs_base: !in_ty, %rhs_base: !in_ty, %unused_
%lhs_vec_0 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c0, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp, vector<8x1x1x4xf16>
%rhs_vec_0 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c0, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp, vector<4x1x1x4xf16>
- gpu.barrier
- rocdl.sched.barrier 0
+ rocdl.s.barrier
+ rocdl.sched.barrier 6 // Only SALU and VALU can be reordered across. Not MFMA, not memory accesses.
rocdl.s.setprio 1 { iree_gpu.swap_mfma = 1 }
%dot0 = iree_gpu.multi_mma %lhs_vec_0, %rhs_vec_0, %iter {
@@ -147,8 +147,8 @@ util.func private @pingpong_large(%lhs_base: !in_ty, %rhs_base: !in_ty, %unused_
} : vector<8x1x1x4xf16>, vector<4x1x1x4xf16> into vector<8x4x1x4xf32>
rocdl.s.setprio 0
- gpu.barrier
- rocdl.sched.barrier 0
+ rocdl.s.barrier
+ rocdl.sched.barrier 6 // Only SALU and VALU can be reordered across. Not MFMA, not memory accesses.
// Global loads of rhs.
%rhs_block = tensor.extract_slice %rhs [0, %i] [256, 64] [1, 1] : !in_ty to !block_in
@@ -164,8 +164,8 @@ util.func private @pingpong_large(%lhs_base: !in_ty, %rhs_base: !in_ty, %unused_
%lhs_vec_1 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c1, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp, vector<8x1x1x4xf16>
%rhs_vec_1 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c1, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp, vector<4x1x1x4xf16>
- gpu.barrier
- rocdl.sched.barrier 0
+ rocdl.s.barrier
+ rocdl.sched.barrier 6 // Only SALU and VALU can be reordered across. Not MFMA, not memory accesses.
rocdl.s.setprio 1 { iree_gpu.swap_mfma = 1 }
%dot1 = iree_gpu.multi_mma %lhs_vec_1, %rhs_vec_1, %dot0 {
@@ -175,8 +175,8 @@ util.func private @pingpong_large(%lhs_base: !in_ty, %rhs_base: !in_ty, %unused_
} : vector<8x1x1x4xf16>, vector<4x1x1x4xf16> into vector<8x4x1x4xf32>
rocdl.s.setprio 0
- gpu.barrier
- rocdl.sched.barrier 0
+ rocdl.s.barrier
+ rocdl.sched.barrier 6 // Only SALU and VALU can be reordered across. Not MFMA, not memory accesses.
%lhs_vec_2 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c2, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp, vector<8x1x1x4xf16>
%rhs_vec_2 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c2, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp, vector<4x1x1x4xf16>
@@ -184,8 +184,8 @@ util.func private @pingpong_large(%lhs_base: !in_ty, %rhs_base: !in_ty, %unused_
%lhs_vec_3 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c3, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp, vector<8x1x1x4xf16>
%rhs_vec_3 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c3, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp, vector<4x1x1x4xf16>
- gpu.barrier
- rocdl.sched.barrier 0
+ rocdl.s.barrier
+ rocdl.sched.barrier 6 // Only SALU and VALU can be reordered across. Not MFMA, not memory accesses.
rocdl.s.setprio 1 { iree_gpu.swap_mfma = 1 }
%dot2 = iree_gpu.multi_mma %lhs_vec_2, %rhs_vec_2, %dot1 {
@@ -195,8 +195,8 @@ util.func private @pingpong_large(%lhs_base: !in_ty, %rhs_base: !in_ty, %unused_
} : vector<8x1x1x4xf16>, vector<4x1x1x4xf16> into vector<8x4x1x4xf32>
rocdl.s.setprio 0
- gpu.barrier
- rocdl.sched.barrier 0
+ amdgpu.lds_barrier
+ rocdl.sched.barrier 6 // Only SALU and VALU can be reordered across. Not MFMA, not memory accesses.
vector.transfer_write %lhs_vec_local_0, %lhs_shared [%glb0, %gko] {in_bounds = [true, true]} : vector<1x8xf16>, !shared
vector.transfer_write %lhs_vec_local_1, %lhs_shared [%glb1, %gko] {in_bounds = [true, true]} : vector<1x8xf16>, !shared
@@ -208,8 +208,8 @@ util.func private @pingpong_large(%lhs_base: !in_ty, %rhs_base: !in_ty, %unused_
vector.transfer_write %rhs_vec_local_2, %rhs_shared [%glb2, %gko] {in_bounds = [true, true]} : vector<1x8xf16>, !shared
vector.transfer_write %rhs_vec_local_3, %rhs_shared [%glb3, %gko] {in_bounds = [true, true]} : vector<1x8xf16>, !shared
- gpu.barrier
- rocdl.sched.barrier 0
+ rocdl.s.barrier
+ rocdl.sched.barrier 6 // Only SALU and VALU can be reordered across. Not MFMA, not memory accesses.
rocdl.s.setprio 1 { iree_gpu.swap_mfma = 1 }
%dot3 = iree_gpu.multi_mma %lhs_vec_3, %rhs_vec_3, %dot2 {
@@ -219,8 +219,8 @@ util.func private @pingpong_large(%lhs_base: !in_ty, %rhs_base: !in_ty, %unused_
} : vector<8x1x1x4xf16>, vector<4x1x1x4xf16> into vector<8x4x1x4xf32>
rocdl.s.setprio 0
- gpu.barrier
- rocdl.sched.barrier 0
+ amdgpu.lds_barrier
+ rocdl.sched.barrier 6 // Only SALU and VALU can be reordered across. Not MFMA, not memory accesses.
scf.yield %dot3 : vector<8x4x1x4xf32>
}
@@ -357,8 +357,8 @@ util.func private @pingpong_large_expanded(%lhs_base: !exp_in_ty, %rhs_base: !in
%lhs_vec_0 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c0, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp, vector<8x1x1x4xf16>
%rhs_vec_0 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c0, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp, vector<4x1x1x4xf16>
- gpu.barrier
- rocdl.sched.barrier 0
+ rocdl.s.barrier
+ rocdl.sched.barrier 6 // Only SALU and VALU can be reordered across. Not MFMA, not memory accesses.
rocdl.s.setprio 1 { iree_gpu.swap_mfma = 1 }
%dot0 = iree_gpu.multi_mma %lhs_vec_0, %rhs_vec_0, %iter {
@@ -368,8 +368,8 @@ util.func private @pingpong_large_expanded(%lhs_base: !exp_in_ty, %rhs_base: !in
} : vector<8x1x1x4xf16>, vector<4x1x1x4xf16> into vector<8x4x1x4xf32>
rocdl.s.setprio 0
- gpu.barrier
- rocdl.sched.barrier 0
+ rocdl.s.barrier
+ rocdl.sched.barrier 6 // Only SALU and VALU can be reordered across. Not MFMA, not memory accesses.
// Global loads of rhs.
%rhs_block = tensor.extract_slice %rhs [0, %i] [256, 64] [1, 1] : !in_ty to !block_in
@@ -385,8 +385,8 @@ util.func private @pingpong_large_expanded(%lhs_base: !exp_in_ty, %rhs_base: !in
%lhs_vec_1 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c1, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp, vector<8x1x1x4xf16>
%rhs_vec_1 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c1, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp, vector<4x1x1x4xf16>
- gpu.barrier
- rocdl.sched.barrier 0
+ rocdl.s.barrier
+ rocdl.sched.barrier 6 // Only SALU and VALU can be reordered across. Not MFMA, not memory accesses.
rocdl.s.setprio 1 { iree_gpu.swap_mfma = 1 }
%dot1 = iree_gpu.multi_mma %lhs_vec_1, %rhs_vec_1, %dot0 {
@@ -396,8 +396,8 @@ util.func private @pingpong_large_expanded(%lhs_base: !exp_in_ty, %rhs_base: !in
} : vector<8x1x1x4xf16>, vector<4x1x1x4xf16> into vector<8x4x1x4xf32>
rocdl.s.setprio 0
- gpu.barrier
- rocdl.sched.barrier 0
+ rocdl.s.barrier
+ rocdl.sched.barrier 6 // Only SALU and VALU can be reordered across. Not MFMA, not memory accesses.
%lhs_vec_2 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c2, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp, vector<8x1x1x4xf16>
%rhs_vec_2 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c2, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp, vector<4x1x1x4xf16>
@@ -405,8 +405,8 @@ util.func private @pingpong_large_expanded(%lhs_base: !exp_in_ty, %rhs_base: !in
%lhs_vec_3 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c3, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp, vector<8x1x1x4xf16>
%rhs_vec_3 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c3, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp, vector<4x1x1x4xf16>
- gpu.barrier
- rocdl.sched.barrier 0
+ rocdl.s.barrier
+ rocdl.sched.barrier 6 // Only SALU and VALU can be reordered across. Not MFMA, not memory accesses.
rocdl.s.setprio 1 { iree_gpu.swap_mfma = 1 }
%dot2 = iree_gpu.multi_mma %lhs_vec_2, %rhs_vec_2, %dot1 {
@@ -416,8 +416,8 @@ util.func private @pingpong_large_expanded(%lhs_base: !exp_in_ty, %rhs_base: !in
} : vector<8x1x1x4xf16>, vector<4x1x1x4xf16> into vector<8x4x1x4xf32>
rocdl.s.setprio 0
- gpu.barrier
- rocdl.sched.barrier 0
+ amdgpu.lds_barrier
+ rocdl.sched.barrier 6 // Only SALU and VALU can be reordered across. Not MFMA, not memory accesses.
vector.transfer_write %lhs_vec_local_0, %lhs_shared [%glb0, %gko] {in_bounds = [true, true]} : vector<1x8xf16>, !shared
vector.transfer_write %lhs_vec_local_1, %lhs_shared [%glb1, %gko] {in_bounds = [true, true]} : vector<1x8xf16>, !shared
@@ -429,8 +429,8 @@ util.func private @pingpong_large_expanded(%lhs_base: !exp_in_ty, %rhs_base: !in
vector.transfer_write %rhs_vec_local_2, %rhs_shared [%glb2, %gko] {in_bounds = [true, true]} : vector<1x8xf16>, !shared
vector.transfer_write %rhs_vec_local_3, %rhs_shared [%glb3, %gko] {in_bounds = [true, true]} : vector<1x8xf16>, !shared
- gpu.barrier
- rocdl.sched.barrier 0
+ rocdl.s.barrier
+ rocdl.sched.barrier 6 // Only SALU and VALU can be reordered across. Not MFMA, not memory accesses.
rocdl.s.setprio 1 { iree_gpu.swap_mfma = 1 }
%dot3 = iree_gpu.multi_mma %lhs_vec_3, %rhs_vec_3, %dot2 {
@@ -440,8 +440,8 @@ util.func private @pingpong_large_expanded(%lhs_base: !exp_in_ty, %rhs_base: !in
} : vector<8x1x1x4xf16>, vector<4x1x1x4xf16> into vector<8x4x1x4xf32>
rocdl.s.setprio 0
- gpu.barrier
- rocdl.sched.barrier 0
+ amdgpu.lds_barrier
+ rocdl.sched.barrier 6 // Only SALU and VALU can be reordered across. Not MFMA, not memory accesses.
scf.yield %dot3 : vector<8x4x1x4xf32>
}
@@ -584,8 +584,8 @@ util.func private @pingpong_large_f8_expanded(%lhs_base: !exp_in_ty_f8, %rhs_bas
%lhs_vec_0 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c0, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<8x1x1x8xf8E4M3FNUZ>
%rhs_vec_0 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c0, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<4x1x1x8xf8E4M3FNUZ>
- gpu.barrier
- rocdl.sched.barrier 0
+ rocdl.s.barrier
+ rocdl.sched.barrier 6 // Only SALU and VALU can be reordered across. Not MFMA, not memory accesses.
rocdl.s.setprio 1 { iree_gpu.swap_mfma = 1 }
%dot0 = iree_gpu.multi_mma %lhs_vec_0, %rhs_vec_0, %iter {
@@ -595,8 +595,8 @@ util.func private @pingpong_large_f8_expanded(%lhs_base: !exp_in_ty_f8, %rhs_bas
} : vector<8x1x1x8xf8E4M3FNUZ>, vector<4x1x1x8xf8E4M3FNUZ> into vector<8x4x1x4xf32>
rocdl.s.setprio 0
- gpu.barrier
- rocdl.sched.barrier 0
+ rocdl.s.barrier
+ rocdl.sched.barrier 6 // Only SALU and VALU can be reordered across. Not MFMA, not memory accesses.
// Global loads of rhs.
%rhs_block = tensor.extract_slice %rhs [0, %i] [256, 128] [1, 1] : !in_ty_f8 to !block_in_f8
@@ -612,8 +612,8 @@ util.func private @pingpong_large_f8_expanded(%lhs_base: !exp_in_ty_f8, %rhs_bas
%lhs_vec_1 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c1, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<8x1x1x8xf8E4M3FNUZ>
%rhs_vec_1 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c1, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<4x1x1x8xf8E4M3FNUZ>
- gpu.barrier
- rocdl.sched.barrier 0
+ rocdl.s.barrier
+ rocdl.sched.barrier 6 // Only SALU and VALU can be reordered across. Not MFMA, not memory accesses.
rocdl.s.setprio 1 { iree_gpu.swap_mfma = 1 }
%dot1 = iree_gpu.multi_mma %lhs_vec_1, %rhs_vec_1, %dot0 {
@@ -623,8 +623,8 @@ util.func private @pingpong_large_f8_expanded(%lhs_base: !exp_in_ty_f8, %rhs_bas
} : vector<8x1x1x8xf8E4M3FNUZ>, vector<4x1x1x8xf8E4M3FNUZ> into vector<8x4x1x4xf32>
rocdl.s.setprio 0
- gpu.barrier
- rocdl.sched.barrier 0
+ rocdl.s.barrier
+ rocdl.sched.barrier 6 // Only SALU and VALU can be reordered across. Not MFMA, not memory accesses.
%lhs_vec_2 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c2, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<8x1x1x8xf8E4M3FNUZ>
%rhs_vec_2 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c2, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<4x1x1x8xf8E4M3FNUZ>
@@ -632,8 +632,8 @@ util.func private @pingpong_large_f8_expanded(%lhs_base: !exp_in_ty_f8, %rhs_bas
%lhs_vec_3 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c3, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<8x1x1x8xf8E4M3FNUZ>
%rhs_vec_3 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c3, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<4x1x1x8xf8E4M3FNUZ>
- gpu.barrier
- rocdl.sched.barrier 0
+ rocdl.s.barrier
+ rocdl.sched.barrier 6 // Only SALU and VALU can be reordered across. Not MFMA, not memory accesses.
rocdl.s.setprio 1 { iree_gpu.swap_mfma = 1 }
%dot2 = iree_gpu.multi_mma %lhs_vec_2, %rhs_vec_2, %dot1 {
@@ -643,8 +643,8 @@ util.func private @pingpong_large_f8_expanded(%lhs_base: !exp_in_ty_f8, %rhs_bas
} : vector<8x1x1x8xf8E4M3FNUZ>, vector<4x1x1x8xf8E4M3FNUZ> into vector<8x4x1x4xf32>
rocdl.s.setprio 0
- gpu.barrier
- rocdl.sched.barrier 0
+ amdgpu.lds_barrier
+ rocdl.sched.barrier 6 // Only SALU and VALU can be reordered across. Not MFMA, not memory accesses.
vector.transfer_write %lhs_vec_local_0, %lhs_shared [%glb0, %gko] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, !shared_f8
vector.transfer_write %lhs_vec_local_1, %lhs_shared [%glb1, %gko] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, !shared_f8
@@ -656,8 +656,8 @@ util.func private @pingpong_large_f8_expanded(%lhs_base: !exp_in_ty_f8, %rhs_bas
vector.transfer_write %rhs_vec_local_2, %rhs_shared [%glb2, %gko] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, !shared_f8
vector.transfer_write %rhs_vec_local_3, %rhs_shared [%glb3, %gko] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, !shared_f8
- gpu.barrier
- rocdl.sched.barrier 0
+ rocdl.s.barrier
+ rocdl.sched.barrier 6 // Only SALU and VALU can be reordered across. Not MFMA, not memory accesses.
rocdl.s.setprio 1 { iree_gpu.swap_mfma = 1 }
%dot3 = iree_gpu.multi_mma %lhs_vec_3, %rhs_vec_3, %dot2 {
@@ -667,8 +667,8 @@ util.func private @pingpong_large_f8_expanded(%lhs_base: !exp_in_ty_f8, %rhs_bas
} : vector<8x1x1x8xf8E4M3FNUZ>, vector<4x1x1x8xf8E4M3FNUZ> into vector<8x4x1x4xf32>
rocdl.s.setprio 0
- gpu.barrier
- rocdl.sched.barrier 0
+ amdgpu.lds_barrier
+ rocdl.sched.barrier 6 // Only SALU and VALU can be reordered across. Not MFMA, not memory accesses.
scf.yield %dot3 : vector<8x4x1x4xf32>
}
@@ -799,7 +799,7 @@ util.func private @pingpong_medium_expanded(%lhs_base: !mexp_in_ty, %rhs_base: !
%lhs_vec_0_t = vector.transpose %lhs_vec_0, [0, 2, 1, 3] : vector<4x1x2x4xf16> to vector<4x2x1x4xf16>
%rhs_vec_0_t = vector.transpose %rhs_vec_0, [0, 2, 1, 3] : vector<4x1x2x4xf16> to vector<4x2x1x4xf16>
- rocdl.sched.barrier 0
+ rocdl.sched.barrier 6 // Only SALU and VALU can be reordered across. Not MFMA, not memory accesses.
// Global loads of rhs.
%rhs_block = tensor.extract_slice %rhs [0, %i] [256, 64] [1, 1] : !in_ty to !block_in
@@ -812,14 +812,14 @@ util.func private @pingpong_medium_expanded(%lhs_base: !mexp_in_ty, %rhs_base: !
%rhs_thread_3 = tensor.extract_slice %rhs_block [%glb3, %gko] [1, 8] [1, 1] : !block_in to tensor<1x8xf16>
%rhs_vec_local_3 = vector.transfer_read %rhs_thread_3 [%c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x8xf16>, vector<1x8xf16>
- rocdl.sched.barrier 0
+ rocdl.sched.barrier 6 // Only SALU and VALU can be reordered across. Not MFMA, not memory accesses.
%lhs_vec_2 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c2, %inner_id], %cst {in_bounds = [true, true, true, true]} : !mshared_exp, vector<4x1x2x4xf16>
%rhs_vec_2 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c2, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp, vector<4x1x2x4xf16>
%lhs_vec_2_t = vector.transpose %lhs_vec_2, [0, 2, 1, 3] : vector<4x1x2x4xf16> to vector<4x2x1x4xf16>
%rhs_vec_2_t = vector.transpose %rhs_vec_2, [0, 2, 1, 3] : vector<4x1x2x4xf16> to vector<4x2x1x4xf16>
- rocdl.sched.barrier 0
+ rocdl.sched.barrier 6 // Only SALU and VALU can be reordered across. Not MFMA, not memory accesses.
// Global loads of lhs.
%lhs_block = tensor.extract_slice %lhs [0, 0, %i] [1, 128, 64] [1, 1, 1] : !mexp_in_ty to !mexp_block_in
@@ -828,8 +828,8 @@ util.func private @pingpong_medium_expanded(%lhs_base: !mexp_in_ty, %rhs_base: !
%lhs_thread_1 = tensor.extract_slice %lhs_block [0, %glb1, %gko] [1, 1, 8] [1, 1, 1] : !mexp_block_in to tensor<1x1x8xf16>
%lhs_vec_local_1 = vector.transfer_read %lhs_thread_1 [%c0, %c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x1x8xf16>, vector<1x8xf16>
- gpu.barrier
- rocdl.sched.barrier 0
+ rocdl.s.barrier
+ rocdl.sched.barrier 6 // Only SALU and VALU can be reordered across. Not MFMA, not memory accesses.
rocdl.s.setprio 1 { iree_gpu.swap_mfma = 1 }
%dot0 = iree_gpu.multi_mma %lhs_vec_0_t, %rhs_vec_0_t, %iter {
@@ -839,8 +839,8 @@ util.func private @pingpong_medium_expanded(%lhs_base: !mexp_in_ty, %rhs_base: !
} : vector<4x2x1x4xf16>, vector<4x2x1x4xf16> into vector<4x4x1x4xf32>
rocdl.s.setprio 0
- gpu.barrier
- rocdl.sched.barrier 0
+ rocdl.s.barrier
+ rocdl.sched.barrier 6 // Only SALU and VALU can be reordered across. Not MFMA, not memory accesses.
vector.transfer_write %rhs_vec_local_0, %rhs_shared [%glb0, %gko] {in_bounds = [true, true]} : vector<1x8xf16>, !shared
vector.transfer_write %rhs_vec_local_1, %rhs_shared [%glb1, %gko] {in_bounds = [true, true]} : vector<1x8xf16>, !shared
@@ -850,8 +850,8 @@ util.func private @pingpong_medium_expanded(%lhs_base: !mexp_in_ty, %rhs_base: !
vector.transfer_write %lhs_vec_local_0, %lhs_shared [%glb0, %gko] {in_bounds = [true, true]} : vector<1x8xf16>, !mshared
vector.transfer_write %lhs_vec_local_1, %lhs_shared [%glb1, %gko] {in_bounds = [true, true]} : vector<1x8xf16>, !mshared
- gpu.barrier
- rocdl.sched.barrier 0
+ amdgpu.lds_barrier
+ rocdl.sched.barrier 6 // Only SALU and VALU can be reordered across. Not MFMA, not memory accesses.
rocdl.s.setprio 1 { iree_gpu.swap_mfma = 1 }
%dot2 = iree_gpu.multi_mma %lhs_vec_2_t, %rhs_vec_2_t, %dot0 {
@@ -861,8 +861,8 @@ util.func private @pingpong_medium_expanded(%lhs_base: !mexp_in_ty, %rhs_base: !
} : vector<4x2x1x4xf16>, vector<4x2x1x4xf16> into vector<4x4x1x4xf32>
rocdl.s.setprio 0
- gpu.barrier
- rocdl.sched.barrier 0
+ amdgpu.lds_barrier
+ rocdl.sched.barrier 6 // Only SALU and VALU can be reordered across. Not MFMA, not memory accesses.
scf.yield %dot2 : vector<4x4x1x4xf32>
}
@@ -993,7 +993,7 @@ util.func private @pingpong_medium_f8_expanded(%lhs_base: !mexp_in_ty_f8, %rhs_b
%lhs_vec_0_t = vector.transpose %lhs_vec_0, [0, 2, 1, 3] : vector<4x1x2x8xf8E4M3FNUZ> to vector<4x2x1x8xf8E4M3FNUZ>
%rhs_vec_0_t = vector.transpose %rhs_vec_0, [0, 2, 1, 3] : vector<4x1x2x8xf8E4M3FNUZ> to vector<4x2x1x8xf8E4M3FNUZ>
- rocdl.sched.barrier 0
+ rocdl.sched.barrier 6 // Only SALU and VALU can be reordered across. Not MFMA, not memory accesses.
// Global loads of rhs.
%rhs_block = tensor.extract_slice %rhs [0, %i] [256, 128] [1, 1] : !in_ty_f8 to !block_in_f8
@@ -1006,14 +1006,14 @@ util.func private @pingpong_medium_f8_expanded(%lhs_base: !mexp_in_ty_f8, %rhs_b
%rhs_thread_3 = tensor.extract_slice %rhs_block [%glb3, %gko] [1, 16] [1, 1] : !block_in_f8 to tensor<1x16xf8E4M3FNUZ>
%rhs_vec_local_3 = vector.transfer_read %rhs_thread_3 [%c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x16xf8E4M3FNUZ>, vector<1x16xf8E4M3FNUZ>
- rocdl.sched.barrier 0
+ rocdl.sched.barrier 6 // Only SALU and VALU can be reordered across. Not MFMA, not memory accesses.
%lhs_vec_2 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c2, %inner_id], %cst {in_bounds = [true, true, true, true]} : !mshared_exp_f8, vector<4x1x2x8xf8E4M3FNUZ>
%rhs_vec_2 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c2, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<4x1x2x8xf8E4M3FNUZ>
%lhs_vec_2_t = vector.transpose %lhs_vec_2, [0, 2, 1, 3] : vector<4x1x2x8xf8E4M3FNUZ> to vector<4x2x1x8xf8E4M3FNUZ>
%rhs_vec_2_t = vector.transpose %rhs_vec_2, [0, 2, 1, 3] : vector<4x1x2x8xf8E4M3FNUZ> to vector<4x2x1x8xf8E4M3FNUZ>
- rocdl.sched.barrier 0
+ rocdl.sched.barrier 6 // Only SALU and VALU can be reordered across. Not MFMA, not memory accesses.
// Global loads of lhs.
%lhs_block = tensor.extract_slice %lhs [0, 0, %i] [1, 128, 128] [1, 1, 1] : !mexp_in_ty_f8 to !mexp_block_in_f8
@@ -1022,8 +1022,8 @@ util.func private @pingpong_medium_f8_expanded(%lhs_base: !mexp_in_ty_f8, %rhs_b
%lhs_thread_1 = tensor.extract_slice %lhs_block [0, %glb1_lhs, %gko] [1, 1, 16] [1, 1, 1] : !mexp_block_in_f8 to tensor<1x1x16xf8E4M3FNUZ>
%lhs_vec_local_1 = vector.transfer_read %lhs_thread_1 [%c0, %c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x1x16xf8E4M3FNUZ>, vector<1x16xf8E4M3FNUZ>
- gpu.barrier
- rocdl.sched.barrier 0
+ rocdl.s.barrier
+ rocdl.sched.barrier 6 // Only SALU and VALU can be reordered across. Not MFMA, not memory accesses.
rocdl.s.setprio 1 { iree_gpu.swap_mfma = 1 }
%dot0 = iree_gpu.multi_mma %lhs_vec_0_t, %rhs_vec_0_t, %iter {
@@ -1033,8 +1033,8 @@ util.func private @pingpong_medium_f8_expanded(%lhs_base: !mexp_in_ty_f8, %rhs_b
} : vector<4x2x1x8xf8E4M3FNUZ>, vector<4x2x1x8xf8E4M3FNUZ> into vector<4x4x1x4xf32>
rocdl.s.setprio 0
- gpu.barrier
- rocdl.sched.barrier 0
+ rocdl.s.barrier
+ rocdl.sched.barrier 6 // Only SALU and VALU can be reordered across. Not MFMA, not memory accesses.
vector.transfer_write %rhs_vec_local_0, %rhs_shared [%glb0, %gko] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, !shared_f8
vector.transfer_write %rhs_vec_local_1, %rhs_shared [%glb1, %gko] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, !shared_f8
@@ -1044,8 +1044,8 @@ util.func private @pingpong_medium_f8_expanded(%lhs_base: !mexp_in_ty_f8, %rhs_b
vector.transfer_write %lhs_vec_local_0, %lhs_shared [%glb0_lhs, %gko] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, !mshared_f8
vector.transfer_write %lhs_vec_local_1, %lhs_shared [%glb1_lhs, %gko] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, !mshared_f8
- gpu.barrier
- rocdl.sched.barrier 0
+ amdgpu.lds_barrier
+ rocdl.sched.barrier 6 // Only SALU and VALU can be reordered across. Not MFMA, not memory accesses.
rocdl.s.setprio 1 { iree_gpu.swap_mfma = 1 }
%dot2 = iree_gpu.multi_mma %lhs_vec_2_t, %rhs_vec_2_t, %dot0 {
@@ -1055,8 +1055,8 @@ util.func private @pingpong_medium_f8_expanded(%lhs_base: !mexp_in_ty_f8, %rhs_b
} : vector<4x2x1x8xf8E4M3FNUZ>, vector<4x2x1x8xf8E4M3FNUZ> into vector<4x4x1x4xf32>
rocdl.s.setprio 0
- gpu.barrier
- rocdl.sched.barrier 0
+ amdgpu.lds_barrier
+ rocdl.sched.barrier 6 // Only SALU and VALU can be reordered across. Not MFMA, not memory accesses.
scf.yield %dot2 : vector<4x4x1x4xf32>
}