commit f664a86e16fe10fdd4bdac54b106cc86cc2592e8
Author: Benoit Jacob <[email protected]>
Date:   Tue Jun 10 09:04:02 2025 -0700

    lower-barriers

    Signed-off-by: Benoit Jacob <[email protected]>
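For orientation before the hunks: each synchronization point in the ping-pong kernels trades the gpu.barrier + rocdl.sched.barrier 0 pair for a cheaper barrier plus a relaxed scheduling fence. A minimal sketch of the pattern (mask bits per the LLVM llvm.amdgcn.sched.barrier intrinsic, where 0x2 is VALU and 0x4 is SALU; the exact barrier choice and placement vary per hunk):

  // Before: full workgroup barrier (implies memory-counter waits), plus a
  // scheduling fence that no instruction may be reordered across.
  gpu.barrier
  rocdl.sched.barrier 0

  // After: a bare s_barrier with no implied waits. Mask 6 = 0x2 | 0x4 lets only
  // VALU and SALU instructions cross, not MFMA ops and not memory accesses.
  rocdl.s.barrier
  rocdl.sched.barrier 6

In the hunks, amdgpu.lds_barrier (a barrier that only waits on LDS accesses) is substituted instead of rocdl.s.barrier at the points adjacent to the shared-memory writes and at the loop back-edge, presumably where a wait on outstanding LDS operations is still needed.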
diff --git a/compiler/plugins/target/ROCM/builtins/tuning/iree_default_tuning_spec_gfx942.mlir b/compiler/plugins/target/ROCM/builtins/tuning/iree_default_tuning_spec_gfx942.mlir
index 53a82acdd7..a2e32b9620 100644
--- a/compiler/plugins/target/ROCM/builtins/tuning/iree_default_tuning_spec_gfx942.mlir
+++ b/compiler/plugins/target/ROCM/builtins/tuning/iree_default_tuning_spec_gfx942.mlir
@@ -136,8 +136,8 @@ util.func private @pingpong_large(%lhs_base: !in_ty, %rhs_base: !in_ty, %unused_
%lhs_vec_0 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c0, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp, vector<8x1x1x4xf16>
%rhs_vec_0 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c0, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp, vector<4x1x1x4xf16>
- gpu.barrier
- rocdl.sched.barrier 0
+ rocdl.s.barrier
+ rocdl.sched.barrier 6 // Only SALU and VALU can be reordered across. Not MFMA, not memory accesses.
rocdl.s.setprio 1 { iree_gpu.swap_mfma = 1 }
%dot0 = iree_gpu.multi_mma %lhs_vec_0, %rhs_vec_0, %iter {
@@ -147,8 +147,8 @@ util.func private @pingpong_large(%lhs_base: !in_ty, %rhs_base: !in_ty, %unused_
} : vector<8x1x1x4xf16>, vector<4x1x1x4xf16> into vector<8x4x1x4xf32>
rocdl.s.setprio 0
- gpu.barrier
- rocdl.sched.barrier 0
+ rocdl.s.barrier
+ rocdl.sched.barrier 6 // Only SALU and VALU can be reordered across. Not MFMA, not memory accesses.
// Global loads of rhs.
%rhs_block = tensor.extract_slice %rhs [0, %i] [256, 64] [1, 1] : !in_ty to !block_in
@@ -164,8 +164,8 @@ util.func private @pingpong_large(%lhs_base: !in_ty, %rhs_base: !in_ty, %unused_
%lhs_vec_1 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c1, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp, vector<8x1x1x4xf16>
%rhs_vec_1 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c1, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp, vector<4x1x1x4xf16>
- gpu.barrier
- rocdl.sched.barrier 0
+ rocdl.s.barrier
+ rocdl.sched.barrier 6 // Only SALU and VALU can be reordered across. Not MFMA, not memory accesses.
rocdl.s.setprio 1 { iree_gpu.swap_mfma = 1 }
%dot1 = iree_gpu.multi_mma %lhs_vec_1, %rhs_vec_1, %dot0 {
@@ -175,8 +175,8 @@ util.func private @pingpong_large(%lhs_base: !in_ty, %rhs_base: !in_ty, %unused_
} : vector<8x1x1x4xf16>, vector<4x1x1x4xf16> into vector<8x4x1x4xf32>
rocdl.s.setprio 0
- gpu.barrier
- rocdl.sched.barrier 0
+ rocdl.s.barrier
+ rocdl.sched.barrier 6 // Only SALU and VALU can be reordered across. Not MFMA, not memory accesses.
%lhs_vec_2 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c2, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp, vector<8x1x1x4xf16>
%rhs_vec_2 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c2, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp, vector<4x1x1x4xf16>
@@ -184,8 +184,8 @@ util.func private @pingpong_large(%lhs_base: !in_ty, %rhs_base: !in_ty, %unused_
%lhs_vec_3 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c3, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp, vector<8x1x1x4xf16>
%rhs_vec_3 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c3, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp, vector<4x1x1x4xf16>
- gpu.barrier
- rocdl.sched.barrier 0
+ rocdl.s.barrier
+ rocdl.sched.barrier 6 // Only SALU and VALU can be reordered across. Not MFMA, not memory accesses.
rocdl.s.setprio 1 { iree_gpu.swap_mfma = 1 }
%dot2 = iree_gpu.multi_mma %lhs_vec_2, %rhs_vec_2, %dot1 {
@@ -195,8 +195,8 @@ util.func private @pingpong_large(%lhs_base: !in_ty, %rhs_base: !in_ty, %unused_
} : vector<8x1x1x4xf16>, vector<4x1x1x4xf16> into vector<8x4x1x4xf32>
rocdl.s.setprio 0
- gpu.barrier
- rocdl.sched.barrier 0
+ amdgpu.lds_barrier
+ rocdl.sched.barrier 6 // Only SALU and VALU can be reordered across. Not MFMA, not memory accesses.
vector.transfer_write %lhs_vec_local_0, %lhs_shared [%glb0, %gko] {in_bounds = [true, true]} : vector<1x8xf16>, !shared
vector.transfer_write %lhs_vec_local_1, %lhs_shared [%glb1, %gko] {in_bounds = [true, true]} : vector<1x8xf16>, !shared
@@ -208,8 +208,8 @@ util.func private @pingpong_large(%lhs_base: !in_ty, %rhs_base: !in_ty, %unused_
vector.transfer_write %rhs_vec_local_2, %rhs_shared [%glb2, %gko] {in_bounds = [true, true]} : vector<1x8xf16>, !shared
vector.transfer_write %rhs_vec_local_3, %rhs_shared [%glb3, %gko] {in_bounds = [true, true]} : vector<1x8xf16>, !shared
- gpu.barrier
- rocdl.sched.barrier 0
+ rocdl.s.barrier
+ rocdl.sched.barrier 6 // Only SALU and VALU can be reordered across. Not MFMA, not memory accesses.
rocdl.s.setprio 1 { iree_gpu.swap_mfma = 1 }
%dot3 = iree_gpu.multi_mma %lhs_vec_3, %rhs_vec_3, %dot2 {
@@ -219,8 +219,8 @@ util.func private @pingpong_large(%lhs_base: !in_ty, %rhs_base: !in_ty, %unused_
} : vector<8x1x1x4xf16>, vector<4x1x1x4xf16> into vector<8x4x1x4xf32>
rocdl.s.setprio 0
- gpu.barrier
- rocdl.sched.barrier 0
+ amdgpu.lds_barrier
+ rocdl.sched.barrier 6 // Only SALU and VALU can be reordered across. Not MFMA, not memory accesses.
scf.yield %dot3 : vector<8x4x1x4xf32>
}
@@ -357,8 +357,8 @@ util.func private @pingpong_large_expanded(%lhs_base: !exp_in_ty, %rhs_base: !in
%lhs_vec_0 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c0, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp, vector<8x1x1x4xf16>
%rhs_vec_0 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c0, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp, vector<4x1x1x4xf16>
- gpu.barrier
- rocdl.sched.barrier 0
+ rocdl.s.barrier
+ rocdl.sched.barrier 6 // Only SALU and VALU can be reordered across. Not MFMA, not memory accesses.
rocdl.s.setprio 1 { iree_gpu.swap_mfma = 1 }
%dot0 = iree_gpu.multi_mma %lhs_vec_0, %rhs_vec_0, %iter {
@@ -368,8 +368,8 @@ util.func private @pingpong_large_expanded(%lhs_base: !exp_in_ty, %rhs_base: !in
} : vector<8x1x1x4xf16>, vector<4x1x1x4xf16> into vector<8x4x1x4xf32>
rocdl.s.setprio 0
- gpu.barrier
- rocdl.sched.barrier 0
+ rocdl.s.barrier
+ rocdl.sched.barrier 6 // Only SALU and VALU can be reordered across. Not MFMA, not memory accesses.
// Global loads of rhs.
%rhs_block = tensor.extract_slice %rhs [0, %i] [256, 64] [1, 1] : !in_ty to !block_in
@@ -385,8 +385,8 @@ util.func private @pingpong_large_expanded(%lhs_base: !exp_in_ty, %rhs_base: !in
%lhs_vec_1 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c1, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp, vector<8x1x1x4xf16>
%rhs_vec_1 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c1, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp, vector<4x1x1x4xf16>
- gpu.barrier
- rocdl.sched.barrier 0
+ rocdl.s.barrier
+ rocdl.sched.barrier 6 // Only SALU and VALU can be reordered across. Not MFMA, not memory accesses.
rocdl.s.setprio 1 { iree_gpu.swap_mfma = 1 }
%dot1 = iree_gpu.multi_mma %lhs_vec_1, %rhs_vec_1, %dot0 {
@@ -396,8 +396,8 @@ util.func private @pingpong_large_expanded(%lhs_base: !exp_in_ty, %rhs_base: !in
} : vector<8x1x1x4xf16>, vector<4x1x1x4xf16> into vector<8x4x1x4xf32>
rocdl.s.setprio 0
- gpu.barrier
- rocdl.sched.barrier 0
+ rocdl.s.barrier
+ rocdl.sched.barrier 6 // Only SALU and VALU can be reordered across. Not MFMA, not memory accesses.
%lhs_vec_2 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c2, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp, vector<8x1x1x4xf16>
%rhs_vec_2 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c2, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp, vector<4x1x1x4xf16>
@@ -405,8 +405,8 @@ util.func private @pingpong_large_expanded(%lhs_base: !exp_in_ty, %rhs_base: !in
%lhs_vec_3 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c3, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp, vector<8x1x1x4xf16>
%rhs_vec_3 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c3, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp, vector<4x1x1x4xf16>
- gpu.barrier
- rocdl.sched.barrier 0
+ rocdl.s.barrier
+ rocdl.sched.barrier 6 // Only SALU and VALU can be reordered across. Not MFMA, not memory accesses.
rocdl.s.setprio 1 { iree_gpu.swap_mfma = 1 }
%dot2 = iree_gpu.multi_mma %lhs_vec_2, %rhs_vec_2, %dot1 {
@@ -416,8 +416,8 @@ util.func private @pingpong_large_expanded(%lhs_base: !exp_in_ty, %rhs_base: !in
} : vector<8x1x1x4xf16>, vector<4x1x1x4xf16> into vector<8x4x1x4xf32>
rocdl.s.setprio 0
- gpu.barrier
- rocdl.sched.barrier 0
+ amdgpu.lds_barrier
+ rocdl.sched.barrier 6 // Only SALU and VALU can be reordered across. Not MFMA, not memory accesses.
vector.transfer_write %lhs_vec_local_0, %lhs_shared [%glb0, %gko] {in_bounds = [true, true]} : vector<1x8xf16>, !shared
vector.transfer_write %lhs_vec_local_1, %lhs_shared [%glb1, %gko] {in_bounds = [true, true]} : vector<1x8xf16>, !shared
@@ -429,8 +429,8 @@ util.func private @pingpong_large_expanded(%lhs_base: !exp_in_ty, %rhs_base: !in
vector.transfer_write %rhs_vec_local_2, %rhs_shared [%glb2, %gko] {in_bounds = [true, true]} : vector<1x8xf16>, !shared
vector.transfer_write %rhs_vec_local_3, %rhs_shared [%glb3, %gko] {in_bounds = [true, true]} : vector<1x8xf16>, !shared
- gpu.barrier
- rocdl.sched.barrier 0
+ rocdl.s.barrier
+ rocdl.sched.barrier 6 // Only SALU and VALU can be reordered across. Not MFMA, not memory accesses.
rocdl.s.setprio 1 { iree_gpu.swap_mfma = 1 }
%dot3 = iree_gpu.multi_mma %lhs_vec_3, %rhs_vec_3, %dot2 {
@@ -440,8 +440,8 @@ util.func private @pingpong_large_expanded(%lhs_base: !exp_in_ty, %rhs_base: !in
} : vector<8x1x1x4xf16>, vector<4x1x1x4xf16> into vector<8x4x1x4xf32>
rocdl.s.setprio 0
- gpu.barrier
- rocdl.sched.barrier 0
+ amdgpu.lds_barrier
+ rocdl.sched.barrier 6 // Only SALU and VALU can be reordered across. Not MFMA, not memory accesses.
scf.yield %dot3 : vector<8x4x1x4xf32>
}
@@ -584,8 +584,8 @@ util.func private @pingpong_large_f8_expanded(%lhs_base: !exp_in_ty_f8, %rhs_bas
%lhs_vec_0 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c0, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<8x1x1x8xf8E4M3FNUZ>
%rhs_vec_0 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c0, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<4x1x1x8xf8E4M3FNUZ>
- gpu.barrier
- rocdl.sched.barrier 0
+ rocdl.s.barrier
+ rocdl.sched.barrier 6 // Only SALU and VALU can be reordered across. Not MFMA, not memory accesses.
rocdl.s.setprio 1 { iree_gpu.swap_mfma = 1 }
%dot0 = iree_gpu.multi_mma %lhs_vec_0, %rhs_vec_0, %iter {
@@ -595,8 +595,8 @@ util.func private @pingpong_large_f8_expanded(%lhs_base: !exp_in_ty_f8, %rhs_bas
} : vector<8x1x1x8xf8E4M3FNUZ>, vector<4x1x1x8xf8E4M3FNUZ> into vector<8x4x1x4xf32>
rocdl.s.setprio 0
- gpu.barrier
- rocdl.sched.barrier 0
+ rocdl.s.barrier
+ rocdl.sched.barrier 6 // Only SALU and VALU can be reordered across. Not MFMA, not memory accesses.
// Global loads of rhs.
%rhs_block = tensor.extract_slice %rhs [0, %i] [256, 128] [1, 1] : !in_ty_f8 to !block_in_f8
@@ -612,8 +612,8 @@ util.func private @pingpong_large_f8_expanded(%lhs_base: !exp_in_ty_f8, %rhs_bas
%lhs_vec_1 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c1, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<8x1x1x8xf8E4M3FNUZ>
%rhs_vec_1 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c1, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<4x1x1x8xf8E4M3FNUZ>
- gpu.barrier
- rocdl.sched.barrier 0
+ rocdl.s.barrier
+ rocdl.sched.barrier 6 // Only SALU and VALU can be reordered across. Not MFMA, not memory accesses.
rocdl.s.setprio 1 { iree_gpu.swap_mfma = 1 }
%dot1 = iree_gpu.multi_mma %lhs_vec_1, %rhs_vec_1, %dot0 {
@@ -623,8 +623,8 @@ util.func private @pingpong_large_f8_expanded(%lhs_base: !exp_in_ty_f8, %rhs_bas
} : vector<8x1x1x8xf8E4M3FNUZ>, vector<4x1x1x8xf8E4M3FNUZ> into vector<8x4x1x4xf32>
rocdl.s.setprio 0
- gpu.barrier
- rocdl.sched.barrier 0
+ rocdl.s.barrier
+ rocdl.sched.barrier 6 // Only SALU and VALU can be reordered across. Not MFMA, not memory accesses.
%lhs_vec_2 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c2, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<8x1x1x8xf8E4M3FNUZ>
%rhs_vec_2 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c2, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<4x1x1x8xf8E4M3FNUZ>
@@ -632,8 +632,8 @@ util.func private @pingpong_large_f8_expanded(%lhs_base: !exp_in_ty_f8, %rhs_bas
%lhs_vec_3 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c3, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<8x1x1x8xf8E4M3FNUZ>
%rhs_vec_3 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c3, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<4x1x1x8xf8E4M3FNUZ>
- gpu.barrier
- rocdl.sched.barrier 0
+ rocdl.s.barrier
+ rocdl.sched.barrier 6 // Only SALU and VALU can be reordered across. Not MFMA, not memory accesses.
rocdl.s.setprio 1 { iree_gpu.swap_mfma = 1 }
%dot2 = iree_gpu.multi_mma %lhs_vec_2, %rhs_vec_2, %dot1 {
@@ -643,8 +643,8 @@ util.func private @pingpong_large_f8_expanded(%lhs_base: !exp_in_ty_f8, %rhs_bas
} : vector<8x1x1x8xf8E4M3FNUZ>, vector<4x1x1x8xf8E4M3FNUZ> into vector<8x4x1x4xf32>
rocdl.s.setprio 0
- gpu.barrier
- rocdl.sched.barrier 0
+ amdgpu.lds_barrier
+ rocdl.sched.barrier 6 // Only SALU and VALU can be reordered across. Not MFMA, not memory accesses.
vector.transfer_write %lhs_vec_local_0, %lhs_shared [%glb0, %gko] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, !shared_f8
vector.transfer_write %lhs_vec_local_1, %lhs_shared [%glb1, %gko] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, !shared_f8
@@ -656,8 +656,8 @@ util.func private @pingpong_large_f8_expanded(%lhs_base: !exp_in_ty_f8, %rhs_bas
vector.transfer_write %rhs_vec_local_2, %rhs_shared [%glb2, %gko] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, !shared_f8
vector.transfer_write %rhs_vec_local_3, %rhs_shared [%glb3, %gko] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, !shared_f8
- gpu.barrier
- rocdl.sched.barrier 0
+ rocdl.s.barrier
+ rocdl.sched.barrier 6 // Only SALU and VALU can be reordered across. Not MFMA, not memory accesses.
rocdl.s.setprio 1 { iree_gpu.swap_mfma = 1 }
%dot3 = iree_gpu.multi_mma %lhs_vec_3, %rhs_vec_3, %dot2 {
@@ -667,8 +667,8 @@ util.func private @pingpong_large_f8_expanded(%lhs_base: !exp_in_ty_f8, %rhs_bas
} : vector<8x1x1x8xf8E4M3FNUZ>, vector<4x1x1x8xf8E4M3FNUZ> into vector<8x4x1x4xf32>
rocdl.s.setprio 0
- gpu.barrier
- rocdl.sched.barrier 0
+ amdgpu.lds_barrier
+ rocdl.sched.barrier 6 // Only SALU and VALU can be reordered across. Not MFMA, not memory accesses.
scf.yield %dot3 : vector<8x4x1x4xf32>
}
@@ -799,7 +799,7 @@ util.func private @pingpong_medium_expanded(%lhs_base: !mexp_in_ty, %rhs_base: !
%lhs_vec_0_t = vector.transpose %lhs_vec_0, [0, 2, 1, 3] : vector<4x1x2x4xf16> to vector<4x2x1x4xf16>
%rhs_vec_0_t = vector.transpose %rhs_vec_0, [0, 2, 1, 3] : vector<4x1x2x4xf16> to vector<4x2x1x4xf16>
- rocdl.sched.barrier 0
+ rocdl.sched.barrier 6 // Only SALU and VALU can be reordered across. Not MFMA, not memory accesses.
// Global loads of rhs.
%rhs_block = tensor.extract_slice %rhs [0, %i] [256, 64] [1, 1] : !in_ty to !block_in
@@ -812,14 +812,14 @@ util.func private @pingpong_medium_expanded(%lhs_base: !mexp_in_ty, %rhs_base: !
%rhs_thread_3 = tensor.extract_slice %rhs_block [%glb3, %gko] [1, 8] [1, 1] : !block_in to tensor<1x8xf16>
%rhs_vec_local_3 = vector.transfer_read %rhs_thread_3 [%c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x8xf16>, vector<1x8xf16>
- rocdl.sched.barrier 0
+ rocdl.sched.barrier 6 // Only SALU and VALU can be reordered across. Not MFMA, not memory accesses.
%lhs_vec_2 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c2, %inner_id], %cst {in_bounds = [true, true, true, true]} : !mshared_exp, vector<4x1x2x4xf16>
%rhs_vec_2 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c2, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp, vector<4x1x2x4xf16>
%lhs_vec_2_t = vector.transpose %lhs_vec_2, [0, 2, 1, 3] : vector<4x1x2x4xf16> to vector<4x2x1x4xf16>
%rhs_vec_2_t = vector.transpose %rhs_vec_2, [0, 2, 1, 3] : vector<4x1x2x4xf16> to vector<4x2x1x4xf16>
- rocdl.sched.barrier 0
+ rocdl.sched.barrier 6 // Only SALU and VALU can be reordered across. Not MFMA, not memory accesses.
// Global loads of lhs.
%lhs_block = tensor.extract_slice %lhs [0, 0, %i] [1, 128, 64] [1, 1, 1] : !mexp_in_ty to !mexp_block_in
@@ -828,8 +828,8 @@ util.func private @pingpong_medium_expanded(%lhs_base: !mexp_in_ty, %rhs_base: !
%lhs_thread_1 = tensor.extract_slice %lhs_block [0, %glb1, %gko] [1, 1, 8] [1, 1, 1] : !mexp_block_in to tensor<1x1x8xf16>
%lhs_vec_local_1 = vector.transfer_read %lhs_thread_1 [%c0, %c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x1x8xf16>, vector<1x8xf16>
- gpu.barrier
- rocdl.sched.barrier 0
+ rocdl.s.barrier
+ rocdl.sched.barrier 6 // Only SALU and VALU can be reordered across. Not MFMA, not memory accesses.
rocdl.s.setprio 1 { iree_gpu.swap_mfma = 1 }
%dot0 = iree_gpu.multi_mma %lhs_vec_0_t, %rhs_vec_0_t, %iter {
@@ -839,8 +839,8 @@ util.func private @pingpong_medium_expanded(%lhs_base: !mexp_in_ty, %rhs_base: !
} : vector<4x2x1x4xf16>, vector<4x2x1x4xf16> into vector<4x4x1x4xf32>
rocdl.s.setprio 0
- gpu.barrier
- rocdl.sched.barrier 0
+ rocdl.s.barrier
+ rocdl.sched.barrier 6 // Only SALU and VALU can be reordered across. Not MFMA, not memory accesses.
vector.transfer_write %rhs_vec_local_0, %rhs_shared [%glb0, %gko] {in_bounds = [true, true]} : vector<1x8xf16>, !shared
vector.transfer_write %rhs_vec_local_1, %rhs_shared [%glb1, %gko] {in_bounds = [true, true]} : vector<1x8xf16>, !shared
@@ -850,8 +850,8 @@ util.func private @pingpong_medium_expanded(%lhs_base: !mexp_in_ty, %rhs_base: !
vector.transfer_write %lhs_vec_local_0, %lhs_shared [%glb0, %gko] {in_bounds = [true, true]} : vector<1x8xf16>, !mshared
vector.transfer_write %lhs_vec_local_1, %lhs_shared [%glb1, %gko] {in_bounds = [true, true]} : vector<1x8xf16>, !mshared
- gpu.barrier
- rocdl.sched.barrier 0
+ amdgpu.lds_barrier
+ rocdl.sched.barrier 6 // Only SALU and VALU can be reordered across. Not MFMA, not memory accesses.
rocdl.s.setprio 1 { iree_gpu.swap_mfma = 1 }
%dot2 = iree_gpu.multi_mma %lhs_vec_2_t, %rhs_vec_2_t, %dot0 {
@@ -861,8 +861,8 @@ util.func private @pingpong_medium_expanded(%lhs_base: !mexp_in_ty, %rhs_base: !
} : vector<4x2x1x4xf16>, vector<4x2x1x4xf16> into vector<4x4x1x4xf32>
rocdl.s.setprio 0
- gpu.barrier
- rocdl.sched.barrier 0
+ amdgpu.lds_barrier
+ rocdl.sched.barrier 6 // Only SALU and VALU can be reordered across. Not MFMA, not memory accesses.
scf.yield %dot2 : vector<4x4x1x4xf32>
}
@@ -993,7 +993,7 @@ util.func private @pingpong_medium_f8_expanded(%lhs_base: !mexp_in_ty_f8, %rhs_b
%lhs_vec_0_t = vector.transpose %lhs_vec_0, [0, 2, 1, 3] : vector<4x1x2x8xf8E4M3FNUZ> to vector<4x2x1x8xf8E4M3FNUZ>
%rhs_vec_0_t = vector.transpose %rhs_vec_0, [0, 2, 1, 3] : vector<4x1x2x8xf8E4M3FNUZ> to vector<4x2x1x8xf8E4M3FNUZ>
- rocdl.sched.barrier 0
+ rocdl.sched.barrier 6 // Only SALU and VALU can be reordered across. Not MFMA, not memory accesses.
// Global loads of rhs.
%rhs_block = tensor.extract_slice %rhs [0, %i] [256, 128] [1, 1] : !in_ty_f8 to !block_in_f8
@@ -1006,14 +1006,14 @@ util.func private @pingpong_medium_f8_expanded(%lhs_base: !mexp_in_ty_f8, %rhs_b
%rhs_thread_3 = tensor.extract_slice %rhs_block [%glb3, %gko] [1, 16] [1, 1] : !block_in_f8 to tensor<1x16xf8E4M3FNUZ>
%rhs_vec_local_3 = vector.transfer_read %rhs_thread_3 [%c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x16xf8E4M3FNUZ>, vector<1x16xf8E4M3FNUZ>
- rocdl.sched.barrier 0
+ rocdl.sched.barrier 6 // Only SALU and VALU can be reordered across. Not MFMA, not memory accesses.
%lhs_vec_2 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c2, %inner_id], %cst {in_bounds = [true, true, true, true]} : !mshared_exp_f8, vector<4x1x2x8xf8E4M3FNUZ>
%rhs_vec_2 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c2, %inner_id], %cst {in_bounds = [true, true, true, true]} : !shared_exp_f8, vector<4x1x2x8xf8E4M3FNUZ>
%lhs_vec_2_t = vector.transpose %lhs_vec_2, [0, 2, 1, 3] : vector<4x1x2x8xf8E4M3FNUZ> to vector<4x2x1x8xf8E4M3FNUZ>
%rhs_vec_2_t = vector.transpose %rhs_vec_2, [0, 2, 1, 3] : vector<4x1x2x8xf8E4M3FNUZ> to vector<4x2x1x8xf8E4M3FNUZ>
- rocdl.sched.barrier 0
+ rocdl.sched.barrier 6 // Only SALU and VALU can be reordered across. Not MFMA, not memory accesses.
// Global loads of lhs.
%lhs_block = tensor.extract_slice %lhs [0, 0, %i] [1, 128, 128] [1, 1, 1] : !mexp_in_ty_f8 to !mexp_block_in_f8
@@ -1022,8 +1022,8 @@ util.func private @pingpong_medium_f8_expanded(%lhs_base: !mexp_in_ty_f8, %rhs_b
%lhs_thread_1 = tensor.extract_slice %lhs_block [0, %glb1_lhs, %gko] [1, 1, 16] [1, 1, 1] : !mexp_block_in_f8 to tensor<1x1x16xf8E4M3FNUZ>
%lhs_vec_local_1 = vector.transfer_read %lhs_thread_1 [%c0, %c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x1x16xf8E4M3FNUZ>, vector<1x16xf8E4M3FNUZ>
- gpu.barrier
- rocdl.sched.barrier 0
+ rocdl.s.barrier
+ rocdl.sched.barrier 6 // Only SALU and VALU can be reordered across. Not MFMA, not memory accesses.
rocdl.s.setprio 1 { iree_gpu.swap_mfma = 1 }
%dot0 = iree_gpu.multi_mma %lhs_vec_0_t, %rhs_vec_0_t, %iter {
@@ -1033,8 +1033,8 @@ util.func private @pingpong_medium_f8_expanded(%lhs_base: !mexp_in_ty_f8, %rhs_b
} : vector<4x2x1x8xf8E4M3FNUZ>, vector<4x2x1x8xf8E4M3FNUZ> into vector<4x4x1x4xf32>
rocdl.s.setprio 0
- gpu.barrier
- rocdl.sched.barrier 0
+ rocdl.s.barrier
+ rocdl.sched.barrier 6 // Only SALU and VALU can be reordered across. Not MFMA, not memory accesses.
vector.transfer_write %rhs_vec_local_0, %rhs_shared [%glb0, %gko] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, !shared_f8
vector.transfer_write %rhs_vec_local_1, %rhs_shared [%glb1, %gko] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, !shared_f8
@@ -1044,8 +1044,8 @@ util.func private @pingpong_medium_f8_expanded(%lhs_base: !mexp_in_ty_f8, %rhs_b
vector.transfer_write %lhs_vec_local_0, %lhs_shared [%glb0_lhs, %gko] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, !mshared_f8
vector.transfer_write %lhs_vec_local_1, %lhs_shared [%glb1_lhs, %gko] {in_bounds = [true, true]} : vector<1x16xf8E4M3FNUZ>, !mshared_f8
- gpu.barrier
- rocdl.sched.barrier 0
+ amdgpu.lds_barrier
+ rocdl.sched.barrier 6 // Only SALU and VALU can be reordered across. Not MFMA, not memory accesses.
rocdl.s.setprio 1 { iree_gpu.swap_mfma = 1 }
%dot2 = iree_gpu.multi_mma %lhs_vec_2_t, %rhs_vec_2_t, %dot0 {
@@ -1055,8 +1055,8 @@ util.func private @pingpong_medium_f8_expanded(%lhs_base: !mexp_in_ty_f8, %rhs_b
} : vector<4x2x1x8xf8E4M3FNUZ>, vector<4x2x1x8xf8E4M3FNUZ> into vector<4x4x1x4xf32>
rocdl.s.setprio 0
- gpu.barrier
- rocdl.sched.barrier 0
+ amdgpu.lds_barrier
+ rocdl.sched.barrier 6 // Only SALU and VALU can be reordered across. Not MFMA, not memory accesses.
scf.yield %dot2 : vector<4x4x1x4xf32>
}