Skip to content

Instantly share code, notes, and snippets.

commit f664a86e16fe10fdd4bdac54b106cc86cc2592e8
Author: Benoit Jacob <[email protected]>
Date: Tue Jun 10 09:04:02 2025 -0700
lower-barriers
Signed-off-by: Benoit Jacob <[email protected]>
diff --git a/compiler/plugins/target/ROCM/builtins/tuning/iree_default_tuning_spec_gfx942.mlir b/compiler/plugins/target/ROCM/builtins/tuning/iree_default_tuning_spec_gfx942.mlir
index 53a82acdd7..a2e32b9620 100644
commit 5a9ea4399bdf0eafa4136b86e97831b7da6279b3
Author: Benoit Jacob <[email protected]>
Date: Thu Jun 19 19:37:00 2025 -0700
bufferization-fixes
Signed-off-by: Benoit Jacob <[email protected]>
diff --git a/compiler/src/iree/compiler/Codegen/Common/IREEComprehensiveBufferizePass.cpp b/compiler/src/iree/compiler/Codegen/Common/IREEComprehensiveBufferizePass.cpp
index 7ca1f240f4..714f9bf681 100644
hal.executable public @prefill_bs4$async_dispatch_23 {
hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "+sramecc,-xnack", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_wo
index 9b8292d052..ddc1fbc04d 100644
--- a/compiler/plugins/target/ROCM/builtins/tuning/iree_default_tuning_spec_gfx942.mlir
+++ b/compiler/plugins/target/ROCM/builtins/tuning/iree_default_tuning_spec_gfx942.mlir
@@ -925,6 +925,9 @@ util.func private @pingpong_medium_f8_expanded(%lhs_base: !mexp_in_ty_f8, %rhs_b
%c64 = arith.constant 64 : index
%c128 = arith.constant 128 : index
%c256 = arith.constant 256 : index
+ %c16384 = arith.constant 16384 : index
+ %c32768 = arith.constant 32768 : index
+
.LBB0_3:
ds_read2st64_b64 v[108:111], v91 offset1:4
ds_read2st64_b64 v[112:115], v91 offset0:8 offset1:12
ds_read_b64 v[148:149], v83
ds_read_b64 v[150:151], v84
ds_read_b64 v[152:153], v85
ds_read_b64 v[154:155], v86
ds_read2st64_b64 v[116:119], v92 offset1:4
ds_read2st64_b64 v[120:123], v92 offset0:8 offset1:12
ds_read_b64 v[156:157], v89
diff --git a/compiler/plugins/target/ROCM/builtins/tuning/iree_default_tuning_spec_gfx942.mlir b/compiler/plugins/target/ROCM/builtins/tuning/iree_default_tuning_spec_gfx942.mlir
index 53a82acdd7..d3d72d62c6 100644
--- a/compiler/plugins/target/ROCM/builtins/tuning/iree_default_tuning_spec_gfx942.mlir
+++ b/compiler/plugins/target/ROCM/builtins/tuning/iree_default_tuning_spec_gfx942.mlir
@@ -33,6 +33,11 @@
!mshared_f8 = memref<128x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>
!mshared_exp_f8 = memref<8x16x4x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>
+!unified_flat_shared_f8 = memref<49152xf8E4M3FNUZ, #gpu.address_space<workgroup>>
+!flat_shared_f8_at_offset_16384 = memref<32768xf8E4M3FNUZ, strided<[1], offset: 16384>, #gpu.address_space<workgroup>>
- rocprofv3 configuration:
- advanced_thread_trace = True
- agent_index = None
- att_buffer_size = 0x60000000
- att_library_path = ['/home/benjacob/gbaraldi']
- att_parse = trace
- att_perfcounter_ctrl = None
- att_perfcounters = None
- att_serialize_all = False
@bjacob
bjacob / a.txt
Created May 14, 2025 16:39
rocprof-compute profile with zero strides
__ _
_ __ ___ ___ _ __ _ __ ___ / _| ___ ___ _ __ ___ _ __ _ _| |_ ___
| '__/ _ \ / __| '_ \| '__/ _ \| |_ _____ / __/ _ \| '_ ` _ \| '_ \| | | | __/ _ \
| | | (_) | (__| |_) | | | (_) | _|_____| (_| (_) | | | | | | |_) | |_| | || __/
|_| \___/ \___| .__/|_| \___/|_| \___\___/|_| |_| |_| .__/ \__,_|\__\___|
|_| |_|
INFO Analysis mode = cli
INFO [analysis] deriving rocprofiler-compute metrics...
➜ llama3 git:(main) ✗ /home/benjacob/TheRock/build/profiler/rocprofiler-sdk/dist/bin/rocprofv3 -i ~/att.json -d $HOME/traces -- /home/benjacob/iree-build/tools/iree-benchmark-module --device=hip://0 --device_allocator=caching --hip_use_streams=true --module=/home/benjacob/iree-model-benchmark/llama3/tmp/base.8b_fp8.vmfb --parameters=model=8b_fp8.irpa --function=prefill_bs4 --input=4x2048xi64=@/home/benjacob/iree-model-benchmark/llama3/inputs/8b_fp8/args_bs4_2048/prefill_token_ids_4x2048xi64.bin --input=4xi64=@/home/benjacob/iree-model-benchmark/llama3/inputs/8b_fp8/args_bs4_2048/prefill_seq_lens_4xi64.bin --input=4x64xi64=@/home/benjacob/iree-model-benchmark/llama3/inputs/8b_fp8/args_bs4_2048/prefill_seq_block_ids_4x64xi64.bin --input=261x2097152xf8E4M3FNUZ=@/home/benjacob/iree-model-benchmark/llama3/inputs/8b_fp8/args_bs4_2048/prefill_cache_state_261x2097152xf8E4M3FNUZ.bin --benchmark_repetitions=3
E20250513 07:41:48.067751 131443649423296 output_stream.cpp:108] Opened result file: /home/benjacob/traces/Sh
func.func @matmul(%lhs: tensor<?x4096xf8E4M3FNUZ>, %rhs: tensor<4096x4096xf8E4M3FNUZ>) -> tensor<?x4096xf32> {
%c0 = arith.constant 0 : index
%c256 = arith.constant 256 : index
%m = tensor.dim %lhs, %c0 : tensor<?x4096xf8E4M3FNUZ>
%m_outer = arith.divsi %m, %c256 : index
%lhs_expanded = tensor.expand_shape %lhs [[0, 1], [2]] output_shape [%m_outer, 256, 4096] : tensor<?x4096xf8E4M3FNUZ> into tensor<?x256x4096xf8E4M3FNUZ>
%init_acc = tensor.empty(%m_outer) : tensor<?x256x4096xf32>
%c0_acc_type = arith.constant 0.0: f32
%acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<?x256x4096xf32>) -> tensor<?x256x4096xf32>
%result_expanded = linalg.generic {