Skip to content

Instantly share code, notes, and snippets.

@bjacob
bjacob / a.txt
Created September 11, 2025 15:08
➜ iree-model-benchmark git:(main) llama3/compile-405b-fp4.sh gfx950
+ /home/benjacob/iree/iree-model-benchmark/llama3/compile-405b-fp4-base.sh /home/benjacob/iree-build/tools/iree-compile gfx950 /home/benjacob/iree/iree-model-benchmark/llama3/base_ir/405b_fp4_asm.mlir -o /home/benjacob/iree/iree-model-benchmark/llama3/tmp/base.405b_fp4.vmfb
+ COMPILER_FLAGS=("--iree-hip-target=$CHIP" "--iree-hal-target-device=hip" "--iree-opt-level=O3" "--iree-dispatch-creation-propagate-collapse-across-expands=true" "--iree-hal-indirect-command-buffers=true" "--iree-stream-resource-memory-model=discrete" "--iree-hip-enable-tensor-ukernels" "--iree-hip-specialize-dispatches" "--iree-hal-memoization=true" "--iree-stream-affinity-solver-max-iterations=1024")
+ (( 0 == 1 ))
+ /home/benjacob/iree-build/tools/iree-compile /home/benjacob/iree/iree-model-benchmark/llama3/base_ir/405b_fp4_asm.mlir --iree-hip-target=gfx950 --iree-hal-target-device=hip --iree-opt-level=O3 --iree-dispatch-creation-propagate-collapse-across-expands=tru
@bjacob
bjacob / a.cc
Created September 5, 2025 18:42
z-order
std::tuple<int, int> zorder(int M, int N, int i) {
if (M == 1) {
return {0, i};
}
if (N == 1) {
return {i, 0};
}
int Mhalf = M/2;
int Nhalf = N/2;
int quadrant0 = Mhalf * Nhalf;
commit f664a86e16fe10fdd4bdac54b106cc86cc2592e8
Author: Benoit Jacob <[email protected]>
Date: Tue Jun 10 09:04:02 2025 -0700
lower-barriers
Signed-off-by: Benoit Jacob <[email protected]>
diff --git a/compiler/plugins/target/ROCM/builtins/tuning/iree_default_tuning_spec_gfx942.mlir b/compiler/plugins/target/ROCM/builtins/tuning/iree_default_tuning_spec_gfx942.mlir
index 53a82acdd7..a2e32b9620 100644
commit 5a9ea4399bdf0eafa4136b86e97831b7da6279b3
Author: Benoit Jacob <[email protected]>
Date: Thu Jun 19 19:37:00 2025 -0700
bufferization-fixes
Signed-off-by: Benoit Jacob <[email protected]>
diff --git a/compiler/src/iree/compiler/Codegen/Common/IREEComprehensiveBufferizePass.cpp b/compiler/src/iree/compiler/Codegen/Common/IREEComprehensiveBufferizePass.cpp
index 7ca1f240f4..714f9bf681 100644
hal.executable public @prefill_bs4$async_dispatch_23 {
hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "+sramecc,-xnack", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_wo
index 9b8292d052..ddc1fbc04d 100644
--- a/compiler/plugins/target/ROCM/builtins/tuning/iree_default_tuning_spec_gfx942.mlir
+++ b/compiler/plugins/target/ROCM/builtins/tuning/iree_default_tuning_spec_gfx942.mlir
@@ -925,6 +925,9 @@ util.func private @pingpong_medium_f8_expanded(%lhs_base: !mexp_in_ty_f8, %rhs_b
%c64 = arith.constant 64 : index
%c128 = arith.constant 128 : index
%c256 = arith.constant 256 : index
+ %c16384 = arith.constant 16384 : index
+ %c32768 = arith.constant 32768 : index
+
.LBB0_3:
ds_read2st64_b64 v[108:111], v91 offset1:4
ds_read2st64_b64 v[112:115], v91 offset0:8 offset1:12
ds_read_b64 v[148:149], v83
ds_read_b64 v[150:151], v84
ds_read_b64 v[152:153], v85
ds_read_b64 v[154:155], v86
ds_read2st64_b64 v[116:119], v92 offset1:4
ds_read2st64_b64 v[120:123], v92 offset0:8 offset1:12
ds_read_b64 v[156:157], v89
diff --git a/compiler/plugins/target/ROCM/builtins/tuning/iree_default_tuning_spec_gfx942.mlir b/compiler/plugins/target/ROCM/builtins/tuning/iree_default_tuning_spec_gfx942.mlir
index 53a82acdd7..d3d72d62c6 100644
--- a/compiler/plugins/target/ROCM/builtins/tuning/iree_default_tuning_spec_gfx942.mlir
+++ b/compiler/plugins/target/ROCM/builtins/tuning/iree_default_tuning_spec_gfx942.mlir
@@ -33,6 +33,11 @@
!mshared_f8 = memref<128x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>
!mshared_exp_f8 = memref<8x16x4x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>
+!unified_flat_shared_f8 = memref<49152xf8E4M3FNUZ, #gpu.address_space<workgroup>>
+!flat_shared_f8_at_offset_16384 = memref<32768xf8E4M3FNUZ, strided<[1], offset: 16384>, #gpu.address_space<workgroup>>
- rocprofv3 configuration:
- advanced_thread_trace = True
- agent_index = None
- att_buffer_size = 0x60000000
- att_library_path = ['/home/benjacob/gbaraldi']
- att_parse = trace
- att_perfcounter_ctrl = None
- att_perfcounters = None
- att_serialize_all = False
@bjacob
bjacob / a.txt
Created May 14, 2025 16:39
rocprof-compute profile with zero strides
__ _
_ __ ___ ___ _ __ _ __ ___ / _| ___ ___ _ __ ___ _ __ _ _| |_ ___
| '__/ _ \ / __| '_ \| '__/ _ \| |_ _____ / __/ _ \| '_ ` _ \| '_ \| | | | __/ _ \
| | | (_) | (__| |_) | | | (_) | _|_____| (_| (_) | | | | | | |_) | |_| | || __/
|_| \___/ \___| .__/|_| \___/|_| \___\___/|_| |_| |_| .__/ \__,_|\__\___|
|_| |_|
INFO Analysis mode = cli
INFO [analysis] deriving rocprofiler-compute metrics...