June 20, 2025 15:27 · June 20, 2025 03:10 · June 6, 2025 19:04 · June 6, 2025 16:44 · June 2, 2025 18:28 · May 15, 2025 20:26
 commit f664a86e16fe10fdd4bdac54b106cc86cc2592e8
 Author: Benoit Jacob <[email protected]>
 Date:   Tue Jun 10 09:04:02 2025 -0700

    lower-barriers
    
    Signed-off-by: Benoit Jacob <[email protected]>

 diff --git a/compiler/plugins/target/ROCM/builtins/tuning/iree_default_tuning_spec_gfx942.mlir b/compiler/plugins/target/ROCM/builtins/tuning/iree_default_tuning_spec_gfx942.mlir
 index 53a82acdd7..a2e32b9620 100644
 commit 5a9ea4399bdf0eafa4136b86e97831b7da6279b3
 Author: Benoit Jacob <[email protected]>
 Date:   Thu Jun 19 19:37:00 2025 -0700

    bufferization-fixes
    
    Signed-off-by: Benoit Jacob <[email protected]>

 diff --git a/compiler/src/iree/compiler/Codegen/Common/IREEComprehensiveBufferizePass.cpp b/compiler/src/iree/compiler/Codegen/Common/IREEComprehensiveBufferizePass.cpp
 index 7ca1f240f4..714f9bf681 100644
 hal.executable public @prefill_bs4$async_dispatch_23 {
  hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "+sramecc,-xnack", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  dp4xi8toi32, mma = [<MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_wo
 index 9b8292d052..ddc1fbc04d 100644
 --- a/compiler/plugins/target/ROCM/builtins/tuning/iree_default_tuning_spec_gfx942.mlir
 +++ b/compiler/plugins/target/ROCM/builtins/tuning/iree_default_tuning_spec_gfx942.mlir
 @@ -925,6 +925,9 @@ util.func private @pingpong_medium_f8_expanded(%lhs_base: !mexp_in_ty_f8, %rhs_b
   %c64 = arith.constant 64 : index
   %c128 = arith.constant 128 : index
   %c256 = arith.constant 256 : index
 +  %c16384 = arith.constant 16384 : index
 +  %c32768 = arith.constant 32768 : index
 +  
 .LBB0_3:
 	ds_read2st64_b64 v[108:111], v91 offset1:4
 	ds_read2st64_b64 v[112:115], v91 offset0:8 offset1:12
 	ds_read_b64 v[148:149], v83
 	ds_read_b64 v[150:151], v84
 	ds_read_b64 v[152:153], v85
 	ds_read_b64 v[154:155], v86
 	ds_read2st64_b64 v[116:119], v92 offset1:4
 	ds_read2st64_b64 v[120:123], v92 offset0:8 offset1:12
 	ds_read_b64 v[156:157], v89
 diff --git a/compiler/plugins/target/ROCM/builtins/tuning/iree_default_tuning_spec_gfx942.mlir b/compiler/plugins/target/ROCM/builtins/tuning/iree_default_tuning_spec_gfx942.mlir
 index 53a82acdd7..d3d72d62c6 100644
 --- a/compiler/plugins/target/ROCM/builtins/tuning/iree_default_tuning_spec_gfx942.mlir
 +++ b/compiler/plugins/target/ROCM/builtins/tuning/iree_default_tuning_spec_gfx942.mlir
 @@ -33,6 +33,11 @@
 !mshared_f8 = memref<128x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>
 !mshared_exp_f8 = memref<8x16x4x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>
 
 +!unified_flat_shared_f8 = memref<49152xf8E4M3FNUZ, #gpu.address_space<workgroup>>
 +!flat_shared_f8_at_offset_16384 = memref<32768xf8E4M3FNUZ, strided<[1], offset: 16384>, #gpu.address_space<workgroup>>

 - rocprofv3 configuration:
    - advanced_thread_trace       = True
    - agent_index                 = None
    - att_buffer_size             = 0x60000000
    - att_library_path            = ['/home/benjacob/gbaraldi']
    - att_parse                   = trace
    - att_perfcounter_ctrl        = None
    - att_perfcounters            = None
    - att_serialize_all           = False

                                 __                                       _
 _ __ ___   ___ _ __  _ __ ___  / _|       ___ ___  _ __ ___  _ __  _   _| |_ ___
 | '__/ _ \ / __| '_ \| '__/ _ \| |_ _____ / __/ _ \| '_ ` _ \| '_ \| | | | __/ _ \
 | | | (_) | (__| |_) | | | (_) |  _|_____| (_| (_) | | | | | | |_) | |_| | ||  __/
 |_|  \___/ \___| .__/|_|  \___/|_|        \___\___/|_| |_| |_| .__/ \__,_|\__\___|
               |_|                                           |_|

   INFO Analysis mode = cli
   INFO [analysis] deriving rocprofiler-compute metrics...
 ➜  llama3 git:(main) ✗ /home/benjacob/TheRock/build/profiler/rocprofiler-sdk/dist/bin/rocprofv3  -i ~/att.json -d $HOME/traces -- /home/benjacob/iree-build/tools/iree-benchmark-module --device=hip://0 --device_allocator=caching --hip_use_streams=true --module=/home/benjacob/iree-model-benchmark/llama3/tmp/base.8b_fp8.vmfb --parameters=model=8b_fp8.irpa --function=prefill_bs4 --input=4x2048xi64=@/home/benjacob/iree-model-benchmark/llama3/inputs/8b_fp8/args_bs4_2048/prefill_token_ids_4x2048xi64.bin --input=4xi64=@/home/benjacob/iree-model-benchmark/llama3/inputs/8b_fp8/args_bs4_2048/prefill_seq_lens_4xi64.bin --input=4x64xi64=@/home/benjacob/iree-model-benchmark/llama3/inputs/8b_fp8/args_bs4_2048/prefill_seq_block_ids_4x64xi64.bin --input=261x2097152xf8E4M3FNUZ=@/home/benjacob/iree-model-benchmark/llama3/inputs/8b_fp8/args_bs4_2048/prefill_cache_state_261x2097152xf8E4M3FNUZ.bin --benchmark_repetitions=3
 E20250513 07:41:48.067751 131443649423296 output_stream.cpp:108] Opened result file: /home/benjacob/traces/Sh
 func.func @matmul(%lhs: tensor<?x4096xf8E4M3FNUZ>, %rhs: tensor<4096x4096xf8E4M3FNUZ>) -> tensor<?x4096xf32> {
  %c0 = arith.constant 0 : index
  %c256 = arith.constant 256 : index
  %m = tensor.dim %lhs, %c0 : tensor<?x4096xf8E4M3FNUZ>
  %m_outer = arith.divsi %m, %c256 : index
  %lhs_expanded = tensor.expand_shape %lhs [[0, 1], [2]] output_shape [%m_outer, 256, 4096] : tensor<?x4096xf8E4M3FNUZ> into tensor<?x256x4096xf8E4M3FNUZ>
  %init_acc = tensor.empty(%m_outer) : tensor<?x256x4096xf32>
  %c0_acc_type = arith.constant 0.0: f32 
  %acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<?x256x4096xf32>) -> tensor<?x256x4096xf32>
  %result_expanded = linalg.generic {
	commit f664a86e16fe10fdd4bdac54b106cc86cc2592e8
	Author: Benoit Jacob <[email protected]>
	Date: Tue Jun 10 09:04:02 2025 -0700

	lower-barriers

	Signed-off-by: Benoit Jacob <[email protected]>

	diff --git a/compiler/plugins/target/ROCM/builtins/tuning/iree_default_tuning_spec_gfx942.mlir b/compiler/plugins/target/ROCM/builtins/tuning/iree_default_tuning_spec_gfx942.mlir
	index 53a82acdd7..a2e32b9620 100644
	commit 5a9ea4399bdf0eafa4136b86e97831b7da6279b3
	Author: Benoit Jacob <[email protected]>
	Date: Thu Jun 19 19:37:00 2025 -0700

	bufferization-fixes

	Signed-off-by: Benoit Jacob <[email protected]>

	diff --git a/compiler/src/iree/compiler/Codegen/Common/IREEComprehensiveBufferizePass.cpp b/compiler/src/iree/compiler/Codegen/Common/IREEComprehensiveBufferizePass.cpp
	index 7ca1f240f4..714f9bf681 100644
	index 9b8292d052..ddc1fbc04d 100644
	--- a/compiler/plugins/target/ROCM/builtins/tuning/iree_default_tuning_spec_gfx942.mlir
	+++ b/compiler/plugins/target/ROCM/builtins/tuning/iree_default_tuning_spec_gfx942.mlir
	@@ -925,6 +925,9 @@ util.func private @pingpong_medium_f8_expanded(%lhs_base: !mexp_in_ty_f8, %rhs_b
	%c64 = arith.constant 64 : index
	%c128 = arith.constant 128 : index
	%c256 = arith.constant 256 : index
	+ %c16384 = arith.constant 16384 : index
	+ %c32768 = arith.constant 32768 : index
	+
	.LBB0_3:
	ds_read2st64_b64 v[108:111], v91 offset1:4
	ds_read2st64_b64 v[112:115], v91 offset0:8 offset1:12
	ds_read_b64 v[148:149], v83
	ds_read_b64 v[150:151], v84
	ds_read_b64 v[152:153], v85
	ds_read_b64 v[154:155], v86
	ds_read2st64_b64 v[116:119], v92 offset1:4
	ds_read2st64_b64 v[120:123], v92 offset0:8 offset1:12
	ds_read_b64 v[156:157], v89
	diff --git a/compiler/plugins/target/ROCM/builtins/tuning/iree_default_tuning_spec_gfx942.mlir b/compiler/plugins/target/ROCM/builtins/tuning/iree_default_tuning_spec_gfx942.mlir
	index 53a82acdd7..d3d72d62c6 100644
	--- a/compiler/plugins/target/ROCM/builtins/tuning/iree_default_tuning_spec_gfx942.mlir
	+++ b/compiler/plugins/target/ROCM/builtins/tuning/iree_default_tuning_spec_gfx942.mlir
	@@ -33,6 +33,11 @@
	!mshared_f8 = memref<128x128xf8E4M3FNUZ, #gpu.address_space<workgroup>>
	!mshared_exp_f8 = memref<8x16x4x32xf8E4M3FNUZ, #gpu.address_space<workgroup>>

	+!unified_flat_shared_f8 = memref<49152xf8E4M3FNUZ, #gpu.address_space<workgroup>>
	+!flat_shared_f8_at_offset_16384 = memref<32768xf8E4M3FNUZ, strided<[1], offset: 16384>, #gpu.address_space<workgroup>>

	- rocprofv3 configuration:
	- advanced_thread_trace = True
	- agent_index = None
	- att_buffer_size = 0x60000000
	- att_library_path = ['/home/benjacob/gbaraldi']
	- att_parse = trace
	- att_perfcounter_ctrl = None
	- att_perfcounters = None
	- att_serialize_all = False

	                                 __                                       _
	 _ __ ___   ___ _ __  _ __ ___  / _|       ___ ___  _ __ ___  _ __  _   _| |_ ___
	| '__/ _ \ / __| '_ \| '__/ _ \| |_ _____ / __/ _ \| '_ ` _ \| '_ \| | | | __/ _ \
	| | | (_) | (__| |_) | | | (_) |  _|_____| (_| (_) | | | | | | |_) | |_| | ||  __/
	|_|  \___/ \___| .__/|_|  \___/|_|        \___\___/|_| |_| |_| .__/ \__,_|\__\___|
	               |_|                                           |_|

	INFO Analysis mode = cli
	INFO [analysis] deriving rocprofiler-compute metrics...
	func.func @matmul(%lhs: tensor<?x4096xf8E4M3FNUZ>, %rhs: tensor<4096x4096xf8E4M3FNUZ>) -> tensor<?x4096xf32> {
	%c0 = arith.constant 0 : index
	%c256 = arith.constant 256 : index
	%m = tensor.dim %lhs, %c0 : tensor<?x4096xf8E4M3FNUZ>
	%m_outer = arith.divsi %m, %c256 : index
	%lhs_expanded = tensor.expand_shape %lhs [[0, 1], [2]] output_shape [%m_outer, 256, 4096] : tensor<?x4096xf8E4M3FNUZ> into tensor<?x256x4096xf8E4M3FNUZ>
	%init_acc = tensor.empty(%m_outer) : tensor<?x256x4096xf32>
	%c0_acc_type = arith.constant 0.0: f32
	%acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<?x256x4096xf32>) -> tensor<?x256x4096xf32>
	%result_expanded = linalg.generic {