This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
commit f664a86e16fe10fdd4bdac54b106cc86cc2592e8 | |
Author: Benoit Jacob <[email protected]> | |
Date: Tue Jun 10 09:04:02 2025 -0700 | |
lower-barriers | |
Signed-off-by: Benoit Jacob <[email protected]> | |
diff --git a/compiler/plugins/target/ROCM/builtins/tuning/iree_default_tuning_spec_gfx942.mlir b/compiler/plugins/target/ROCM/builtins/tuning/iree_default_tuning_spec_gfx942.mlir | |
index 53a82acdd7..a2e32b9620 100644 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
commit 5a9ea4399bdf0eafa4136b86e97831b7da6279b3 | |
Author: Benoit Jacob <[email protected]> | |
Date: Thu Jun 19 19:37:00 2025 -0700 | |
bufferization-fixes | |
Signed-off-by: Benoit Jacob <[email protected]> | |
diff --git a/compiler/src/iree/compiler/Codegen/Common/IREEComprehensiveBufferizePass.cpp b/compiler/src/iree/compiler/Codegen/Common/IREEComprehensiveBufferizePass.cpp | |
index 7ca1f240f4..714f9bf681 100644 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
hal.executable public @prefill_bs4$async_dispatch_23 { | |
hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "+sramecc,-xnack", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_wo |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
index 9b8292d052..ddc1fbc04d 100644 | |
--- a/compiler/plugins/target/ROCM/builtins/tuning/iree_default_tuning_spec_gfx942.mlir | |
+++ b/compiler/plugins/target/ROCM/builtins/tuning/iree_default_tuning_spec_gfx942.mlir | |
@@ -925,6 +925,9 @@ util.func private @pingpong_medium_f8_expanded(%lhs_base: !mexp_in_ty_f8, %rhs_b | |
%c64 = arith.constant 64 : index | |
%c128 = arith.constant 128 : index | |
%c256 = arith.constant 256 : index | |
+ %c16384 = arith.constant 16384 : index | |
+ %c32768 = arith.constant 32768 : index | |
+ |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
.LBB0_3: | |
ds_read2st64_b64 v[108:111], v91 offset1:4 | |
ds_read2st64_b64 v[112:115], v91 offset0:8 offset1:12 | |
ds_read_b64 v[148:149], v83 | |
ds_read_b64 v[150:151], v84 | |
ds_read_b64 v[152:153], v85 | |
ds_read_b64 v[154:155], v86 | |
ds_read2st64_b64 v[116:119], v92 offset1:4 | |
ds_read2st64_b64 v[120:123], v92 offset0:8 offset1:12 | |
ds_read_b64 v[156:157], v89 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
diff --git a/compiler/plugins/target/ROCM/builtins/tuning/iree_default_tuning_spec_gfx942.mlir b/compiler/plugins/target/ROCM/builtins/tuning/iree_default_tuning_spec_gfx942.mlir | |
index 53a82acdd7..d3d72d62c6 100644 | |
--- a/compiler/plugins/target/ROCM/builtins/tuning/iree_default_tuning_spec_gfx942.mlir | |
+++ b/compiler/plugins/target/ROCM/builtins/tuning/iree_default_tuning_spec_gfx942.mlir | |
@@ -33,6 +33,11 @@ | |
!mshared_f8 = memref<128x128xf8E4M3FNUZ, #gpu.address_space<workgroup>> | |
!mshared_exp_f8 = memref<8x16x4x32xf8E4M3FNUZ, #gpu.address_space<workgroup>> | |
+!unified_flat_shared_f8 = memref<49152xf8E4M3FNUZ, #gpu.address_space<workgroup>> | |
+!flat_shared_f8_at_offset_16384 = memref<32768xf8E4M3FNUZ, strided<[1], offset: 16384>, #gpu.address_space<workgroup>> |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
- rocprofv3 configuration: | |
- advanced_thread_trace = True | |
- agent_index = None | |
- att_buffer_size = 0x60000000 | |
- att_library_path = ['/home/benjacob/gbaraldi'] | |
- att_parse = trace | |
- att_perfcounter_ctrl = None | |
- att_perfcounters = None | |
- att_serialize_all = False |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
__ _ | |
_ __ ___ ___ _ __ _ __ ___ / _| ___ ___ _ __ ___ _ __ _ _| |_ ___ | |
| '__/ _ \ / __| '_ \| '__/ _ \| |_ _____ / __/ _ \| '_ ` _ \| '_ \| | | | __/ _ \ | |
| | | (_) | (__| |_) | | | (_) | _|_____| (_| (_) | | | | | | |_) | |_| | || __/ | |
|_| \___/ \___| .__/|_| \___/|_| \___\___/|_| |_| |_| .__/ \__,_|\__\___| | |
|_| |_| | |
[32mINFO[0m Analysis mode = cli | |
[32mINFO[0m [analysis] deriving rocprofiler-compute metrics... |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
➜ llama3 git:(main) ✗ /home/benjacob/TheRock/build/profiler/rocprofiler-sdk/dist/bin/rocprofv3 -i ~/att.json -d $HOME/traces -- /home/benjacob/iree-build/tools/iree-benchmark-module --device=hip://0 --device_allocator=caching --hip_use_streams=true --module=/home/benjacob/iree-model-benchmark/llama3/tmp/base.8b_fp8.vmfb --parameters=model=8b_fp8.irpa --function=prefill_bs4 --input=4x2048xi64=@/home/benjacob/iree-model-benchmark/llama3/inputs/8b_fp8/args_bs4_2048/prefill_token_ids_4x2048xi64.bin --input=4xi64=@/home/benjacob/iree-model-benchmark/llama3/inputs/8b_fp8/args_bs4_2048/prefill_seq_lens_4xi64.bin --input=4x64xi64=@/home/benjacob/iree-model-benchmark/llama3/inputs/8b_fp8/args_bs4_2048/prefill_seq_block_ids_4x64xi64.bin --input=261x2097152xf8E4M3FNUZ=@/home/benjacob/iree-model-benchmark/llama3/inputs/8b_fp8/args_bs4_2048/prefill_cache_state_261x2097152xf8E4M3FNUZ.bin --benchmark_repetitions=3 | |
E20250513 07:41:48.067751 131443649423296 output_stream.cpp:108] Opened result file: /home/benjacob/traces/Sh |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
func.func @matmul(%lhs: tensor<?x4096xf8E4M3FNUZ>, %rhs: tensor<4096x4096xf8E4M3FNUZ>) -> tensor<?x4096xf32> { | |
%c0 = arith.constant 0 : index | |
%c256 = arith.constant 256 : index | |
%m = tensor.dim %lhs, %c0 : tensor<?x4096xf8E4M3FNUZ> | |
%m_outer = arith.divsi %m, %c256 : index | |
%lhs_expanded = tensor.expand_shape %lhs [[0, 1], [2]] output_shape [%m_outer, 256, 4096] : tensor<?x4096xf8E4M3FNUZ> into tensor<?x256x4096xf8E4M3FNUZ> | |
%init_acc = tensor.empty(%m_outer) : tensor<?x256x4096xf32> | |
%c0_acc_type = arith.constant 0.0: f32 | |
%acc = linalg.fill ins(%c0_acc_type : f32) outs(%init_acc : tensor<?x256x4096xf32>) -> tensor<?x256x4096xf32> | |
%result_expanded = linalg.generic { |
NewerOlder