Skip to content

Instantly share code, notes, and snippets.

@AmosLewis
Last active March 27, 2025 21:54
Show Gist options
  • Save AmosLewis/f9f71aec560170fcf58a48ec96a6e893 to your computer and use it in GitHub Desktop.
Save AmosLewis/f9f71aec560170fcf58a48ec96a6e893 to your computer and use it in GitHub Desktop.
# ssh chi@SharkMi300X
# iree-3.4.0rc20250327
# build iree with tracy
#
# Configures and builds IREE at tag iree-3.4.0rc20250327 into
# ../iree-build-trace/ (relative to the IREE source checkout, which must be
# the current directory because of `-S .`).
# The configure arguments are kept in an array so each group can be commented.
iree_cmake_args=(
  # Generator, build directory and source directory.
  -G Ninja -B ../iree-build-trace/ -S .
  -DCMAKE_BUILD_TYPE=Release
  -DIREE_ENABLE_ASSERTIONS=ON
  -DIREE_ENABLE_SPLIT_DWARF=ON
  -DIREE_ENABLE_THIN_ARCHIVES=ON
  # Route C/C++ compiles through ccache.
  -DCMAKE_C_COMPILER_LAUNCHER=ccache
  -DCMAKE_CXX_COMPILER_LAUNCHER=ccache
  -DIREE_BUILD_PYTHON_BINDINGS=ON
  # `command -v` is the shell's own lookup; it replaces the external `which`,
  # whose output and exit status differ between systems.
  -DPython3_EXECUTABLE="$(command -v python)"
  -DCMAKE_C_COMPILER=clang
  -DCMAKE_CXX_COMPILER=clang++
  # Tracing switches: these are what make this the "tracy" build.
  -DIREE_ENABLE_RUNTIME_TRACING=ON
  -DIREE_BUILD_TRACY=ON
  # ROCm compiler target backend and HIP runtime HAL driver.
  -DIREE_TARGET_BACKEND_ROCM=ON
  -DIREE_HAL_DRIVER_HIP=ON
  -DIREE_ENABLE_LLD=ON
)
# Chain the steps with && so a failed checkout or configure does not go on to
# build a stale or wrong tree. No `exit` is used, so this stays safe to paste
# into an interactive shell.
git checkout iree-3.4.0rc20250327 &&
  cmake "${iree_cmake_args[@]}" &&
  cmake --build ../iree-build-trace/
###################################################################################################
# shark-ai 0327 export mlir
# shark-ai commit:
# 698bceab2de5705884d6fbde41ad4f908a7a00c2
# Change ShardedTensor.clone to error out on wrong args (#1179)
###################################################################################################
# python3 -m sharktank.examples.export_paged_llm_v1 --irpa-file=/sharedfile/attn/fp8_attn.irpa \
# --output-mlir=/sharedfile/attn/128/fp8_attn.mlir \
# --output-config=/sharedfile/attn/128/config_attn.json \
# --bs-prefill=4 --bs-decode=4 --attention-kernel sharktank \
# --attention-dtype=float8_e4m3fnuz --activation-dtype=bfloat16 --use-attention-mask --use-hf --kv-cache-dtype=float8_e4m3fnuz
# compile since iree-3.4.0rc20250327
# the irpa file:
# wget https://sharkblobs.blob.core.windows.net/chi/llama_8b_fp8_attn
# this MLIR file gets 31 ms:
# wget https://sharkpublic.blob.core.windows.net/sharkpublic/chi/llama/atten/fp8_attn_0327.mlir
# this MLIR file gets 25.6 ms:
# wget https://sharkpublic.blob.core.windows.net/sharkpublic/chi/llama/atten/fp8_attn_i907_0320.mlir
# Compile the exported attention MLIR to a .vmfb with the iree-compile binary
# from the tracing build tree. Arguments are listed in an array (same order as
# they are passed on the command line) so they can be annotated.
iree_compile_args=(
  # Input MLIR.
  /sharedfile/attn/128/fp8_attn_0327.mlir
  --iree-hip-target=gfx942
  # Output module; the run step loads this same path via --module.
  -o=/sharedfile/attn/128/fp8_attn_tracy_iree0327_mlir0327.vmfb
  --iree-hal-target-device=hip
  --iree-opt-level=O3
  --iree-hal-indirect-command-buffers=true
  --iree-stream-resource-memory-model=discrete
  --iree-hal-memoization=true
  --iree-hal-executable-debug-level=3
  # "dump" is a relative path, so it resolves against the current directory.
  --iree-hal-dump-executable-sources-to=dump
)
/home/chi/src/iree-build-trace/tools/iree-compile "${iree_compile_args[@]}"
# all inputs in sharkpublic/chi/llama/input/
# Run the prefill_bs4 entry point of the compiled module with the
# tracing-enabled iree-run-module. Arguments are collected in an array and
# passed in the same order as before.
iree_run_args=(
  --hip_use_streams=true
  --module=/sharedfile/attn/128/fp8_attn_tracy_iree0327_mlir0327.vmfb
  --parameters=model=/sharedfile/attn/fp8_attn.irpa
  --device=hip://4
  --function=prefill_bs4
  # Inputs are given as <shape>x<dtype>=@<binary file>, in this order:
  # token ids, sequence lengths, sequence block ids, cache state.
  --input=4x128xi64=@/sharedfile/128/prefill/prefill_token_ids_4x128xi64.bin
  --input=4xi64=@/sharedfile/128/prefill/prefill_seq_lens_4xi64.bin
  --input=4x4xi64=@/sharedfile/128/prefill/prefill_seq_block_ids_4x4xi64.bin
  --input=261x2097152xf8E4M3FNUZ=@/sharedfile/128/prefill/prefill_cache_state_261x2097152xf8E4M3FNUZ.bin
)
# Both variables are set for this one command only.
# NOTE(review): TRACY_NO_EXIT=1 presumably keeps the process alive until a
# Tracy capture client has collected the trace -- confirm against Tracy docs.
TRACY_NO_EXIT=1 ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
  /home/chi/src/iree-build-trace/tools/iree-run-module "${iree_run_args[@]}"
# EXEC @prefill_bs4
# result[0]: hal.buffer_view
# 4x128x128256xf32=[[2.38743 1.13533 0.0316274 0.404722 -2.59714 -0.0234363 4.38464 4.32849 0.552629 0.267567 -2.72748 5.84252 2.77527 4.22566 2.19176 -2.30531 0.604199 0.899531 -1.10129 1.12569 0.869303 2.23187 -0.0645586 0.158095 -0.140919 -0.0600918 1.4782 0.74488 -0.94031 0.467049 1.19404 2.37265 -0.148914 1.62083 1.7 ...
# ...][...][...]]
# In another terminal, run the capture tool below; it prints:
# Capture the Tracy trace with the capture tool from the tracing build tree.
# Run this from a second terminal, separate from the profiled program.
# The original line carried a pasted interactive prompt ("(.venv) ➜ tracy "),
# which is a shell syntax error when executed; it has been removed.
# NOTE(review): that prompt suggests the command was run inside a Python venv
# from a directory named "tracy". The output file is a relative path, so it is
# written to whatever the current directory is -- confirm the intended location.
# NOTE(review): -o names the output trace; -f presumably overwrites an
# existing file -- confirm against the Tracy capture usage text.
/home/chi/src/iree-build-trace/tracy/iree-tracy-capture -f -o 8b_fp8_prefill_bs4_128_iree0327_mlir0327.tracy
# Connecting to 127.0.0.1:8086...
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment