Skip to content

Instantly share code, notes, and snippets.

@AmosLewis
Last active April 24, 2025 21:31
Show Gist options
  • Save AmosLewis/b6cc009e135fa0a0664ae5d1ea16b3f0 to your computer and use it in GitHub Desktop.
Save AmosLewis/b6cc009e135fa0a0664ae5d1ea16b3f0 to your computer and use it in GitHub Desktop.
# run on SharkMI300X
# cd /sharedfile/attn/bisect
#./export_run_build.sh
# Check whether the optional command-line arguments ($1 = iree_day, $2 = shark_day) were provided
# Resolve the two optional positional tags. They are only used to build the
# output file names below. An empty or missing argument falls back to the
# built-in default and the fallback is announced on stdout.
iree_day="${1:-}"
if [ -z "$iree_day" ]; then
  iree_day="042411"
  echo "No flag provided. Using default iree_day $iree_day."
fi

shark_day="${2:-}"
if [ -z "$shark_day" ]; then
  shark_day="042411"
  echo "No flag provided. Using default shark_day $shark_day."
fi

# Model parameter archive passed to the exporter and the benchmark runs.
# irpa_path=/sharedfile/attn/fp8_attn.irpa
irpa_path=/shark-dev/8b/fp8/attnf8/native_fp8_e4m3fnuz_llama3_8b.irpa

# Every artifact of the 128 configuration shares one path prefix and differs
# only in its extension.
out_prefix_128="/sharedfile/attn/128/out/fp8_attn_iree${iree_day}.shark${shark_day}"
mlir_path_128="${out_prefix_128}.mlir"
config_128="${out_prefix_128}.json"
vmfb_128="${out_prefix_128}.vmfb"
benchmark_128_prefill="${out_prefix_128}.prefill.txt"
benchmark_128_decode="${out_prefix_128}.decode.txt"
# Ensure a regular file exists at the given path, creating it (and any
# missing parent directories) when it is absent.
# Arguments: $1 - path of the file to create
# Outputs:   "File created: <path>" or "File already exists: <path>" on
#            stdout; a diagnostic on stderr when creation fails
# Returns:   0 on success, 1 if the path is empty or cannot be created
create_file_if_not_exists() {
  local FILE="$1"
  if [ -z "$FILE" ]; then
    echo "create_file_if_not_exists: missing file path" >&2
    return 1
  fi
  if [ -f "$FILE" ]; then
    echo "File already exists: $FILE"
    return 0
  fi
  # touch cannot create intermediate directories, so make them first, and
  # only report success when the file was really created.
  if mkdir -p -- "$(dirname -- "$FILE")" && touch -- "$FILE"; then
    echo "File created: $FILE"
  else
    echo "Failed to create file: $FILE" >&2
    return 1
  fi
}
# Print the resolved artifact paths for the 128 configuration and pre-create
# both benchmark log files. Expansions are quoted so that a tag containing
# whitespace or glob characters cannot split or expand the paths.
printf '%s\n' "$mlir_path_128"
printf '%s\n' "$config_128"
printf '%s\n' "$vmfb_128"
printf '%s\n' "$benchmark_128_prefill"
create_file_if_not_exists "$benchmark_128_prefill"
printf '%s\n' "$benchmark_128_decode"
create_file_if_not_exists "$benchmark_128_decode"
# python3 \
# -m \
# sharktank.examples.export_paged_llm_v1 \
# --irpa-file=/shark-dev/8b/fp8/attnf8/native_fp8_e4m3fnuz_llama3_8b.irpa \
# --output-mlir=/home/chi/src/shark-ai/2025-04-13/llama-8b/fp8_attnf8_128_tp1.mlir \
# --output-config=/home/chi/src/shark-ai/2025-04-13/llama-8b/fp8_attnf8_128_tp1.json \
# --bs-prefill=4 \
# --bs-decode=4 \
# --block-seq-stride=32 \
# --attention-dtype=float8_e4m3fnuz \
# --activation-dtype=bfloat16 \
# --kv-cache-dtype=float8_e4m3fnuz \
# --attention-kernel=sharktank \
# --use-attention-mask \
# --use-hf \
# Run the sharktank paged-LLM exporter on the parameter archive, writing the
# MLIR module and JSON config for the 128 configuration (batch size 4 for
# both prefill and decode).
# The attention-kernel flag is spelled out in full; the previous truncated
# spelling only worked if the option parser accepted unambiguous prefixes.
python3 -m sharktank.examples.export_paged_llm_v1 \
  --irpa-file="$irpa_path" \
  --output-mlir="$mlir_path_128" \
  --output-config="$config_128" \
  --bs-prefill=4 \
  --bs-decode=4 \
  --attention-kernel=sharktank \
  --attention-dtype=float8_e4m3fnuz \
  --activation-dtype=bfloat16 \
  --use-attention-mask \
  --use-hf \
  --kv-cache-dtype=float8_e4m3fnuz
# Compile the exported 128 MLIR module to a .vmfb with iree-compile,
# targeting the HIP device / gfx942. Path expansions are quoted so they are
# always passed as single arguments.
/home/chi/src/iree-build/tools/iree-compile \
  "$mlir_path_128" \
  --iree-hip-target=gfx942 \
  -o="$vmfb_128" \
  --iree-hal-target-device=hip \
  --iree-opt-level=O3 \
  --iree-hal-indirect-command-buffers=true \
  --iree-stream-resource-memory-model=discrete \
  --iree-hal-memoization=true
# Benchmark the prefill_bs4 entry point of the 128 module on hip://4 with
# 10 repetitions; stdout is shown and also written to the prefill log by tee.
# ROCR_VISIBLE_DEVICES is set for this command only.
# NOTE: the pipeline's exit status is tee's, not the benchmark's.
ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
  /home/chi/src/iree-build/tools/iree-benchmark-module \
  --hip_use_streams=true \
  --module="$vmfb_128" \
  --parameters=model="$irpa_path" \
  --device=hip://4 \
  --function=prefill_bs4 \
  --input=4x128xi64=@/sharedfile/128/prefill/prefill_token_ids_4x128xi64.bin \
  --input=4xi64=@/sharedfile/128/prefill/prefill_seq_lens_4xi64.bin \
  --input=4x4xi64=@/sharedfile/128/prefill/prefill_seq_block_ids_4x4xi64.bin \
  --input=261x2097152xf8E4M3FNUZ=@/sharedfile/128/prefill/prefill_cache_state_261x2097152xf8E4M3FNUZ.bin \
  --benchmark_repetitions=10 \
  | tee "$benchmark_128_prefill"
# Benchmark the decode_bs4 entry point of the 128 module on hip://4 with
# 10 repetitions; stdout is shown and also written to the decode log by tee.
# ROCR_VISIBLE_DEVICES is set for this command only.
# NOTE: the pipeline's exit status is tee's, not the benchmark's.
ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
  /home/chi/src/iree-build/tools/iree-benchmark-module \
  --hip_use_streams=true \
  --module="$vmfb_128" \
  --parameters=model="$irpa_path" \
  --device=hip://4 \
  --function=decode_bs4 \
  --input=4x1xi64=@/sharedfile/128/decode/decode_next_tokens_4x1xi64.bin \
  --input=4xi64=@/sharedfile/128/decode/decode_seq_lens_4xi64.bin \
  --input=4xi64=@/sharedfile/128/decode/decode_start_positions_4xi64.bin \
  --input=4x5xi64=@/sharedfile/128/decode/decode_seq_block_ids_tensor_4x5xi64.bin \
  --input=261x2097152xf8E4M3FNUZ=@/sharedfile/128/decode/decode_cache_state_261x2097152xf8E4M3FNUZ.bin \
  --benchmark_repetitions=10 \
  | tee "$benchmark_128_decode"
###########################################2048###############################################################################################
# Artifact locations for the 2048 configuration. All five files share one
# prefix built from the iree_day / shark_day tags and differ only in suffix.
out_prefix_2048="/sharedfile/attn/2048/out/fp8_attn_iree${iree_day}.shark${shark_day}"
mlir_path_2048="${out_prefix_2048}.mlir"
config_2048="${out_prefix_2048}.json"
vmfb_2048="${out_prefix_2048}.vmfb"
benchmark_2048_prefill="${out_prefix_2048}.prefill.txt"
benchmark_2048_decode="${out_prefix_2048}.decode.txt"
# Print the resolved artifact paths for the 2048 configuration and pre-create
# both benchmark log files. Expansions are quoted so that a tag containing
# whitespace or glob characters cannot split or expand the paths.
printf '%s\n' "$mlir_path_2048"
printf '%s\n' "$config_2048"
printf '%s\n' "$vmfb_2048"
printf '%s\n' "$benchmark_2048_prefill"
create_file_if_not_exists "$benchmark_2048_prefill"
printf '%s\n' "$benchmark_2048_decode"
create_file_if_not_exists "$benchmark_2048_decode"
# echo $mlir_path_2048
# Run the sharktank paged-LLM exporter on the parameter archive, writing the
# MLIR module and JSON config for the 2048 configuration (batch size 4 for
# both prefill and decode). One flag per line, with quoted path expansions.
python3 -m sharktank.examples.export_paged_llm_v1 \
  --irpa-file="$irpa_path" \
  --output-mlir="$mlir_path_2048" \
  --output-config="$config_2048" \
  --bs-prefill=4 \
  --bs-decode=4 \
  --attention-kernel sharktank \
  --attention-dtype=float8_e4m3fnuz \
  --activation-dtype=bfloat16 \
  --use-attention-mask \
  --use-hf \
  --kv-cache-dtype=float8_e4m3fnuz
# Compile the exported 2048 MLIR module to a .vmfb with iree-compile,
# targeting the HIP device / gfx942.
# The output flag must be separated from the line-continuation backslash by
# whitespace; otherwise the next line's flag is appended to the output file
# name and --iree-hal-target-device is never passed.
/home/chi/src/iree-build/tools/iree-compile \
  "$mlir_path_2048" \
  --iree-hip-target=gfx942 \
  -o="$vmfb_2048" \
  --iree-hal-target-device=hip \
  --iree-opt-level=O3 \
  --iree-hal-indirect-command-buffers=true \
  --iree-stream-resource-memory-model=discrete \
  --iree-hal-memoization=true
# Benchmark the prefill_bs4 entry point of the 2048 module on hip://4 with
# 10 repetitions; stdout is shown and also written to the prefill log by tee.
# ROCR_VISIBLE_DEVICES is set for this command only.
# NOTE: the pipeline's exit status is tee's, not the benchmark's.
ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
  /home/chi/src/iree-build/tools/iree-benchmark-module \
  --hip_use_streams=true \
  --module="$vmfb_2048" \
  --parameters=model="$irpa_path" \
  --device=hip://4 \
  --function=prefill_bs4 \
  --input=4x2048xi64=@/sharedfile/2048/prefill/prefill_token_ids_4x2048xi64.bin \
  --input=4xi64=@/sharedfile/2048/prefill/prefill_seq_lens_4xi64.bin \
  --input=4x64xi64=@/sharedfile/2048/prefill/prefill_seq_block_ids_4x64xi64.bin \
  --input=261x2097152xf8E4M3FNUZ=@/sharedfile/2048/prefill/prefill_cache_state_261x2097152xf8E4M3FNUZ.bin \
  --benchmark_repetitions=10 \
  | tee "$benchmark_2048_prefill"
# Benchmark the decode_bs4 entry point of the 2048 module on hip://4 with
# 10 repetitions; stdout is shown and also written to the decode log by tee.
# ROCR_VISIBLE_DEVICES is set for this command only.
# NOTE: the pipeline's exit status is tee's, not the benchmark's.
ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
  /home/chi/src/iree-build/tools/iree-benchmark-module \
  --hip_use_streams=true \
  --module="$vmfb_2048" \
  --parameters=model="$irpa_path" \
  --device=hip://4 \
  --function=decode_bs4 \
  --input=4x1xi64=@/sharedfile/2048/decode/decode_next_tokens_4x1xi64.bin \
  --input=4xi64=@/sharedfile/2048/decode/decode_seq_lens_4xi64.bin \
  --input=4xi64=@/sharedfile/2048/decode/decode_start_positions_4xi64.bin \
  --input=4x65xi64=@/sharedfile/2048/decode/decode_seq_block_ids_tensor_4x65xi64.bin \
  --input=261x2097152xf8E4M3FNUZ=@/sharedfile/2048/decode/decode_cache_state_261x2097152xf8E4M3FNUZ.bin \
  --benchmark_repetitions=10 \
  | tee "$benchmark_2048_decode"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment