Last active
April 24, 2025 21:31
-
-
Save AmosLewis/b6cc009e135fa0a0664ae5d1ea16b3f0 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Run on SharkMI300X:
#   cd /sharedfile/attn/bisect
#   ./export_run_build.sh
# Check whether command-line arguments are provided.
# Positional arguments (both optional); the values are embedded in the names
# of the output files defined further down in this script.
#   $1 - iree_day  (default: 042411)
#   $2 - shark_day (default: 042411)
iree_day="${1:-}"
if [ -z "$iree_day" ]; then
  iree_day="042411"
  echo "No flag provided. Using default iree_day $iree_day."
fi

shark_day="${2:-}"
if [ -z "$shark_day" ]; then
  shark_day="042411"
  echo "No flag provided. Using default shark_day $shark_day."
fi
# Model parameter archive passed to both the exporter and the benchmarks.
# irpa_path=/sharedfile/attn/fp8_attn.irpa
irpa_path=/shark-dev/8b/fp8/attnf8/native_fp8_e4m3fnuz_llama3_8b.irpa

# Output artifacts for the 128 configuration. Every file shares one prefix
# that embeds iree_day and shark_day, so different runs do not overwrite
# each other.
out_prefix_128="/sharedfile/attn/128/out/fp8_attn_iree${iree_day}.shark${shark_day}"
mlir_path_128="${out_prefix_128}.mlir"
config_128="${out_prefix_128}.json"
vmfb_128="${out_prefix_128}.vmfb"
benchmark_128_prefill="${out_prefix_128}.prefill.txt"
benchmark_128_decode="${out_prefix_128}.decode.txt"
#######################################
# Ensure that a regular file exists at the given path.
# Creates the parent directory if it is missing, then creates an empty file.
# Arguments: $1 - path of the file to create
# Outputs:   "File created: <path>" or "File already exists: <path>" on
#            stdout; a failure message on stderr
# Returns:   0 if the file exists afterwards, 1 if it could not be created
#######################################
create_file_if_not_exists() {
  local FILE="$1"

  if [ -f "$FILE" ]; then
    echo "File already exists: $FILE"
    return 0
  fi

  # Only report success when both the directory and the file were created;
  # a bare `touch` fails when the parent directory does not exist.
  if mkdir -p -- "$(dirname -- "$FILE")" && touch -- "$FILE"; then
    echo "File created: $FILE"
  else
    echo "Failed to create file: $FILE" >&2
    return 1
  fi
}
# Print the artifact paths for the 128 configuration and make sure the two
# benchmark log files exist before the benchmarks write to them.
echo "$mlir_path_128"
echo "$config_128"
echo "$vmfb_128"
echo "$benchmark_128_prefill"
create_file_if_not_exists "$benchmark_128_prefill"
echo "$benchmark_128_decode"
create_file_if_not_exists "$benchmark_128_decode"
# python3 \
# -m \
# sharktank.examples.export_paged_llm_v1 \
# --irpa-file=/shark-dev/8b/fp8/attnf8/native_fp8_e4m3fnuz_llama3_8b.irpa \
# --output-mlir=/home/chi/src/shark-ai/2025-04-13/llama-8b/fp8_attnf8_128_tp1.mlir \
# --output-config=/home/chi/src/shark-ai/2025-04-13/llama-8b/fp8_attnf8_128_tp1.json \
# --bs-prefill=4 \
# --bs-decode=4 \
# --block-seq-stride=32 \
# --attention-dtype=float8_e4m3fnuz \
# --activation-dtype=bfloat16 \
# --kv-cache-dtype=float8_e4m3fnuz \
# --attention-kernel=sharktank \
# --use-attention-mask \
# --use-hf \
# Export the model to MLIR plus a JSON config for the 128 configuration,
# reading parameters from $irpa_path and writing to $mlir_path_128 and
# $config_128.
python3 -m sharktank.examples.export_paged_llm_v1 \
  --irpa-file="$irpa_path" \
  --output-mlir="$mlir_path_128" \
  --output-config="$config_128" \
  --bs-prefill=4 \
  --bs-decode=4 \
  --attention-kernel=sharktank \
  --attention-dtype=float8_e4m3fnuz \
  --activation-dtype=bfloat16 \
  --use-attention-mask \
  --use-hf \
  --kv-cache-dtype=float8_e4m3fnuz
# Compile the exported 128 MLIR into a VM flatbuffer ($vmfb_128) for the
# HIP device, targeting gfx942.
/home/chi/src/iree-build/tools/iree-compile \
  "$mlir_path_128" \
  --iree-hip-target=gfx942 \
  -o="$vmfb_128" \
  --iree-hal-target-device=hip \
  --iree-opt-level=O3 \
  --iree-hal-indirect-command-buffers=true \
  --iree-stream-resource-memory-model=discrete \
  --iree-hal-memoization=true
# Benchmark prefill_bs4 from the 128 module on hip://4 for 10 repetitions.
# ROCR_VISIBLE_DEVICES is set only for this invocation. Output is shown on
# the terminal and also written to $benchmark_128_prefill via tee.
ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
  /home/chi/src/iree-build/tools/iree-benchmark-module \
  --hip_use_streams=true \
  --module="$vmfb_128" \
  --parameters=model="$irpa_path" \
  --device=hip://4 \
  --function=prefill_bs4 \
  --input=4x128xi64=@/sharedfile/128/prefill/prefill_token_ids_4x128xi64.bin \
  --input=4xi64=@/sharedfile/128/prefill/prefill_seq_lens_4xi64.bin \
  --input=4x4xi64=@/sharedfile/128/prefill/prefill_seq_block_ids_4x4xi64.bin \
  --input=261x2097152xf8E4M3FNUZ=@/sharedfile/128/prefill/prefill_cache_state_261x2097152xf8E4M3FNUZ.bin \
  --benchmark_repetitions=10 \
  | tee "$benchmark_128_prefill"
# Benchmark decode_bs4 from the 128 module on hip://4 for 10 repetitions.
# ROCR_VISIBLE_DEVICES is set only for this invocation. Output is shown on
# the terminal and also written to $benchmark_128_decode via tee.
ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
  /home/chi/src/iree-build/tools/iree-benchmark-module \
  --hip_use_streams=true \
  --module="$vmfb_128" \
  --parameters=model="$irpa_path" \
  --device=hip://4 \
  --function=decode_bs4 \
  --input=4x1xi64=@/sharedfile/128/decode/decode_next_tokens_4x1xi64.bin \
  --input=4xi64=@/sharedfile/128/decode/decode_seq_lens_4xi64.bin \
  --input=4xi64=@/sharedfile/128/decode/decode_start_positions_4xi64.bin \
  --input=4x5xi64=@/sharedfile/128/decode/decode_seq_block_ids_tensor_4x5xi64.bin \
  --input=261x2097152xf8E4M3FNUZ=@/sharedfile/128/decode/decode_cache_state_261x2097152xf8E4M3FNUZ.bin \
  --benchmark_repetitions=10 \
  | tee "$benchmark_128_decode"
########################################### 2048 ###########################################
# Output artifacts for the 2048 configuration. Every file shares one prefix
# that embeds iree_day and shark_day.
out_prefix_2048="/sharedfile/attn/2048/out/fp8_attn_iree${iree_day}.shark${shark_day}"
mlir_path_2048="${out_prefix_2048}.mlir"
config_2048="${out_prefix_2048}.json"
vmfb_2048="${out_prefix_2048}.vmfb"
benchmark_2048_prefill="${out_prefix_2048}.prefill.txt"
benchmark_2048_decode="${out_prefix_2048}.decode.txt"
# Print the artifact paths for the 2048 configuration and make sure the two
# benchmark log files exist before the benchmarks write to them.
echo "$mlir_path_2048"
echo "$config_2048"
echo "$vmfb_2048"
echo "$benchmark_2048_prefill"
create_file_if_not_exists "$benchmark_2048_prefill"
echo "$benchmark_2048_decode"
create_file_if_not_exists "$benchmark_2048_decode"
# echo $mlir_path_2048
# Export the model to MLIR plus a JSON config for the 2048 configuration,
# reading parameters from $irpa_path and writing to $mlir_path_2048 and
# $config_2048.
python3 -m sharktank.examples.export_paged_llm_v1 \
  --irpa-file="$irpa_path" \
  --output-mlir="$mlir_path_2048" \
  --output-config="$config_2048" \
  --bs-prefill=4 \
  --bs-decode=4 \
  --attention-kernel=sharktank \
  --attention-dtype=float8_e4m3fnuz \
  --activation-dtype=bfloat16 \
  --use-attention-mask \
  --use-hf \
  --kv-cache-dtype=float8_e4m3fnuz
# Compile the exported 2048 MLIR into a VM flatbuffer ($vmfb_2048) for the
# HIP device, targeting gfx942.
/home/chi/src/iree-build/tools/iree-compile \
  "$mlir_path_2048" \
  --iree-hip-target=gfx942 \
  -o="$vmfb_2048" \
  --iree-hal-target-device=hip \
  --iree-opt-level=O3 \
  --iree-hal-indirect-command-buffers=true \
  --iree-stream-resource-memory-model=discrete \
  --iree-hal-memoization=true
# Benchmark prefill_bs4 from the 2048 module on hip://4 for 10 repetitions.
# ROCR_VISIBLE_DEVICES is set only for this invocation. Output is shown on
# the terminal and also written to $benchmark_2048_prefill via tee.
ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
  /home/chi/src/iree-build/tools/iree-benchmark-module \
  --hip_use_streams=true \
  --module="$vmfb_2048" \
  --parameters=model="$irpa_path" \
  --device=hip://4 \
  --function=prefill_bs4 \
  --input=4x2048xi64=@/sharedfile/2048/prefill/prefill_token_ids_4x2048xi64.bin \
  --input=4xi64=@/sharedfile/2048/prefill/prefill_seq_lens_4xi64.bin \
  --input=4x64xi64=@/sharedfile/2048/prefill/prefill_seq_block_ids_4x64xi64.bin \
  --input=261x2097152xf8E4M3FNUZ=@/sharedfile/2048/prefill/prefill_cache_state_261x2097152xf8E4M3FNUZ.bin \
  --benchmark_repetitions=10 \
  | tee "$benchmark_2048_prefill"
# Benchmark decode_bs4 from the 2048 module on hip://4 for 10 repetitions.
# ROCR_VISIBLE_DEVICES is set only for this invocation. Output is shown on
# the terminal and also written to $benchmark_2048_decode via tee.
ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
  /home/chi/src/iree-build/tools/iree-benchmark-module \
  --hip_use_streams=true \
  --module="$vmfb_2048" \
  --parameters=model="$irpa_path" \
  --device=hip://4 \
  --function=decode_bs4 \
  --input=4x1xi64=@/sharedfile/2048/decode/decode_next_tokens_4x1xi64.bin \
  --input=4xi64=@/sharedfile/2048/decode/decode_seq_lens_4xi64.bin \
  --input=4xi64=@/sharedfile/2048/decode/decode_start_positions_4xi64.bin \
  --input=4x65xi64=@/sharedfile/2048/decode/decode_seq_block_ids_tensor_4x65xi64.bin \
  --input=261x2097152xf8E4M3FNUZ=@/sharedfile/2048/decode/decode_cache_state_261x2097152xf8E4M3FNUZ.bin \
  --benchmark_repetitions=10 \
  | tee "$benchmark_2048_decode"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment