Skip to content

Instantly share code, notes, and snippets.

python3 -m sharktank.examples.export_paged_llm_v1 --irpa-file=/shark-dev/llama3.1/8b/fp16/weight/8b_fp16.irpa --output-mlir=/sharedfile/f8/128/out/fp8_attn_iree0828.shark0828_2bb_fp16irpa_kvfp8.mlir --output-config=/sharedfile/f8/128/out/fp8_attn_iree0828.shark0828_2bb_fp16irpa_kvfp8.json --bs-prefill=4 --bs-decode=4 --attention-kernel=sharktank --attention-dtype=float16 --activation-dtype=float16 --use-attention-mask --use-hf --kv-cache-dtype=float8_e4m3fn
/home/chiliu12/src/shark-ai/.venv12/lib/python3.12/site-packages/iree/turbine/aot/params.py:163: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at ../torch/csrc/utils/tensor_numpy.cpp:206.)
return torch.from_numpy(wr
((.venv12) ) ➜ shark-ai git:(2bb2d590b) ✗ /sharedfile/f8/export_run_f8_8b_tp1.sh
No flag provided. Using default iree_day 0828.
No flag provided. Using default shark_day 0828_2bb_kv16.
/sharedfile/f8/128/out/fp8_attn_iree0828.shark0828_2bb_kv16.mlir
/sharedfile/f8/128/out/fp8_attn_iree0828.shark0828_2bb_kv16.json
/sharedfile/f8/128/out/fp8_attn_iree0828.shark0828_2bb_kv16.vmfb
/sharedfile/f8/128/out/fp8_attn_iree0828.shark0828_2bb_kv16.prefill.txt
File already exists: /sharedfile/f8/128/out/fp8_attn_iree0828.shark0828_2bb_kv16.prefill.txt
/sharedfile/f8/128/out/fp8_attn_iree0828.shark0828_2bb_kv16.decode.txt
File already exists: /sharedfile/f8/128/out/fp8_attn_iree0828.shark0828_2bb_kv16.decode.txt
((.venv12) ) ➜ shark-ai git:(main) ✗ /sharedfile/f4/export_run_f4_405b_pp1_bs4.sh
No flag provided. Using default iree_day 0902.
No flag provided. Using default shark_day 0903_4aa.
/sharedfile/f4/2500/405b/pp1/out/f4_bs4_ds4.iree0902.shark0903_4aa.mlir
/sharedfile/f4/2500/405b/pp1/out/f4_bs4_ds4.iree0902.shark0903_4aa.json
/sharedfile/f4/2500/405b/pp1/out/f4_bs4_ds4.iree0902.shark0903_4aa.vmfb
/sharedfile/f4/2500/405b/pp1/out/f4_bs4_ds4.iree0902.shark0903_4aa.prefill.txt
File created: /sharedfile/f4/2500/405b/pp1/out/f4_bs4_ds4.iree0902.shark0903_4aa.prefill.txt
/sharedfile/f4/2500/405b/pp1/out/f4_bs4_ds4.iree0902.shark0903_4aa.decode.txt
File created: /sharedfile/f4/2500/405b/pp1/out/f4_bs4_ds4.iree0902.shark0903_4aa.decode.txt
((.venv12) ) ➜ shark-ai git:(main) ✗ /sharedfile/f4/export_run_f4_405b_pp1_bs4.sh
No flag provided. Using default iree_day 0903.
No flag provided. Using default shark_day 0903_4aa.
/sharedfile/f4/2500/405b/pp1/out/f4_bs4_ds4.iree0903.shark0903_4aa.mlir
/sharedfile/f4/2500/405b/pp1/out/f4_bs4_ds4.iree0903.shark0903_4aa.json
/sharedfile/f4/2500/405b/pp1/out/f4_bs4_ds4.iree0903.shark0903_4aa.vmfb
/sharedfile/f4/2500/405b/pp1/out/f4_bs4_ds4.iree0903.shark0903_4aa.prefill.txt
File already exists: /sharedfile/f4/2500/405b/pp1/out/f4_bs4_ds4.iree0903.shark0903_4aa.prefill.txt
/sharedfile/f4/2500/405b/pp1/out/f4_bs4_ds4.iree0903.shark0903_4aa.decode.txt
File already exists: /sharedfile/f4/2500/405b/pp1/out/f4_bs4_ds4.iree0903.shark0903_4aa.decode.txt
root@smci355-ccs-aus-n10-09:/mlperf/harness# ./run_offline.sh --shortfin-config shortfin_405b_config_fp4.json
Warning: Missing argument '--test-mode'
Info: Defaulting to test mode 'PerformanceOnly'
Warning: Missing argument '--test-scenario'
Info: Defaulting to test scenario 'Offline'
INFO:root:####################################################################################################################################################################################
Running python3.11 harness_alt_mi355.py --devices 0,1,2,3,4,5,6,7 --scenario Offline --test_mode PerformanceOnly --prefill_bs 4 --decode_bs 4 --user_conf_path user.conf --count 50 --tensor_path /data/mlperf_llama3.1_405b_dataset_8313_processed_fp16_eval.pkl --logfile_outdir OutputOfflinePerformanceOnly --debug False --verbose False --user_conf_path user.conf --shortfin_config shortfin_405b_config_fp4.json
##########################################################################################################################################
(.venv) ➜ wave git:(main) /sharedfile/f4/export_run_f4_405b_pp1_bs4.sh
No flag provided. Using default iree_day 0822.
No flag provided. Using default shark_day 0822_add_wave_gemm_optims.
export model: python3 -m sharktank.examples.export_paged_llm_v1 --irpa-file=/shark-dev/llama3.1/405b/instruct/weights/fp4/fp4_2025_07_10_fn.irpa --output-mlir=/sharedfile/f4/128/405b/pp1/out/f4_bs4_ds4.iree0822.shark0822_add_wave_gemm_optims.mlir --output-config=/sharedfile/f4/128/405b/pp1/out/f4_bs4_ds4.iree0822.shark0822_add_wave_gemm_optims.json --bs-prefill=4 --bs-decode=4 --block-seq-stride=32 --attention-dtype=float16 --activation-dtype=float16 --attention-kernel=torch --kv-cache-dtype=float8_e4m3fn --use-hf --top-k=1
/home/chiliu12/src/shark-ai/.venv/lib/python3.11/site-packages/iree/turbine/aot/params.py:163: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined

nod-ai/shark-ai#2088

python -m sharktank.tools.run_llm_vmfb \
--prompt /data/mlperf_llama/artifacts/chi/real_inputs/prompt_input_128.txt \
--irpa /shark-dev/llama3.1/405b/instruct/weights/fp4/fp4_2025_07_10_fn.irpa \
--vmfb /sharedfile/f4/2500/405b/pp1/out/f4_bs4_ds4.iree0827.shark0827_554.vmfb \
--config /sharedfile/f4/2500/405b/pp1/out/f4_bs4_ds4.iree0827.shark0827_554.json \
--tokenizer /shark-dev/llama3.1/405b/instruct/weights/fp4/tokenizer.json \
--tokenizer_config /shark-dev/llama3.1/405b/instruct/weights/fp4/tokenizer_config.json
(.venv) ➜ shark-ai git:(main) ✗ /sharedfile/f4/export_run_f4_405b_pp1_bs4.sh
No flag provided. Using default iree_day 0818.
No flag provided. Using default shark_day 0820.
/sharedfile/f4/128/405b/pp1/out/f4_bs4_ds4.iree0818.shark0820.mlir
/sharedfile/f4/128/405b/pp1/out/f4_bs4_ds4.iree0818.shark0820.json
/sharedfile/f4/128/405b/pp1/out/f4_bs4_ds4.iree0818.shark0820.prefill.vmfb
/sharedfile/f4/128/405b/pp1/out/f4_bs4_ds4.iree0818.shark0820.prefill.vmfb
/sharedfile/f4/128/405b/pp1/out/f4_bs4_ds4.iree0818.shark0820.prefill.txt
File created: /sharedfile/f4/128/405b/pp1/out/f4_bs4_ds4.iree0818.shark0820.prefill.txt
/sharedfile/f4/128/405b/pp1/out/f4_bs4_ds4.iree0818.shark0820.decode.txt
INFO:eval
{
"perplexities": [
3.323654,
6.722357,
2.652504,
9.565708
],
"mean_perplexity": 5.566056
}
root@smci355-ccs-aus-n10-09:/mlperf/harness# ./run_offline.sh --shortfin-config shortfin_8b_config_fp16.json
Warning: Missing argument '--test-mode'
Info: Defaulting to test mode 'PerformanceOnly'
Warning: Missing argument '--test-scenario'
Info: Defaulting to test scenario 'Offline'
INFO:root:####################################################################################################################################################################################
Running python3.11 harness_alt_mi355.py --devices 0,1,2,3,4,5,6,7 --scenario Offline --test_mode PerformanceOnly --prefill_bs 4 --decode_bs 4 --user_conf_path user.conf --tensor_path /data/mlperf_llama3.1_405b_dataset_8313_processed_fp16_eval.pkl --logfile_outdir OutputOfflinePerformanceOnly --debug False --verbose False --user_conf_path user.conf --shortfin_config shortfin_8b_config_fp16.json
#######################################################################################################################################################