Last active
February 12, 2025 18:59
-
-
Save AmosLewis/b3252e6fe7ec287928562c0e3199cf5e to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Patch used: https://github.com/nod-ai/shark-ai/pull/896 | |
IREE version==3.2.0rc20250209 | |
instruct weight: /shark-dev/8b/fp8/native_fp8_e4m3fnuz_llama3_8b.irpa | |
instruct tokenizer-config-json: /shark-dev/8b/instruct/tokenizer.json | |
# Generated the input.bin | |
# Although the run fails, it does generate the input.bin for prefill. (TBD) | |
# For bs=4, the name of prefill_seq_lens_1xi64.bin should be changed to prefill_seq_lens_4xi64.bin (TBD) | |
# https://gist.github.com/AmosLewis/d2a325a815c106fcf6e964dd249940ba | |
python -m sharktank.examples.paged_llm_v1 --irpa-file=/sharedfile/llama3_8b_fp8.irpa \ | |
--tokenizer-config-json=/home/chi/src/test/llama/dan/tokenizer.json --dump-bins "t" | |
# decode_cache_state_261x2097152xf8E4M3FNUZ.bin decode_seq_block_ids_tensor_4x6xi64.bin decode_start_positions_4xi64.bin prefill_seq_lens_4xi64.bin | |
# decode_next_tokens_4x1xi64.bin decode_seq_block_ids_tensor_4x7xi64.bin prefill_cache_state_261x2097152xf8E4M3FNUZ.bin prefill_token_ids_4x128xi64.bin | |
# decode_seq_block_ids_tensor_4x5xi64.bin decode_seq_lens_4xi64.bin prefill_seq_block_ids_4x4xi64.bin | |
# Generate mlir file | |
python3 -m sharktank.examples.export_paged_llm_v1 --irpa-file=/sharedfile/llama3_8b_fp8.irpa \ | |
--output-mlir=/sharedfile/128/fp8_128.mlir \ | |
--output-config=/sharedfile/128/config_128.json \ | |
--bs=4 --attention-kernel torch \ | |
# --attention-dtype=float8_e4m3fnuz --activation-dtype=bfloat16 these 2 flags might not be the ones we should use for fp8_atten16.mlir | |
# /home/chi/src/shark-ai/.venv/lib/python3.11/site-packages/iree/turbine/aot/params.py:163: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at ../torch/csrc/utils/tensor_numpy.cpp:206.) | |
# return torch.from_numpy(wrapper) | |
# Exporting prefill_bs4 | |
# /home/chi/src/shark-ai/.venv/lib/python3.11/site-packages/torch/_export/non_strict_utils.py:520: UserWarning: Tensor.T is deprecated on 0-D tensors. This function is the identity in these cases. (Triggered internally at ../aten/src/ATen/native/TensorShape.cpp:3691.) | |
# return func(*args, **kwargs) | |
# Exporting decode_bs4 | |
# GENERATED! | |
# Exporting | |
# Saving to '/sharedfile/2048/fp8_128.mlir' | |
# f8_ perplexity_iree | |
python -m sharktank.evaluate.perplexity_iree \ | |
--irpa-file=/sharedfile/llama3_8b_fp8.irpa \ | |
--tokenizer-config-json=/home/chi/src/test/llama/dan/tokenizer.json \ | |
--iree-device='hip://4' \ | |
--iree-hal-target-device=hip \ | |
--iree-hip-target=gfx942 \ | |
--attention-kernel decomposed \ | |
--num-prompts=1 | |
# iree compile and run without tracy bin | |
iree-compile /sharedfile/128/fp8_128.mlir \ | |
--iree-hip-target=gfx942 \ | |
-o=/sharedfile/128/fp8_128.vmfb \ | |
--iree-hal-target-device=hip \ | |
--iree-dispatch-creation-enable-aggressive-fusion=true \ | |
--iree-global-opt-propagate-transposes=true \ | |
--iree-opt-aggressively-propagate-transposes=true \ | |
--iree-opt-data-tiling=false \ | |
--iree-preprocessing-pass-pipeline='builtin.module(util.func(iree-preprocessing-generalize-linalg-matmul-experimental))' \ | |
--iree-hal-indirect-command-buffers=true \ | |
--iree-stream-resource-memory-model=discrete \ | |
--iree-hal-memoization=true \ | |
--iree-opt-strip-assertions | |
iree-run-module \ | |
--hip_use_streams=true \ | |
--module=/sharedfile/128/fp8_128.vmfb \ | |
--parameters=model=/sharedfile/llama3_8b_fp8.irpa \ | |
--device=hip://4 \ | |
--function=prefill_bs4 \ | |
--input=4x128xi64=@/sharedfile/128/prefill/prefill_token_ids_4x128xi64.bin \ | |
--input=4xi64=@/sharedfile/128/prefill/prefill_seq_lens_4xi64.bin \ | |
--input=4x64xi64=@/sharedfile/128/prefill/prefill_seq_block_ids_4x64xi64.bin \ | |
--input=261x2097152xf8E4M3FNUZ=@/sharedfile/128/prefill/prefill_cache_state_261x2097152xf8E4M3FNUZ.bin \ | |
# EXEC @prefill_bs4 | |
# result[0]: hal.buffer_view | |
# 4x2048x128256xbf16=[[-3.90625 -3.67188 ... | |
# benchmark run | |
# 128 prefill | |
ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ | |
iree-benchmark-module \ | |
--hip_use_streams=true \ | |
--module=/sharedfile/128/fp8_128.vmfb \ | |
--parameters=model=/sharedfile/llama3_8b_fp8.irpa \ | |
--device=hip://4 \ | |
--function=prefill_bs4 \ | |
--input=4x128xi64=@/sharedfile/128/prefill/prefill_token_ids_4x128xi64.bin \ | |
--input=4xi64=@/sharedfile/128/prefill/prefill_seq_lens_4xi64.bin \ | |
--input=4x64xi64=@/sharedfile/128/prefill/prefill_seq_block_ids_4x64xi64.bin \ | |
--input=261x2097152xf8E4M3FNUZ=@/sharedfile/128/prefill/prefill_cache_state_261x2097152xf8E4M3FNUZ.bin \ | |
--benchmark_repetitions=3 | |
# 2025-02-11T10:26:13-08:00 | |
# Running /home/chi/src/shark-ai/.venv/lib/python3.11/site-packages/iree/_runtime_libs/iree-benchmark-module | |
# Run on (96 X 3810.79 MHz CPU s) | |
# CPU Caches: | |
# L1 Data 32 KiB (x96) | |
# L1 Instruction 32 KiB (x96) | |
# L2 Unified 1024 KiB (x96) | |
# L3 Unified 32768 KiB (x16) | |
# Load Average: 3.68, 2.49, 1.34 | |
# ***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead. | |
# ------------------------------------------------------------------------------------------------------- | |
# Benchmark Time CPU Iterations UserCounters... | |
# ------------------------------------------------------------------------------------------------------- | |
# BM_prefill_bs4/process_time/real_time 713 ms 713 ms 1 items_per_second=1.40236/s | |
# BM_prefill_bs4/process_time/real_time 713 ms 714 ms 1 items_per_second=1.4018/s | |
# BM_prefill_bs4/process_time/real_time 714 ms 714 ms 1 items_per_second=1.39994/s | |
# BM_prefill_bs4/process_time/real_time_mean 714 ms 714 ms 3 items_per_second=1.40136/s | |
# BM_prefill_bs4/process_time/real_time_median 713 ms 714 ms 3 items_per_second=1.4018/s | |
# BM_prefill_bs4/process_time/real_time_stddev 0.646 ms 0.592 ms 3 items_per_second=1.26808m/s | |
# BM_prefill_bs4/process_time/real_time_cv 0.09 % 0.08 % 3 items_per_second=0.09% | |
# 128 decode | |
ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ | |
iree-benchmark-module \ | |
--hip_use_streams=true \ | |
--module=/sharedfile/128/fp8_128.vmfb \ | |
--parameters=model=/sharedfile/llama3_8b_fp8.irpa \ | |
--device=hip://4 \ | |
--function=decode_bs4 \ | |
--input=4x1xi64=@/sharedfile/128/decode/decode_next_tokens_4x1xi64.bin \ | |
--input=4xi64=@/sharedfile/128/decode/decode_seq_lens_4xi64.bin \ | |
--input=4xi64=@/sharedfile/128/decode/decode_start_positions_4xi64.bin \ | |
--input=4x5xi64=@/sharedfile/128/decode/decode_seq_block_ids_tensor_4x5xi64.bin \ | |
--input=261x2097152xf8E4M3FNUZ=@/sharedfile/128/decode/decode_cache_state_261x2097152xf8E4M3FNUZ.bin \ | |
--benchmark_repetitions=3 | |
# 2025-02-11T16:30:42-08:00 | |
# Running /home/chi/src/shark-ai/.venv/lib/python3.11/site-packages/iree/_runtime_libs/iree-benchmark-module | |
# Run on (96 X 3810.79 MHz CPU s) | |
# CPU Caches: | |
# L1 Data 32 KiB (x96) | |
# L1 Instruction 32 KiB (x96) | |
# L2 Unified 1024 KiB (x96) | |
# L3 Unified 32768 KiB (x16) | |
# Load Average: 4.84, 11.44, 24.97 | |
# ***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead. | |
# ------------------------------------------------------------------------------------------------------ | |
# Benchmark Time CPU Iterations UserCounters... | |
# ------------------------------------------------------------------------------------------------------ | |
# BM_decode_bs4/process_time/real_time 23.8 ms 24.6 ms 30 items_per_second=42.0138/s | |
# BM_decode_bs4/process_time/real_time 23.9 ms 24.8 ms 30 items_per_second=41.8749/s | |
# BM_decode_bs4/process_time/real_time 24.0 ms 24.6 ms 30 items_per_second=41.7428/s | |
# BM_decode_bs4/process_time/real_time_mean 23.9 ms 24.7 ms 3 items_per_second=41.8772/s | |
# BM_decode_bs4/process_time/real_time_median 23.9 ms 24.6 ms 3 items_per_second=41.8749/s | |
# BM_decode_bs4/process_time/real_time_stddev 0.077 ms 0.148 ms 3 items_per_second=0.135524/s | |
# BM_decode_bs4/process_time/real_time_cv 0.32 % 0.60 % 3 items_per_second=0.32% | |
# 2048 prefill | |
ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ | |
iree-benchmark-module \ | |
--hip_use_streams=true \ | |
--module=/sharedfile/2048/fp8_2048.vmfb \ | |
--parameters=model=/sharedfile/llama3_8b_fp8.irpa \ | |
--device=hip://4 \ | |
--function=prefill_bs4 \ | |
--input=4x2048xi64=@/sharedfile/2048/prefill/prefill_token_ids_4x2048xi64.bin \ | |
--input=4xi64=@/sharedfile/2048/prefill/prefill_seq_lens_4xi64.bin \ | |
--input=4x64xi64=@/sharedfile/2048/prefill/prefill_seq_block_ids_4x64xi64.bin \ | |
--input=261x2097152xf8E4M3FNUZ=@/sharedfile/2048/prefill/prefill_cache_state_261x2097152xf8E4M3FNUZ.bin \ | |
--benchmark_repetitions=3 | |
# 2025-02-10T18:56:57-08:00 | |
# Running /home/chi/src/shark-ai/.venv/lib/python3.11/site-packages/iree/_runtime_libs/iree-benchmark-module | |
# Run on (96 X 3810.79 MHz CPU s) | |
# CPU Caches: | |
# L1 Data 32 KiB (x96) | |
# L1 Instruction 32 KiB (x96) | |
# L2 Unified 1024 KiB (x96) | |
# L3 Unified 32768 KiB (x16) | |
# Load Average: 1.13, 1.23, 3.41 | |
# ***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead. | |
# ------------------------------------------------------------------------------------------------------- | |
# Benchmark Time CPU Iterations UserCounters... | |
# ------------------------------------------------------------------------------------------------------- | |
# BM_prefill_bs4/process_time/real_time 725 ms 725 ms 1 items_per_second=1.37975/s | |
# BM_prefill_bs4/process_time/real_time 727 ms 728 ms 1 items_per_second=1.3762/s | |
# BM_prefill_bs4/process_time/real_time 727 ms 728 ms 1 items_per_second=1.37512/s | |
# BM_prefill_bs4/process_time/real_time_mean 726 ms 727 ms 3 items_per_second=1.37703/s | |
# BM_prefill_bs4/process_time/real_time_median 727 ms 728 ms 3 items_per_second=1.3762/s | |
# BM_prefill_bs4/process_time/real_time_stddev 1.28 ms 1.41 ms 3 items_per_second=2.42255m/s | |
# BM_prefill_bs4/process_time/real_time_cv 0.18 % 0.19 % 3 items_per_second=0.18% | |
# 2048 decode | |
ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ | |
iree-benchmark-module \ | |
--hip_use_streams=true \ | |
--module=/sharedfile/2048/fp8_2048.vmfb \ | |
--parameters=model=/sharedfile/llama3_8b_fp8.irpa \ | |
--device=hip://4 \ | |
--function=decode_bs4 \ | |
--input=4x1xi64=@/sharedfile/2048/decode/decode_next_tokens_4x1xi64.bin \ | |
--input=4xi64=@/sharedfile/2048/decode/decode_seq_lens_4xi64.bin \ | |
--input=4xi64=@/sharedfile/2048/decode/decode_start_positions_4xi64.bin \ | |
--input=4x65xi64=@/sharedfile/2048/decode/decode_seq_block_ids_tensor_4x65xi64.bin \ | |
--input=261x2097152xf8E4M3FNUZ=@/sharedfile/2048/decode/decode_cache_state_261x2097152xf8E4M3FNUZ.bin \ | |
--benchmark_repetitions=3 | |
# 2025-02-11T16:27:33-08:00 | |
# Running /home/chi/src/shark-ai/.venv/lib/python3.11/site-packages/iree/_runtime_libs/iree-benchmark-module | |
# Run on (96 X 3810.79 MHz CPU s) | |
# CPU Caches: | |
# L1 Data 32 KiB (x96) | |
# L1 Instruction 32 KiB (x96) | |
# L2 Unified 1024 KiB (x96) | |
# L3 Unified 32768 KiB (x16) | |
# Load Average: 13.80, 17.30, 29.52 | |
# ***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead. | |
# ------------------------------------------------------------------------------------------------------ | |
# Benchmark Time CPU Iterations UserCounters... | |
# ------------------------------------------------------------------------------------------------------ | |
# BM_decode_bs4/process_time/real_time 206 ms 206 ms 3 items_per_second=4.86574/s | |
# BM_decode_bs4/process_time/real_time 206 ms 207 ms 3 items_per_second=4.84683/s | |
# BM_decode_bs4/process_time/real_time 207 ms 207 ms 3 items_per_second=4.84188/s | |
# BM_decode_bs4/process_time/real_time_mean 206 ms 207 ms 3 items_per_second=4.85148/s | |
# BM_decode_bs4/process_time/real_time_median 206 ms 207 ms 3 items_per_second=4.84683/s | |
# BM_decode_bs4/process_time/real_time_stddev 0.534 ms 0.506 ms 3 items_per_second=0.0125877/s | |
# BM_decode_bs4/process_time/real_time_cv 0.26 % 0.24 % 3 items_per_second=0.26% |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment