/home/chi/src/iree-build-trace/tools/iree-compile \
/sharedfile/attn/128/fp8_attn.mlir \
--iree-hip-target=gfx942 \
-o=/sharedfile/attn/128/fp8_attn.vmfb \
--iree-hal-target-device=hip \
--iree-dispatch-creation-enable-aggressive-fusion=true \
--iree-global-opt-propagate-transposes=true \
--iree-opt-aggressively-propagate-transposes=true \
--iree-opt-data-tiling=false \
--iree-preprocessing-pass-pipeline='builtin.module(util.func(iree-preprocessing-generalize-linalg-matmul-experimental))'
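A compiled module like this can be smoke-tested with iree-run-module. The sketch below is hedged: prefill_bs4 matches the entry point exported later in these notes, the parameters come from the fp8 irpa used for that export, and every input shape and .bin name is a placeholder (bs=4, seq len 128, block stride 32, page size 2097152 assumed).
# Hedged run sketch; all input shapes and .bin names are placeholders for the
# dumps produced by paged_llm_v1 further down.
iree-run-module \
  --device=hip \
  --module=/sharedfile/attn/128/fp8_attn.vmfb \
  --parameters=model=/sharedfile/attn/fp8_attn.irpa \
  --function=prefill_bs4 \
  --input=4x128xi64=@prefill_token_ids_4x128xi64.bin \
  --input=4xi64=@prefill_seq_lens_4xi64.bin \
  --input=4x4xi64=@prefill_seq_block_ids_4x4xi64.bin \
  --input=256x2097152xf8E4M3FNUZ=@prefill_cache_state.bin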
(.venv) ➜ 32 python3 -m sharktank.examples.export_paged_llm_v1 --irpa-file=/sharedfile/llama3_8b_fp8.irpa \
--output-mlir=/sharedfile/32/fp8_attn.mlir \
--output-config=/sharedfile/32/config_attn.json \
--bs=1 --attention-kernel sharktank \
--attention-dtype=bfloat16 \
--activation-dtype=bfloat16 \
--kv-cache-dtype=float8_e4m3fnuz \
--use-hf \
--use-attention-mask
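A quick, hedged sanity check of the artifacts from this export; only the paths above are used and no specific config keys are assumed.
# List the exported MLIR and config, then dump the config JSON as-is.
ls -lh /sharedfile/32/fp8_attn.mlir /sharedfile/32/config_attn.json
python3 -c "import json; print(json.dumps(json.load(open('/sharedfile/32/config_attn.json')), indent=2))"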
iree-base-compiler 3.3.0rc20250215
iree-base-runtime 3.3.0rc20250215
iree-turbine 3.3.0rc20250215
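These are nightly release candidates; a hedged install sketch, assuming the IREE release-candidate index at https://iree.dev/pip-release-links.html is still the right --find-links page for these versions.
python3 -m pip install \
  --find-links https://iree.dev/pip-release-links.html \
  iree-base-compiler==3.3.0rc20250215 \
  iree-base-runtime==3.3.0rc20250215 \
  iree-turbine==3.3.0rc20250215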
iree-compile /sharedfile/128/fp8_128.mlir \
--iree-hip-target=gfx942 \
-o=/sharedfile/128/fp8_128_0224_ir0215.vmfb \
--iree-hal-target-device=hip \
--iree-dispatch-creation-enable-aggressive-fusion=true \
--iree-global-opt-propagate-transposes=true
iree-base-compiler 3.2.0rc20250209
iree-base-runtime 3.2.0rc20250209
iree-compile /sharedfile/128/fp8_128_0212.mlir \
--iree-hip-target=gfx942 \
-o=/sharedfile/128/fp8_128_0212_ir0209.vmfb \
--iree-hal-target-device=hip \
--iree-dispatch-creation-enable-aggressive-fusion=true \
--iree-global-opt-propagate-transposes=true \
--iree-opt-aggressively-propagate-transposes=true
iree-base-compiler 3.3.0rc20250223
iree-base-runtime 3.3.0rc20250223
iree-turbine 3.3.0rc20250223
wget https://sharkpublic.blob.core.windows.net/sharkpublic/chi/llama/fp8_32_kv16.mlir
iree-compile /sharedfile/32/fp8_32_kv16.mlir \
--iree-hip-target=gfx942 \
-o=/sharedfile/32/fp8_32_kv16.vmfb \
--iree-hal-target-device=hip
python3 -m sharktank.examples.export_paged_llm_v1 --irpa-file=/sharedfile/llama3_8b_fp8.irpa \
--output-mlir=/sharedfile/32/fp8_32.mlir \
--output-config=/sharedfile/32/config_32.json \
--bs=1 --attention-kernel torch \
--attention-dtype=float8_e4m3fnuz --activation-dtype=bfloat16 \
--use-hf \
--kv-cache-dtype=float8_e4m3fnuz
/home/chi/src/shark-ai/.venv/lib/python3.11/site-packages/iree/turbine/aot/params.py:163: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at ../torch/csrc/utils/tensor_numpy.cpp:206.)
return torch.from_numpy(wrapper)
Exporting prefill_bs1
#map = affine_map<()[s0] -> (4, s0 * 32)>
#map1 = affine_map<()[s0] -> (4, s0)>
#map2 = affine_map<()[s0] -> (s0, 2097152)>
#map3 = affine_map<()[s0] -> (s0 * 32)>
#map4 = affine_map<()[s0] -> (1, 1, s0 * 32, 131072)>
#map5 = affine_map<()[s0] -> (1, 1, s0 * 32, s0 * 32)>
#map6 = affine_map<()[s0] -> (4, 1, s0 * 32)>
#map7 = affine_map<()[s0] -> (4, 1, 1, s0 * 32)>
#map8 = affine_map<()[s0] -> (4, 1, s0 * 32, s0 * 32)>
#map9 = affine_map<()[s0] -> (4, s0 * 32, 4096)>
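The /sharedfile/32/fp8_32.mlir exported above can be compiled the same way as the other runs in these notes; a minimal sketch reusing the same flags (the output path is an assumption).
iree-compile /sharedfile/32/fp8_32.mlir \
  --iree-hip-target=gfx942 \
  --iree-hal-target-device=hip \
  -o=/sharedfile/32/fp8_32.vmfb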
(.venv) ➜ 128 python3 -m sharktank.examples.export_paged_llm_v1 --irpa-file=/sharedfile/attn/fp8_attn.irpa \
--output-mlir=/sharedfile/attn/128/fp8_attn.mlir \
--output-config=/sharedfile/attn/128/config_attn.json \
--bs=4 --attention-kernel sharktank \
--attention-dtype=float8_e4m3fnuz --activation-dtype=bfloat16 --use-attention-mask --use-hf
/home/chi/src/shark-ai/.venv/lib/python3.11/site-packages/iree/turbine/aot/params.py:163: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at ../torch/csrc/utils/tensor_numpy.cpp:206.)
return torch.from_numpy(wrapper)
Exporting prefill_bs4
attention dtype
torch.float8_e4m3fnuz
/home/chi/src/iree-build/tools/iree-compile f8_attn_chi_castf32_roctorch.mlir \
--iree-hip-target=gfx942 \
-o=f8_attn_chi_castf32_roctorch_0213.vmfb \
--iree-hal-target-device=hip \
--iree-dispatch-creation-enable-aggressive-fusion=true \
--iree-global-opt-propagate-transposes=true \
--iree-opt-aggressively-propagate-transposes=true \
--iree-opt-data-tiling=false \
--iree-preprocessing-pass-pipeline='builtin.module(util.func(iree-preprocessing-generalize-linalg-matmul-experimental))' \
--iree-hal-indirect-command-buffers=true
Patch used: https://github.com/nod-ai/shark-ai/pull/896
IREE version==3.2.0rc20250209
instruct weight: /shark-dev/8b/fp8/native_fp8_e4m3fnuz_llama3_8b.irpa
instruct tokenizer-config-json: /shark-dev/8b/instruct/tokenizer.json
# Generates the input.bin files.
# Although the run fails, it still generates the input.bin for prefill. (TBD)
# For bs=4, prefill_seq_lens_1xi64.bin should be renamed to prefill_seq_lens_4xi64.bin. (TBD)
# https://gist.github.com/AmosLewis/d2a325a815c106fcf6e964dd249940ba
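After the run below has dumped its .bin files, the rename from the note above is a one-liner (a sketch; run it in the directory where the files landed).
mv prefill_seq_lens_1xi64.bin prefill_seq_lens_4xi64.bin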
python -m sharktank.examples.paged_llm_v1 --irpa-file=/sharedfile/llama3_8b_fp8.irpa \