Skip to content

Instantly share code, notes, and snippets.

(.venv) ➜ 128 python3 -m sharktank.examples.export_paged_llm_v1 --irpa-file=/sharedfile/attn/fp8_attn.irpa \
--output-mlir=/sharedfile/attn/128/fp8_attn.mlir \
--output-config=/sharedfile/attn/128/config_attn.json \
--bs=4 --attention-kernel sharktank \
--attention-dtype=float8_e4m3fnuz --activation-dtype=bfloat16 --use-attention-mask --use-hf
/home/chi/src/shark-ai/.venv/lib/python3.11/site-packages/iree/turbine/aot/params.py:163: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at ../torch/csrc/utils/tensor_numpy.cpp:206.)
return torch.from_numpy(wrapper)
Exporting prefill_bs4
attention dtype
torch.float8_e4m3fnuz
/home/chi/src/iree-build/tools/iree-compile f8_attn_chi_castf32_roctorch.mlir \
--iree-hip-target=gfx942 \
-o=f8_attn_chi_castf32_roctorch_0213.vmfb \
--iree-hal-target-device=hip \
--iree-dispatch-creation-enable-aggressive-fusion=true \
--iree-global-opt-propagate-transposes=true \
--iree-opt-aggressively-propagate-transposes=true \
--iree-opt-data-tiling=false \
--iree-preprocessing-pass-pipeline='builtin.module(util.func(iree-preprocessing-generalize-linalg-matmul-experimental))' \
--iree-hal-indirect-command-buffers=true \
Patch used: https://github.com/nod-ai/shark-ai/pull/896
IREE version==3.2.0rc20250209
instruct weight: /shark-dev/8b/fp8/native_fp8_e4m3fnuz_llama3_8b.irpa
instruct tokenizer-config-json: /shark-dev/8b/instruct/tokenizer.json
# Generated the input.bin
# Although the run fails, it did generate the input.bin for prefill. (TBD)
# For bs=4, the name of prefill_seq_lens_1xi64.bin should be changed to prefill_seq_lens_4xi64.bin (TBD)
# https://gist.github.com/AmosLewis/d2a325a815c106fcf6e964dd249940ba
python -m sharktank.examples.paged_llm_v1 --irpa-file=/sharedfile/llama3_8b_fp8.irpa \
# Copyright 2024 Advanced Micro Devices, Inc.
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
"""Inference support for the PagedLLMV1 protocol of models."""
import math
ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
iree-benchmark-module \
--hip_use_streams=true \
--module=/sharedfile/2048/fp8_2048.vmfb \
--parameters=model=/sharedfile/llama3_8b_fp8.irpa \
--device=hip://4 \
--function=prefill_bs4 \
--input=4x2048xi64=@/sharedfile/2048/prefill/prefill_token_ids_4x2048xi64.bin \
--input=4xi64=@/sharedfile/2048/prefill/prefill_seq_lens_4xi64.bin \
(.venv) ➜ shark-ai git:(users/dan-garvey/enable_custom_fp8_matmul) ✗ python -m sharktank.examples.paged_llm_v1 --irpa-file=/home/chi/src/test/llama/dan/fp8.irpa --tokenizer-config-json=/home/chi/src/test/llama/dan/tokenizer.json --dump-bins "t"
/home/chi/src/shark-ai/.venv/lib/python3.11/site-packages/iree/turbine/aot/params.py:163: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at ../torch/csrc/utils/tensor_numpy.cpp:206.)
return torch.from_numpy(wrapper)
:: Prompting:
b't'
:: Prompt tokens: tensor([[83, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
:: Invoke prefill:
failed to translate executable
diff --git a/sharktank/sharktank/examples/paged_llm_v1.py b/sharktank/sharktank/examples/paged_llm_v1.py
index 5d338bd..ab806b7 100644
--- a/sharktank/sharktank/examples/paged_llm_v1.py
+++ b/sharktank/sharktank/examples/paged_llm_v1.py
@@ -32,7 +32,7 @@ class TorchGenerator:
self,
model: PagedLlamaModelV1,
tokenizer: InferenceTokenizer,
- page_cache_size: int = 128,
+ page_cache_size: int = 261,
# iree-base-compiler 3.2.0rc20250206
# iree-base-runtime 3.2.0rc20250206
# iree-turbine 3.2.0rc20250205
cd /home/chi/src/shark-ai
git checkout upstream/users/dan-garvey/enable_custom_fp8_matmul
wget https://gist.githubusercontent.com/AmosLewis/0775e6286be89476e9f2a4946c634370/raw/bbc3c9bceca30f888d7dd42c37372686cad3efe5/2048.diff
git apply 2048.diff
python -m sharktank.examples.paged_llm_v1 --irpa-file=/home/chi/src/test/llama/dan/fp8.irpa --tokenizer-config-json=/home/chi/src/test/llama/dan/tokenizer.json --dump-bins "t"
(.venv) ➜ dan gdb --args /home/chi/src/iree-build/tools/iree-compile f8_attn_chi_castf32_roctorch.mlir \
--iree-hip-target=gfx942 \
-o=f8_attn_chi_castf32_roctorch.vmfb \
--iree-hal-target-device=hip \
--iree-dispatch-creation-enable-aggressive-fusion=true \
--iree-global-opt-propagate-transposes=true \
--iree-opt-aggressively-propagate-transposes=true \
--iree-opt-data-tiling=false \
--iree-preprocessing-pass-pipeline='builtin.module(util.func(iree-preprocessing-generalize-linalg-matmul-experimental))' \
--iree-hal-indirect-command-buffers=true \
(.venv) ➜ dan $(git_prompt_info)source /home/dan/SHARK-Platform/.env/bin/activate
(.env) (.venv) ➜ dan $(git_prompt_info)pip list
Package Version Editable project location
------------------------- ------------------------- ----------------------------------
aiohappyeyeballs 2.4.4
aiohttp 3.11.11
aiosignal 1.3.2
annotated-types 0.7.0
anyio 4.8.0
argon2-cffi 23.1.0