Skip to content

Instantly share code, notes, and snippets.

(.venv) ➜ 128 python3 -m sharktank.examples.export_paged_llm_v1 --irpa-file=/sharedfile/attn/fp8_attn.irpa \
--output-mlir=/sharedfile/attn/128/fp8_attn.mlir \
--output-config=/sharedfile/attn/128/config_attn.json \
--bs=4 --attention-kernel sharktank \
--attention-dtype=float8_e4m3fnuz --activation-dtype=bfloat16 --use-attention-mask --use-hf
/home/chi/src/shark-ai/.venv/lib/python3.11/site-packages/iree/turbine/aot/params.py:163: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at ../torch/csrc/utils/tensor_numpy.cpp:206.)
return torch.from_numpy(wrapper)
Exporting prefill_bs4
attention dtype
torch.float8_e4m3fnuz
/home/chi/src/iree-build/tools/iree-compile f8_attn_chi_castf32_roctorch.mlir \
--iree-hip-target=gfx942 \
-o=f8_attn_chi_castf32_roctorch_0213.vmfb \
--iree-hal-target-device=hip \
--iree-dispatch-creation-enable-aggressive-fusion=true \
--iree-global-opt-propagate-transposes=true \
--iree-opt-aggressively-propagate-transposes=true \
--iree-opt-data-tiling=false \
--iree-preprocessing-pass-pipeline='builtin.module(util.func(iree-preprocessing-generalize-linalg-matmul-experimental))' \
--iree-hal-indirect-command-buffers=true \
Patch used: https://github.com/nod-ai/shark-ai/pull/896
IREE version==3.2.0rc20250209
instruct weight: /shark-dev/8b/fp8/native_fp8_e4m3fnuz_llama3_8b.irpa
instruct tokenizer-config-json: /shark-dev/8b/instruct/tokenizer.json
# Generated the input.bin
# Although the run fails, it did generate the input.bin for prefill. (TBD)
# For bs=4, the name of prefill_seq_lens_1xi64.bin should be changed to prefill_seq_lens_4xi64.bin (TBD)
# https://gist.github.com/AmosLewis/d2a325a815c106fcf6e964dd249940ba
python -m sharktank.examples.paged_llm_v1 --irpa-file=/sharedfile/llama3_8b_fp8.irpa \
# Copyright 2024 Advanced Micro Devices, Inc.
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
"""Inference support for the PagedLLMV1 protocol of models."""
import math
ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
iree-benchmark-module \
--hip_use_streams=true \
--module=/sharedfile/2048/fp8_2048.vmfb \
--parameters=model=/sharedfile/llama3_8b_fp8.irpa \
--device=hip://4 \
--function=prefill_bs4 \
--input=4x2048xi64=@/sharedfile/2048/prefill/prefill_token_ids_4x2048xi64.bin \
--input=4xi64=@/sharedfile/2048/prefill/prefill_seq_lens_4xi64.bin \
(.venv) ➜ shark-ai git:(users/dan-garvey/enable_custom_fp8_matmul) ✗ python -m sharktank.examples.paged_llm_v1 --irpa-file=/home/chi/src/test/llama/dan/fp8.irpa --tokenizer-config-json=/home/chi/src/test/llama/dan/tokenizer.json --dump-bins "t"
/home/chi/src/shark-ai/.venv/lib/python3.11/site-packages/iree/turbine/aot/params.py:163: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at ../torch/csrc/utils/tensor_numpy.cpp:206.)
return torch.from_numpy(wrapper)
:: Prompting:
b't'
:: Prompt tokens: tensor([[83, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
:: Invoke prefill:
failed to translate executable
diff --git a/sharktank/sharktank/examples/paged_llm_v1.py b/sharktank/sharktank/examples/paged_llm_v1.py
index 5d338bd..ab806b7 100644
--- a/sharktank/sharktank/examples/paged_llm_v1.py
+++ b/sharktank/sharktank/examples/paged_llm_v1.py
@@ -32,7 +32,7 @@ class TorchGenerator:
self,
model: PagedLlamaModelV1,
tokenizer: InferenceTokenizer,
- page_cache_size: int = 128,
+ page_cache_size: int = 261,
# iree-base-compiler 3.2.0rc20250206
# iree-base-runtime 3.2.0rc20250206
# iree-turbine 3.2.0rc20250205
cd /home/chi/src/shark-ai
git checkout upstream/users/dan-garvey/enable_custom_fp8_matmul
wget https://gist.githubusercontent.com/AmosLewis/0775e6286be89476e9f2a4946c634370/raw/bbc3c9bceca30f888d7dd42c37372686cad3efe5/2048.diff
git apply 2048.diff
python -m sharktank.examples.paged_llm_v1 --irpa-file=/home/chi/src/test/llama/dan/fp8.irpa --tokenizer-config-json=/home/chi/src/test/llama/dan/tokenizer.json --dump-bins "t"
(.venv) ➜ dan gdb --args /home/chi/src/iree-build/tools/iree-compile f8_attn_chi_castf32_roctorch.mlir \
--iree-hip-target=gfx942 \
-o=f8_attn_chi_castf32_roctorch.vmfb \
--iree-hal-target-device=hip \
--iree-dispatch-creation-enable-aggressive-fusion=true \
--iree-global-opt-propagate-transposes=true \
--iree-opt-aggressively-propagate-transposes=true \
--iree-opt-data-tiling=false \
--iree-preprocessing-pass-pipeline='builtin.module(util.func(iree-preprocessing-generalize-linalg-matmul-experimental))' \
--iree-hal-indirect-command-buffers=true \
(.venv) ➜ dan $(git_prompt_info)source /home/dan/SHARK-Platform/.env/bin/activate
(.env) (.venv) ➜ dan $(git_prompt_info)pip list
Package Version Editable project location
------------------------- ------------------------- ----------------------------------
aiohappyeyeballs 2.4.4
aiohttp 3.11.11
aiosignal 1.3.2
annotated-types 0.7.0
anyio 4.8.0
argon2-cffi 23.1.0