prefill_bs1

ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
/home/chi/src/iree-build/tools/iree-benchmark-module \
--hip_use_streams=true \
--module=f8_.vmfb \
--parameters=model=fp8.irpa \
--device=hip://4 \
--function=prefill_bs1 \
--input=1x32xi64=@/sharedfile/prefill/prefill_token_ids_1_32.bin

/home/chi/src/iree-build/tools/iree-run-module --help
# ============================================================================
# 👻 IREE: iree-run-module
# ============================================================================
Runs a function within a compiled IREE module and handles I/O parsing
and optional expected value verification/output processing. Modules
can be provided by file path (`--module=file.vmfb`) or read from stdin
(`--module=-`) and the function to execute matches the original name
provided to the compiler (`--function=foo` for `func.func @foo`).
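For reference, a minimal invocation has the shape below (a sketch: file.vmfb, foo, and tokens.bin are placeholder names; the --module/--function/--input forms come straight from the help text above):

# Run the function `foo` from a compiled module by name.
/home/chi/src/iree-build/tools/iree-run-module \
  --module=file.vmfb \
  --function=foo \
  --input=1x32xi64=@tokens.bin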
python3 -m sharktank.examples.export_paged_llm_v1 --irpa-file=/home/chi/src/test/llama/dan/fp8_attn.irpa \
--output-mlir=/home/chi/src/test/llama/dan/f8_attn_chi.mlir \
--output-config=/home/chi/src/test/llama/dan/config_attn_chi.json \
--bs=1 --attention-kernel sharktank \
--attention-dtype=float8_e4m3fnuz --activation-dtype=bfloat16 --use-hf
/home/chi/src/shark-ai/.venv/lib/python3.11/site-packages/iree/turbine/aot/params.py:163: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at /pytorch/torch/csrc/utils/tensor_numpy.cpp:203.)
  return torch.from_numpy(wrapper)
Exporting prefill_bs1
Traceback (most recent call last):
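Once an export does succeed, the resulting MLIR gets compiled into the .vmfb that iree-benchmark-module consumes at the top of this log. A sketch of that step, assuming iree-compile sits next to the other tools in the build tree and reusing the hip/gfx942 target flags that appear later in this log:

# Sketch: compile the exported MLIR for the gfx942 HIP target.
/home/chi/src/iree-build/tools/iree-compile /home/chi/src/test/llama/dan/f8_attn_chi.mlir \
  --iree-hal-target-device=hip \
  --iree-hip-target=gfx942 \
  -o f8_.vmfb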
(.venv) ➜ dan python -m sharktank.evaluate.perplexity_iree \
--irpa-file=/home/chi/src/test/llama/dan/fp8.irpa \
--tokenizer-config-json=/home/chi/src/test/llama/dan/tokenizer_config.json \
--iree-device='hip://4' \
--iree-hal-target-device=hip \
--iree-hip-target=gfx942 \
--attention-kernel decomposed \
--num-prompts=1
ModuleNotFoundError: No module named 'tiktoken'
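The missing module is a plain dependency and installs into the active virtualenv:

pip install tiktoken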

With nod-ai/shark-ai#896 applied, the export succeeds:

(.venv) ➜  shark-ai git:(users/dan-garvey/enable_custom_fp8_matmul) python3 -m sharktank.examples.export_paged_llm_v1 --irpa-file=/home/chi/src/test/llama/dan/fp8.irpa \
--output-mlir=/home/chi/src/test/llama/dan/fp8_dan.mlir \
--output-config=/home/chi/src/test/llama/dan/config.json \
--bs=1 --attention-kernel torch \
--attention-dtype=float8_e4m3fnuz --activation-dtype=bfloat16
...
GENERATED!
Generated MLIR (truncated):
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
module @module {
  util.global private @__auto.token_embd.weight = #stream.parameter.named<"model"::"token_embd.weight"> : tensor<128256x4096xbf16>
  util.global private @__auto.blk.0.attn_norm.weight = #stream.parameter.named<"model"::"blk.0.attn_norm.weight"> : tensor<4096xbf16>
  util.global private @"__auto.blk.0.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.0.attn_q.q_input:rscale"> : tensor<f32>
  util.global private @"__auto.blk.0.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.0.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
  util.global private @"__auto.blk.0.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.0.attn_k.q_input:rscale"> : tensor<f32>
  util.global private @"__auto.blk.0.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.0.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
  util.global private @"__auto.blk.0.attn_v.q_input:rscale"
  ...

// Standalone repro: torch.aten.index_put writing f8E4M3FNUZ values into an f16 cache tensor.
func.func @function(%arg2: !torch.vtensor<[1,?],si64>, %669: !torch.vtensor<[1,?],si64>, %667: !torch.vtensor<[?,32,8,128],f16>, %674: !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>) -> !torch.vtensor<[?,32,8,128],f16> {
  %false = torch.constant.bool false
  %int1 = torch.constant.int 1
  // Dynamic sequence length: size of dim 1 of the token ids.
  %670 = torch.aten.size.int %arg2, %int1 : !torch.vtensor<[1,?],si64>, !torch.int -> !torch.int
  // Flatten the page ids into a rank-1 index tensor.
  %675 = torch.prim.ListConstruct %670 : (!torch.int) -> !torch.list<int>
  %676 = torch.aten.view %669, %675 : !torch.vtensor<[1,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64>
  %677 = torch.prim.ListConstruct %676 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
  // index_put stores the f8E4M3FNUZ values into the f16 cache: the element types do not match.
  %678 = torch.aten.index_put %667, %677, %674, %false : !torch.vtensor<[?,32,8,128],f16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool -> !torch.vtensor<[?,32,8,128],f16>
  return %678 : !torch.vtensor<[?,32,8,128],f16>
}
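A standalone torch-dialect repro like the one above can be compiled in isolation to bisect the failure. A sketch, assuming the snippet is saved as repro.mlir and that iree-compile lives in the same build tree:

# Sketch: compile just the repro function for the same HIP target.
/home/chi/src/iree-build/tools/iree-compile repro.mlir \
  --iree-hal-target-device=hip \
  --iree-hip-target=gfx942 \
  -o repro.vmfb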
module {
  // Minimal repro stub: the dtype constants are left in (26 and 15 are likely
  // torch's ScalarType codes for float8_e4m3fnuz and bfloat16), but the body
  // just returns the activation operand.
  func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[32,4096],bf16> {
    %int26 = torch.constant.int 26
    %int15 = torch.constant.int 15
    return %arg0 : !torch.vtensor<[32,4096],bf16>
  }
}
module {
  // Same stub, but returning the weight operand instead.
  func.func @faulty(%arg0: !torch.vtensor<[32,4096],bf16>, %arg1: !torch.vtensor<[4096,4096],bf16>) -> !torch.vtensor<[4096,4096],bf16> {
    %int26 = torch.constant.int 26
    %int15 = torch.constant.int 15
    return %arg1 : !torch.vtensor<[4096,4096],bf16>
  }
}
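Either faulty variant can then be exercised with splat inputs instead of .bin files (a sketch: faulty.vmfb is a placeholder name, and iree-run-module's shapextype=value splat form fills each tensor with a constant):

/home/chi/src/iree-build/tools/iree-run-module \
  --module=faulty.vmfb \
  --function=faulty \
  --input=32x4096xbf16=0 \
  --input=4096x4096xbf16=0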
/home/chi/src/iree-build/tools/iree-run-module \
--parameter_mode="mmap" \
--hip_use_streams=true \
--module=fp8__initializer_0_dispatch_0.vmfb \
--parameters=model=fp8.irpa \
--device=hip://4 \
--function=prefill_bs1 \
--input=1x32xi64=@/sharedfile/prefill/prefill_token_ids_1_32.bin \
--input=1xi64=@/sharedfile/prefill/prefill_seq_lens_1.bin \
--input=1x1xi64=@/sharedfile/prefill/prefill_seq_block_ids_1_1.bin \