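With nod-ai/shark-ai#896, the export below succeeds, but the subsequent `iree-compile` step fails to legalize `torch.aten.outer`: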
(.venv) ➜ shark-ai git:(users/dan-garvey/enable_custom_fp8_matmul) python3 -m sharktank.examples.export_paged_llm_v1 --irpa-file=/home/chi/src/test/llama/dan/fp8.irpa \
--output-mlir=/home/chi/src/test/llama/dan/fp8_dan.mlir \
--output-config=/home/chi/src/test/llama/dan/config.json \
--bs=1 --attention-kernel torch \
--attention-dtype=float8_e4m3fnuz --activation-dtype=bfloat16
...
GENERATED!
Exporting
Saving to '/home/chi/src/test/llama/dan/fp8_dan.mlir'
(.venv) ➜ shark-ai git:(users/dan-garvey/enable_custom_fp8_matmul) cd ../test/llama/dan
(.venv) ➜ dan ls
config.json f8_.mlir f8_.vmfb fp8_dan.mlir fp8_dan_old.json fp8_dan_old.mlir fp8.irpa indexput.linalg.mlir indexput.torch.mlir
(.venv) ➜ dan /home/chi/src/iree-build/tools/iree-compile fp8_dan.mlir \
--iree-hip-target=gfx942 \
-o=fp8_dan.vmfb \
--iree-hal-target-device=hip \
--iree-dispatch-creation-enable-aggressive-fusion=true \
--iree-global-opt-propagate-transposes=true \
--iree-opt-aggressively-propagate-transposes=true \
--iree-opt-data-tiling=false \
--iree-preprocessing-pass-pipeline='builtin.module(util.func(iree-preprocessing-generalize-linalg-matmul-experimental))' \
--iree-hal-indirect-command-buffers=true \
--iree-stream-resource-memory-model=discrete \
--iree-hal-memoization=true \
--iree-opt-strip-assertions
fp8_dan.mlir:1813:12: error: failed to legalize operation 'torch.aten.outer'
%614 = torch.aten.outer %605, %613 : !torch.vtensor<[131072],si64>, !torch.vtensor<[128],f32> -> !torch.vtensor<[131072,128],f32>
^
fp8_dan.mlir:1813:12: note: see current operation: %1809 = "torch.aten.outer"(%1798, %1808) : (!torch.vtensor<[131072],si64>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
fp8_dan.mlir:22791:12: error: failed to legalize operation 'torch.aten.outer'
%637 = torch.aten.outer %628, %636 : !torch.vtensor<[131072],si64>, !torch.vtensor<[128],f32> -> !torch.vtensor<[131072,128],f32>
^
fp8_dan.mlir:22791:12: note: see current operation: %1786 = "torch.aten.outer"(%1775, %1785) : (!torch.vtensor<[131072],si64>, !torch.vtensor<[128],f32>) -> !torch.vtensor<[131072,128],f32>
(.venv) ➜ dan pip list
Package Version
------------------------ ---------------
accelerate 1.3.0
aiohappyeyeballs 2.4.4
aiohttp 3.11.11
aiosignal 1.3.2
annotated-types 0.7.0
anyio 4.8.0
attrs 25.1.0
certifi 2025.1.31
cfgv 3.4.0
charset-normalizer 3.4.1
click 8.1.8
dataclasses-json 0.6.7
datasets 3.0.0
diffusers 0.32.2
dill 0.3.8
distlib 0.3.9
einops 0.8.0
execnet 2.1.1
fastapi 0.115.8
filelock 3.17.0
frozenlist 1.5.0
fsspec 2024.6.1
gguf 0.14.0
h11 0.14.0
huggingface-hub 0.28.1
identify 2.6.6
idna 3.10
importlib_metadata 8.6.1
iniconfig 2.0.0
inquirerpy 0.3.4
iree-base-compiler 3.2.0rc20250120
iree-base-runtime 3.2.0rc20250120
iree-turbine 3.2.0rc20250121
Jinja2 3.1.5
MarkupSafe 3.0.2
marshmallow 3.26.0
ml_dtypes 0.5.1
mpmath 1.3.0
multidict 6.1.0
multiprocess 0.70.16
mypy 1.8.0
mypy-extensions 1.0.0
networkx 3.4.2
nodeenv 1.9.1
numpy 2.2.2
nvidia-cublas-cu12 12.4.5.8
nvidia-cuda-cupti-cu12 12.4.127
nvidia-cuda-nvrtc-cu12 12.4.127
nvidia-cuda-runtime-cu12 12.4.127
nvidia-cudnn-cu12 9.1.0.70
nvidia-cufft-cu12 11.2.1.3
nvidia-curand-cu12 10.3.5.147
nvidia-cusolver-cu12 11.6.1.9
nvidia-cusparse-cu12 12.3.1.170
nvidia-cusparselt-cu12 0.6.2
nvidia-nccl-cu12 2.21.5
nvidia-nvjitlink-cu12 12.4.127
nvidia-nvtx-cu12 12.4.127
onnx 1.17.0
packaging 24.2
pandas 2.2.3
parameterized 0.9.0
pfzy 0.3.4
pillow 11.1.0
pip 22.3.1
platformdirs 4.3.6
pluggy 1.5.0
pre_commit 4.1.0
prompt_toolkit 3.0.50
propcache 0.2.1
protobuf 5.29.3
psutil 6.1.1
pyarrow 19.0.0
pydantic 2.10.6
pydantic_core 2.27.2
pytest 8.0.0
pytest-html 4.1.1
pytest-metadata 3.1.1
pytest-timeout 2.3.1
pytest-xdist 3.5.0
python-dateutil 2.9.0.post0
pytz 2025.1
PyYAML 6.0.2
regex 2024.11.6
requests 2.32.3
safetensors 0.5.2
sentencepiece 0.2.0
setuptools 65.5.0
six 1.17.0
sniffio 1.3.1
starlette 0.45.3
sympy 1.13.1
tokenizers 0.21.0
torch 2.6.0
tqdm 4.67.1
transformers 4.48.0
triton 3.2.0
types-requests 2.31.0.20240125
typing_extensions 4.12.2
typing-inspect 0.9.0
tzdata 2025.1
urllib3 2.3.0
uvicorn 0.34.0
virtualenv 20.29.1
wcwidth 0.2.13
wheel 0.45.1
xxhash 3.5.0
yarl 1.18.3
zipp 3.21.0
git log