@AmosLewis
Last active February 24, 2025 22:40
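This gist compiles the same Llama 3 8B FP8 model IR (/sharedfile/128/fp8_128.mlir) with three IREE nightly release candidates: the 20250215 and 20250214 builds fail verification with a 'flow.tensor.bitcast' error, while the 20250224 build compiles and is then benchmarked with iree-benchmark-module. A minimal sketch of installing one set of the pinned nightlies, assuming IREE's usual release-candidate package index (the index URL and package availability there are assumptions, not part of the original log):

# Hypothetical install of the pinned nightly packages (adjust the rc date per section below)
python -m pip install --find-links https://iree.dev/pip-release-links.html \
    iree-base-compiler==3.3.0rc20250215 \
    iree-base-runtime==3.3.0rc20250215 \
    iree-turbine==3.3.0rc20250215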
iree-base-compiler 3.3.0rc20250215
iree-base-runtime 3.3.0rc20250215
iree-turbine 3.3.0rc20250215
iree-compile /sharedfile/128/fp8_128.mlir \
--iree-hip-target=gfx942 \
-o=/sharedfile/128/fp8_128_0224_ir0215.vmfb \
--iree-hal-target-device=hip \
--iree-dispatch-creation-enable-aggressive-fusion=true \
--iree-global-opt-propagate-transposes=true \
--iree-opt-aggressively-propagate-transposes=true \
--iree-opt-data-tiling=false \
--iree-preprocessing-pass-pipeline='builtin.module(util.func(iree-preprocessing-generalize-linalg-matmul-experimental))' \
--iree-hal-indirect-command-buffers=true \
--iree-stream-resource-memory-model=discrete \
--iree-hal-memoization=true \
--iree-opt-strip-assertions
# /sharedfile/128/fp8_128.mlir:2888:12: error: 'flow.tensor.bitcast' op value set has 1 dynamic dimensions but only 0 dimension values are attached
# %990 = torch.aten.view.dtype %979, %int1_229 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],si8>
# ^
# /sharedfile/128/fp8_128.mlir:2888:12: note: see current operation: %1819 = "flow.tensor.bitcast"(%1818) <{operandSegmentSizes = array<i32: 1, 0, 0>}> : (tensor<?x32x8x128xf8E4M3FNUZ>) -> tensor<?x32x8x128xi8>
# /sharedfile/128/fp8_128.mlir:33785:13: error: 'flow.tensor.bitcast' op value set has 1 dynamic dimensions but only 0 dimension values are attached
# %1022 = torch.aten.view.dtype %1004, %int1_182 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,2,32,8,128],si8>
# ^
# /sharedfile/128/fp8_128.mlir:33785:13: note: see current operation: %1803 = "flow.tensor.bitcast"(%1802) <{operandSegmentSizes = array<i32: 1, 0, 0>}> : (tensor<?x32x2x32x8x128xf8E4M3FNUZ>) -> tensor<?x32x2x32x8x128xi8>
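The diagnostics indicate that this nightly lowers torch.aten.view.dtype (an f8E4M3FNUZ -> si8 reinterpret cast) to a flow.tensor.bitcast whose operand and result carry one dynamic (?) dimension but have no SSA dimension values attached, so the op fails verification. To inspect the offending source around the reported location (path and line number taken from the diagnostic above):

# Print the flagged line 2888 of the input MLIR with a little context
sed -n '2880,2895p' /sharedfile/128/fp8_128.mlir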
iree-base-compiler 3.3.0rc20250214
iree-base-runtime 3.3.0rc20250214
iree-turbine 3.3.0rc20250214
iree-compile /sharedfile/128/fp8_128.mlir \
--iree-hip-target=gfx942 \
-o=/sharedfile/128/fp8_128_0224_ir0214.vmfb \
--iree-hal-target-device=hip \
--iree-dispatch-creation-enable-aggressive-fusion=true \
--iree-global-opt-propagate-transposes=true \
--iree-opt-aggressively-propagate-transposes=true \
--iree-opt-data-tiling=false \
--iree-preprocessing-pass-pipeline='builtin.module(util.func(iree-preprocessing-generalize-linalg-matmul-experimental))' \
--iree-hal-indirect-command-buffers=true \
--iree-stream-resource-memory-model=discrete \
--iree-hal-memoization=true \
--iree-opt-strip-assertions
# /sharedfile/128/fp8_128.mlir:2888:12: error: 'flow.tensor.bitcast' op value set has 1 dynamic dimensions but only 0 dimension values are attached
# %990 = torch.aten.view.dtype %979, %int1_229 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],si8>
# ^
# /sharedfile/128/fp8_128.mlir:2888:12: note: see current operation: %1819 = "flow.tensor.bitcast"(%1818) <{operandSegmentSizes = array<i32: 1, 0, 0>}> : (tensor<?x32x8x128xf8E4M3FNUZ>) -> tensor<?x32x8x128xi8>
# /sharedfile/128/fp8_128.mlir:33785:13: error: 'flow.tensor.bitcast' op value set has 1 dynamic dimensions but only 0 dimension values are attached
# %1022 = torch.aten.view.dtype %1004, %int1_182 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,2,32,8,128],si8>
# ^
# /sharedfile/128/fp8_128.mlir:33785:13: note: see current operation: %1803 = "flow.tensor.bitcast"(%1802) <{operandSegmentSizes = array<i32: 1, 0, 0>}> : (tensor<?x32x2x32x8x128xf8E4M3FNUZ>) -> tensor<?x32x2x32x8x128xi8>
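The 20250214 release candidate fails at the same two source locations with the identical verifier error, so both the 0214 and 0215 nightlies are missing the dynamic-dimension handling for this bitcast.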
iree-base-compiler 3.3.0rc20250224
iree-base-runtime 3.3.0rc20250224
iree-turbine 3.3.0rc20250224
iree-compile /sharedfile/128/fp8_128.mlir \
--iree-hip-target=gfx942 \
-o=/sharedfile/128/fp8_128_0224_ir0224.vmfb \
--iree-hal-target-device=hip \
--iree-dispatch-creation-enable-aggressive-fusion=true \
--iree-global-opt-propagate-transposes=true \
--iree-opt-aggressively-propagate-transposes=true \
--iree-opt-data-tiling=false \
--iree-preprocessing-pass-pipeline='builtin.module(util.func(iree-preprocessing-generalize-linalg-matmul-experimental))' \
--iree-hal-indirect-command-buffers=true \
--iree-stream-resource-memory-model=discrete \
--iree-hal-memoization=true \
--iree-opt-strip-assertions
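The 20250224 release candidate compiles the same input with the same flags without the bitcast error. A quick sanity check that the artifact was written before benchmarking (path taken from the -o flag above):

ls -lh /sharedfile/128/fp8_128_0224_ir0224.vmfb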
ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
iree-benchmark-module \
--hip_use_streams=true \
--module=/sharedfile/128/fp8_128_0224_ir0224.vmfb \
--parameters=model=/sharedfile/llama3_8b_fp8.irpa \
--device=hip://4 \
--function=prefill_bs4 \
--input=4x128xi64=@/sharedfile/128/prefill/prefill_token_ids_4x128xi64.bin \
--input=4xi64=@/sharedfile/128/prefill/prefill_seq_lens_4xi64.bin \
--input=4x4xi64=@/sharedfile/128/prefill/prefill_seq_block_ids_4x4xi64.bin \
--input=261x2097152xf8E4M3FNUZ=@/sharedfile/128/prefill/prefill_cache_state_261x2097152xf8E4M3FNUZ.bin \
--benchmark_repetitions=3
# 2025-02-24T14:42:08-08:00
# Running /home/chi/src/shark-ai/.venv/lib/python3.11/site-packages/iree/_runtime_libs/iree-benchmark-module
# Run on (96 X 3810.79 MHz CPU s)
# CPU Caches:
# L1 Data 32 KiB (x96)
# L1 Instruction 32 KiB (x96)
# L2 Unified 1024 KiB (x96)
# L3 Unified 32768 KiB (x16)
# Load Average: 9.07, 9.11, 19.93
# ***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
# -------------------------------------------------------------------------------------------------------
# Benchmark                                            Time             CPU   Iterations UserCounters...
# -------------------------------------------------------------------------------------------------------
# BM_prefill_bs4/process_time/real_time              123 ms          124 ms            6 items_per_second=8.12122/s
# BM_prefill_bs4/process_time/real_time              123 ms          124 ms            6 items_per_second=8.11833/s
# BM_prefill_bs4/process_time/real_time              123 ms          124 ms            6 items_per_second=8.11331/s
# BM_prefill_bs4/process_time/real_time_mean         123 ms          124 ms            3 items_per_second=8.11762/s
# BM_prefill_bs4/process_time/real_time_median       123 ms          124 ms            3 items_per_second=8.11833/s
# BM_prefill_bs4/process_time/real_time_stddev     0.061 ms        0.086 ms            3 items_per_second=4.00454m/s
# BM_prefill_bs4/process_time/real_time_cv           0.05 %          0.07 %            3 items_per_second=0.05%
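Back-of-envelope throughput from the reported mean, assuming each prefill_bs4 call processes 4 sequences of 128 tokens (shapes taken from the --input flags above): 8.11762 calls/s x 4 x 128 ≈ 4156 prefill tokens/s.

# Rough prefill token throughput implied by the mean above (assumes 4 x 128 tokens per call)
echo '8.11762 * 4 * 128' | bc -l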