Created
February 5, 2025 18:05
-
-
Save AmosLewis/5108d77c0720f442334317dc0199518c to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/home/chi/src/iree-build/tools/iree-run-module --help | |
# ============================================================================ | |
# 👻 IREE: iree-run-module | |
# ============================================================================ | |
Runs a function within a compiled IREE module and handles I/O parsing | |
and optional expected value verification/output processing. Modules | |
can be provided by file path (`--module=file.vmfb`) or read from stdin | |
(`--module=-`) and the function to execute matches the original name | |
provided to the compiler (`--function=foo` for `func.func @foo`). | |
# ===----------------------------------------------------------------------=== | |
# Flags in iree/runtime/src/iree/base/internal/flags.c | |
# ===----------------------------------------------------------------------=== | |
# Displays command line usage information. | |
# --help | |
# Parses a newline-separated list of flags from a file. | |
# Flags are parsed at the point where the flagfile is specified | |
# and following flags may override the parsed values. | |
# NOTE: this --help output is a flagfile! Pipe this to a file, tweak the | |
# options from their defaults, and pass it back in using --flagfile=. | |
# --flagfile=[path] | |
# ===----------------------------------------------------------------------=== | |
# Flags in iree/runtime/src/iree/hal/drivers/hip/registration/driver_module.c | |
# ===----------------------------------------------------------------------=== | |
# Path to search for an appropriate libamdhip64.so / amdhip64.dll. If any | |
# paths are provided, then only the given paths are searched. Otherwise, | |
# system heuristics are used to find the dylib. By default, each path is | |
# treated as a directory name, but a distinct file can be given which | |
# must match exactly by prefixing with 'file:'. | |
# --hip_dylib_path=... | |
# Use HIP streams (instead of graphs) for executing command buffers. | |
--hip_use_streams=true | |
# Allow command buffers to execute inline against HIP streams when | |
# possible. | |
--hip_allow_inline_execution=false | |
# Enables HIP asynchronous stream-ordered allocations when supported. | |
--hip_async_allocations=true | |
# Controls the verbosity of tracing when Tracy instrumentation is enabled. | |
# The impact to benchmark timing becomes more severe as the verbosity | |
# increases, and thus should be only enabled when needed. | |
# Permissible values are: | |
# 0 : stream tracing disabled. | |
# 1 : coarse command buffer level tracing enabled. | |
# 2 : fine-grained kernel level tracing enabled. | |
--hip_tracing=2 | |
# Specifies the index of the default HIP device to use | |
--hip_default_index=0 | |
# ===----------------------------------------------------------------------=== | |
# Flags in iree/runtime/src/iree/hal/drivers/local_task/registration/driver_module.c | |
# ===----------------------------------------------------------------------=== | |
# Aborts the program on the first failure within a task system queue. | |
--task_abort_on_failure=false | |
# ===----------------------------------------------------------------------=== | |
# Flags in iree/runtime/src/iree/hal/drivers/vulkan/registration/driver_module.cc | |
# ===----------------------------------------------------------------------=== | |
# Enables standard Vulkan validation layers. | |
--vulkan_validation_layers=true | |
# Enables VK_EXT_debug_utils, records markers, and logs errors. | |
--vulkan_debug_utils=true | |
# Cutoff for debug output; 0=none, 1=errors, 2=warnings, 3=info, 4=debug. | |
--vulkan_debug_verbosity=2 | |
# Enables Vulkan tracing (if IREE tracing is enabled). | |
--vulkan_tracing=true | |
# Enables the Vulkan 'robustBufferAccess' feature. | |
--vulkan_robust_buffer_access=false | |
# Enables the Vulkan 'sparseBinding' feature (and others) when available. | |
--vulkan_sparse_binding=true | |
# Enables the Vulkan 'sparseResidencyBuffer' feature (and others) when available. | |
--vulkan_sparse_residency=true | |
# Enables the Vulkan 'bufferDeviceAddress' feature and support for SPIR-V executables compiled to use it. | |
--vulkan_buffer_device_addresses=true | |
# Use a dedicated queue with VK_QUEUE_COMPUTE_BIT for dispatch workloads. | |
--vulkan_dedicated_compute_queue=false | |
# ===----------------------------------------------------------------------=== | |
# Flags in iree/runtime/src/iree/hal/local/plugins/registration/init.c | |
# ===----------------------------------------------------------------------=== | |
# Load a local HAL executable plugin to resolve imports. | |
# See iree/hal/local/executable_plugin.h for the plugin API. | |
# By default plugins load using the system library loader and accept | |
# native system formats (.dll, .so, .dylib, etc). | |
# For plugins compiled to standalone portable ELF files the embedded ELF | |
# loader can be used even if OS support for dynamic linking is missing or | |
# slow. Prefix the paths with `embedded:` or use the `.sos` extension. | |
# If multiple plugins are specified they will be scanned for imports in | |
# reverse registration order (last plugin checked first). | |
# Examples: | |
# --executable_plugin=some/system.dll | |
# --executable_plugin=some/standalone.sos | |
# --executable_plugin=embedded:some/standalone.so | |
# --executable_plugin=... | |
# ===----------------------------------------------------------------------=== | |
# Flags in iree/runtime/src/iree/task/api.c | |
# ===----------------------------------------------------------------------=== | |
# Maximum duration in microseconds each worker should spin waiting for | |
# additional work. In almost all cases this should be 0 as spinning is | |
# often extremely harmful to system health. Only set to non-zero values | |
# when latency is the #1 priority (vs. thermals, system-wide scheduling, | |
# etc). | |
--task_worker_spin_us=0 | |
# Minimum size in bytes of each worker thread stack. | |
# The underlying platform may allocate more stack space but _should_ | |
# guarantee that the available stack space is near this amount. Note that | |
# the task system will take some stack space and not all bytes should be | |
# assumed usable. Note that as much as possible users should not rely on | |
# the stack for storage over ~16-32KB and instead use local workgroup | |
# memory. | |
--task_worker_stack_size=131072 | |
# Overrides the bytes of per-worker local memory allocated for use by | |
# dispatched tiles. Tiles may use less than this but will fail to dispatch | |
# if they require more. Conceptually it is like a stack reservation and | |
# should be treated the same way: the source programs must be built to | |
# only use a specific maximum amount of local memory and the runtime must | |
# be configured to make at least that amount of local memory available. | |
# By default the CPU L2 cache size is used if such queries are supported. | |
--task_worker_local_memory=0 | |
# Available modes: | |
# --task_topology_group_count=non-zero: | |
# Uses whatever the specified group count is and ignores the set mode. | |
# All threads will be unpinned and run on system-determined processors. | |
# --task_topology_cpu_ids=0,1,2 [+ --task_topology_cpu_ids=3,4,5]: | |
# Creates one executor per set of logical CPU IDs. | |
# 'physical_cores': | |
# Creates one executor per NUMA node in --task_topology_nodes= and one | |
# group per physical core in each NUMA node up to the value specified | |
# by --task_topology_max_group_count=. | |
--task_topology_mode="physical_cores" | |
# Defines the total number of task system workers that will be created. | |
# Workers will be distributed across cores. Specifying 0 will use a | |
# heuristic defined by --task_topology_mode= to automatically select the | |
# worker count and distribution. | |
# WARNING: setting this flag directly is not recommended; use | |
# --task_topology_max_group_count= instead. | |
--task_topology_group_count=0 | |
# A list of absolute logical CPU IDs to use for a single topology. One | |
# topology will be created for each repetition of the flag. CPU IDs match | |
# the Linux logical CPU ID scheme (as used by lscpu/lstopo) or a flattened | |
# [0, total_processor_count) range on Windows. | |
# --task_topology_cpu_ids=... | |
# Comma-separated list of NUMA nodes that topologies will be defined for. | |
# Each node specified will be configured based on the other topology | |
# flags. 'all' can be used to indicate all available NUMA nodes and | |
# 'current' will inherit the node of the calling thread. | |
--task_topology_nodes="current" | |
# Sets a maximum value on the worker count that can be automatically | |
# detected and used when --task_topology_group_count=0 and is ignored | |
# otherwise. | |
--task_topology_max_group_count=64 | |
# Selects only cores that match the specified performance level from | |
# [`any`, `low` (or `efficiency`), `high` (or `performance`)]. | |
--task_topology_performance_level="any" | |
# Dumps the flag-specified topology used for creating task executors. | |
# --dump_task_topologies | |
# ===----------------------------------------------------------------------=== | |
# Flags in iree/runtime/src/iree/tooling/comparison.cc | |
# ===----------------------------------------------------------------------=== | |
# Threshold under which two f16 values are considered equal. | |
--expected_f16_threshold=0.001 | |
# Threshold under which two f32 values are considered equal. | |
--expected_f32_threshold=0.0001 | |
# Threshold under which two f64 values are considered equal. | |
--expected_f64_threshold=0.0001 | |
# ===----------------------------------------------------------------------=== | |
# Flags in iree/runtime/src/iree/tooling/context_util.c | |
# ===----------------------------------------------------------------------=== | |
# A VM module to load; either a vmfb containing a compiled bytecode module | |
# or a native system library containing a dynamic native module. Modules | |
# are registered in the order defined by the flags with all dependencies | |
# for a module needing to have been registered prior to the dependent | |
# module. HAL modules are added automatically when required. | |
# --module=... | |
# A module I/O mode of ['preload', 'mmap']. | |
# preload: read entire module into wired memory on startup. | |
# mmap: maps the module file into discardable memory - can increase | |
# warm-up time and variance as mapped pages are swapped | |
# by the OS. | |
--module_mode="preload" | |
# Traces VM execution to stderr. | |
--trace_execution=false | |
# ===----------------------------------------------------------------------=== | |
# Flags in iree/runtime/src/iree/tooling/device_util.c | |
# ===----------------------------------------------------------------------=== | |
# Lists all available HAL drivers compiled into the binary. | |
# --list_drivers | |
# Lists all available HAL devices from all drivers or a specific driver. | |
# Examples: | |
# Show all devices from all drivers: --list_devices | |
# Show all devices from a particular driver: --list_devices=vulkan | |
# --list_devices | |
# Dumps detailed information on all available HAL devices from all drivers | |
# or a specific driver. | |
# Examples: | |
# Show all devices from all drivers: --dump_devices | |
# Show all devices from a particular driver: --dump_devices=vulkan | |
# --dump_devices | |
# Specifies one or more HAL device allocator specs to augment the base | |
# device allocator. See each allocator type for supported configurations. | |
# --device_allocator=... | |
# Specifies one or more HAL devices to use for execution. | |
# Use --list_devices/--dump_devices to see available devices and their | |
# canonical URI used with this flag. | |
# --device=... | |
# HAL device profiling mode (one of ['queue', 'dispatch', 'executable']) | |
# or empty to disable profiling. HAL implementations may require | |
# additional flags in order to configure profiling support on their | |
# devices. | |
--device_profiling_mode="" | |
# Optional file path/prefix for profiling file output. Some | |
# implementations may require a file name in order to capture profiling | |
# information. | |
--device_profiling_file="" | |
# ===----------------------------------------------------------------------=== | |
# Flags in iree/runtime/src/iree/tooling/instrument_util.c | |
# ===----------------------------------------------------------------------=== | |
# File to populate with instrument data from the program. | |
--instrument_file="" | |
# ===----------------------------------------------------------------------=== | |
# Flags in iree/runtime/src/iree/tooling/parameter_util.c | |
# ===----------------------------------------------------------------------=== | |
# A parameter I/O mode of ['preload', 'mmap', 'file']. | |
# preload: read entire parameter files into wired memory on startup. | |
# mmap: maps the parameter files into discardable memory - can increase | |
# warm-up time and variance as mapped pages are swapped | |
# by the OS. | |
# file: uses platform file APIs to read/write the file as needed. | |
--parameter_mode="file" | |
# Specifies a parameter file to make available to programs with either an | |
# anonymous global scope (`some_file.gguf`) or a named scope like | |
# `my_scope=some_file.gguf`. | |
# Supported formats: | |
# - .irpa (IREE parameter archive) | |
# - .gguf (https://github.com/ggerganov/ggml/blob/master/docs/gguf.md) | |
# - .safetensors (https://github.com/huggingface/safetensors) | |
# --parameters=... | |
# ===----------------------------------------------------------------------=== | |
# Flags in iree/runtime/src/iree/tooling/run_module.c | |
# ===----------------------------------------------------------------------=== | |
# Name of a function contained in the module specified by --module= to run. | |
--function="" | |
# An input (a) value or (b) buffer of the format: | |
# (a) scalar value | |
# value | |
# e.g.: --input="3.14" | |
# (b) buffer: | |
# [shape]xtype=[value] | |
# e.g.: --input="2x2xi32=1 2 3 4" | |
# Optionally, brackets may be used to separate the element values: | |
# 2x2xi32=[[1 2][3 4]] | |
# Raw binary files can be read to provide buffer contents: | |
# 2x2xi32=@some/file.bin | |
# Numpy npy files from numpy.save can be read to provide 1+ values: | |
# @some.npy | |
# Each occurrence of the flag indicates an input in the order they were | |
# specified on the command line. | |
# --input=... | |
# Specifies how to handle an output from the invocation: | |
# `` (empty): ignore output | |
# e.g.: --output= | |
# `-`: print textual form to stdout | |
# e.g.: --output=- | |
# `@file.npy`: create/overwrite a numpy npy file and write an ndarray | |
# e.g.: --output=@file.npy | |
# `+file.npy`: create/append a numpy npy file and write an ndarray | |
# e.g.: --output=+file.npy | |
# `@file.bin`: create/overwrite a binary file and write value contents | |
# e.g.: --output=@file.bin | |
# `+file.bin`: create/append a binary file and write value contents | |
# e.g.: --output=+file.bin | |
# Numpy npy files can be read in Python using numpy.load, for example an | |
# invocation producing two outputs can be concatenated as: | |
# --output=@file.npy --output=+file.npy | |
# And then loaded in Python by reading from the same file: | |
# with open('file.npy', 'rb') as f: | |
# print(numpy.load(f)) | |
# print(numpy.load(f)) | |
# Primitive values are written as shape=() ndarrays and buffers are | |
# written as i8 arrays with the length of the buffer. | |
# Binary files contain only the contents of the values/buffers provided | |
# without metadata; users must know the shape/type of the output. | |
# Each occurrence of the flag indicates an output in the order they were | |
# specified on the command line. | |
# --output=... | |
# An expected function output following the same format as `--input=`. | |
# When present the results of the invocation will be compared against | |
# these values and the tool will return non-zero if any differ. If the | |
# value of a particular output is not of interest provide `(ignored)`. | |
# --expected_output=... | |
# Prints up to the maximum number of elements of output tensors and elides | |
# the remainder. | |
--output_max_element_count=1024 | |
# Prints runtime statistics to stderr on exit. | |
--print_statistics=false |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment