zhum · October 21, 2023 00:33
diff --git a/nvidia-smi-q.txt b/nvidia-smi-q.txt
 nvidia-smi --query-gpu=OPTION,... [--format=csv[,noheader]]

 timestamp                   - YYYY/MM/DD HH:MM:SS.msec
 driver_version
 count
 name or gpu_name
 serial or gpu_serial
 uuid or gpu_uuid
 pci.bus_id or gpu_bus_id    - domain:bus:device.function, in hex
 pci.domain
 pci.bus
 pci.device
 pci.device_id
 pci.sub_device_id
 pcie.link.gen.gpucurrent
 pcie.link.gen.max
 pcie.link.gen.gpumax
 pcie.link.gen.hostmax
 pcie.link.width.current
 pcie.link.width.max
 index                                            - Zero based index of the GPU. Can change at each boot.
 display_mode                                     - "Enabled" - display connected
 display_active                                   - "Enabled" - display is active
 driver_model.current                             - always N/A on linux
 driver_model.pending                             - always N/A on linux
 vbios_version
 inforom.img / inforom.image                      - version
 inforom.oem                                      - version
 inforom.ecc                                      - version
 inforom.pwr / inforom.power                      - version
 temperature.gpu


 persistence_mode                                 - Enabled/Disabled
 addressing_mode                                  - HMM/ATS/None
 accounting.mode                                  - See --help-query-accounted-apps
 accounting.buffer_size
 reset_status.reset_required
 reset_status.drain_and_reset_recommended
 fan.speed                                 - fan speed value is the percent of the product's maximum noise tolerance fan speed that the device's fan is currently intended to run at
 pstate


 vgpu_driver_capability.heterogenous_multivGPU    - 1/0
 vgpu_device_capability.fractional_multiVgpu      - Fractional vGPU profiles on this GPU can be used in multi-vGPU configurations
 vgpu_device_capability.heterogeneous_timeSlice_profile - Supports concurrent execution of timesliced vGPU profiles of differing types
 vgpu_device_capability.heterogeneous_timeSlice_sizes   - Supports concurrent execution of timesliced vGPU profiles of differing framebuffer sizes


 gom.current / gpu_operation_mode.current  - actual GOM
 gom.pending / gpu_operation_mode.pending  - GOM that will be used on the next reboot.


 clocks_event_reasons.supported / clocks_throttle_reasons.supported  - Bitmask of supported clock event reasons. See nvml.h for more details.
 clocks_event_reasons.active / clocks_throttle_reasons.active  - Bitmask of active clock event reasons
 clocks_event_reasons.gpu_idle / clocks_throttle_reasons.gpu_idle
 clocks_event_reasons.applications_clocks_setting / clocks_throttle_reasons.applications_clocks_setting - GPU clocks are limited by applications clocks setting. E.g. can be changed by nvidia-smi --applications-clocks=
 clocks_event_reasons.sw_power_cap / clocks_throttle_reasons.sw_power_cap - SW Power Scaling algorithm is reducing the clocks below requested clocks because the GPU is consuming too much power. E.g. SW power cap limit can be changed with nvidia-smi --power-limit=
 clocks_event_reasons.hw_slowdown / clocks_throttle_reasons.hw_slowdown - HW Slowdown (reducing the core clocks by a factor of 2 or more) is engaged. This is an indicator of:
   * HW Thermal Slowdown: temperature being too high
   * HW Power Brake Slowdown: External Power Brake Assertion is triggered (e.g. by the system power supply)
   * Power draw is too high and Fast Trigger protection is reducing the clocks
   * May also be reported during PState or clock change
   * This behavior may be removed in a later release
 clocks_event_reasons.hw_thermal_slowdown / clocks_throttle_reasons.hw_thermal_slowdown - HW Thermal Slowdown (reducing the core clocks by a factor of 2 or more) is engaged. This is an indicator of temperature being too high
 clocks_event_reasons.hw_power_brake_slowdown / clocks_throttle_reasons.hw_power_brake_slowdown - HW Power Brake Slowdown (reducing the core clocks by a factor of 2 or more) is engaged. This is an indicator of External Power Brake Assertion being triggered (e.g. by the system power supply)
 clocks_event_reasons.sw_thermal_slowdown / clocks_throttle_reasons.sw_thermal_slowdown - SW Thermal capping algorithm is reducing clocks below requested clocks because GPU temperature is higher than Max Operating Temp.
 clocks_event_reasons.sync_boost / clocks_throttle_reasons.sync_boost - Sync Boost. This GPU has been added to a Sync boost group with nvidia-smi or DCGM in
   order to maximize performance per watt. All GPUs in the sync boost group
   will boost to the minimum possible clocks across the entire group. Look at
   the event reasons for other GPUs in the system to see why those GPUs are
   holding this one at lower clocks.


 memory.total
 memory.reserved
 memory.used
 memory.free
 compute_mode - The compute mode flag indicates whether individual or multiple compute applications may run on the GPU.
    "0: Default" means multiple contexts are allowed per device.
    "1: Exclusive_Thread", deprecated, use Exclusive_Process instead
    "2: Prohibited" means no contexts are allowed per device (no compute apps).
    "3: Exclusive_Process" means only one context is allowed per device, usable from multiple threads at a time.
 compute_cap  - The CUDA Compute Capability, represented as Major DOT Minor.
 utilization.gpu     - %
 utilization.memory  - %
 utilization.encoder - %
 utilization.decoder - %
 utilization.jpeg    - %
 utilization.ofa     - %
 encoder.stats.sessionCount - Number of encoder sessions running on the GPU.
 encoder.stats.averageFps   - Average FPS of all sessions running on the GPU.
 encoder.stats.averageLatency



 ecc.mode.current
 ecc.mode.pending
 ecc.errors.corrected.volatile.device_memory  - Errors detected in global device memory.
 ecc.errors.corrected.volatile.dram           - Errors detected in global device memory.
 ecc.errors.corrected.volatile.register_file  - Errors detected in register file memory.
 ecc.errors.corrected.volatile.l1_cache       - Errors detected in the L1 cache.
 ecc.errors.corrected.volatile.l2_cache       - Errors detected in the L2 cache.
 ecc.errors.corrected.volatile.texture_memory - Parity errors detected in texture memory.
 ecc.errors.corrected.volatile.cbu            - Parity errors detected in CBU.
 ecc.errors.corrected.volatile.sram           - Errors detected in global SRAMs.
 ecc.errors.corrected.volatile.total          - Total errors detected across entire chip.
 ecc.errors.corrected.aggregate.device_memory - Errors detected in global device memory.
 ecc.errors.corrected.aggregate.dram          - Errors detected in global device memory.
 ecc.errors.corrected.aggregate.register_file - Errors detected in register file memory.
 ecc.errors.corrected.aggregate.l1_cache      - Errors detected in the L1 cache.
 ecc.errors.corrected.aggregate.l2_cache      - Errors detected in the L2 cache.
 ecc.errors.corrected.aggregate.texture_memory - Parity errors detected in texture memory.
 ecc.errors.corrected.aggregate.cbu           - Parity errors detected in CBU.
 ecc.errors.corrected.aggregate.sram          - Errors detected in global SRAMs.
 ecc.errors.corrected.aggregate.total         - Total errors detected across entire chip.
 ecc.errors.uncorrected.volatile.device_memory - Errors detected in global device memory.
 ecc.errors.uncorrected.volatile.dram         - Errors detected in global device memory.
 ecc.errors.uncorrected.volatile.register_file - Errors detected in register file memory.
 ecc.errors.uncorrected.volatile.l1_cache     - Errors detected in the L1 cache.
 ecc.errors.uncorrected.volatile.l2_cache     - Errors detected in the L2 cache.
 ecc.errors.uncorrected.volatile.texture_memory - Parity errors detected in texture memory.
 ecc.errors.uncorrected.volatile.cbu          - Parity errors detected in CBU.
 ecc.errors.uncorrected.volatile.sram         - Errors detected in global SRAMs.
 ecc.errors.uncorrected.volatile.total        - Total errors detected across entire chip.
 ecc.errors.uncorrected.aggregate.device_memory - Errors detected in global device memory.
 ecc.errors.uncorrected.aggregate.dram        - Errors detected in global device memory.
 ecc.errors.uncorrected.aggregate.register_file - Errors detected in register file memory.
 ecc.errors.uncorrected.aggregate.l1_cache    - Errors detected in the L1 cache.
 ecc.errors.uncorrected.aggregate.l2_cache    - Errors detected in the L2 cache.
 ecc.errors.uncorrected.aggregate.texture_memory - Parity errors detected in texture memory.
 ecc.errors.uncorrected.aggregate.cbu         - Parity errors detected in CBU.
 ecc.errors.uncorrected.aggregate.sram        - Errors detected in global SRAMs.
 ecc.errors.uncorrected.aggregate.total       - Total errors detected across entire chip.
 retired_pages.single_bit_ecc.count / retired_pages.sbe - The number of GPU device memory pages that have been retired due to multiple single bit ECC errors.
 retired_pages.double_bit.count / retired_pages.dbe - The number of GPU device memory pages that have been retired due to a double bit ECC error.
 retired_pages.pending                        - Checks if any GPU device memory pages are pending retirement on the next reboot. Pages that are pending retirement can still be allocated, and may cause further reliability issues.

 temperature.gpu.tlimit
 temperature.memory
 power.management   - Supported / [Not Supported]
 power.draw         - The last measured power draw for the entire board, in watts
 power.draw.average - The last measured average power draw for the entire board, in watts
 power.draw.instant - The last measured instant power draw for the entire board, in watts
 power.limit        - The software power limit in watts
 enforced.power.limit - The power management algorithm's power ceiling, in watts
 power.default_limit  - The default power management algorithm's power ceiling, in watts
 power.min_limit    - The minimum value in watts that power limit can be set to.
 power.max_limit    - The maximum value in watts that power limit can be set to.
 clocks.current.graphics / clocks.gr - Current frequency of graphics (shader) clock.
 clocks.current.sm / clocks.sm       - Current frequency of SM (Streaming Multiprocessor) clock.
 clocks.current.memory / clocks.mem  - Current frequency of memory clock.
 clocks.current.video / clocks.video - Current frequency of video encoder/decoder clock.
 clocks.applications.graphics / clocks.applications.gr - User specified frequency of graphics (shader) clock.
 clocks.applications.memory / clocks.applications.mem - User specified frequency of memory clock.
 clocks.default_applications.graphics / clocks.default_applications.gr
 clocks.default_applications.memory / clocks.default_applications.mem
 clocks.max.graphics / clocks.max.gr - Maximum frequency of graphics (shader) clock.
 clocks.max.sm - Maximum frequency of SM (Streaming Multiprocessor) clock.
 clocks.max.memory / clocks.max.mem - Maximum frequency of memory clock.


 mig.mode.current
 mig.mode.pending
 gsp.mode.current
 gsp.mode.default


 protected_memory.total    - Total installed GPU conf compute protected memory.
 protected_memory.used     - Total conf compute protected memory allocated by active contexts.
 protected_memory.free     - Total free conf compute protected memory.
 fabric.state              - Current state of GPU fabric registration process.
 fabric.status             - Error status, valid only if gpu fabric registration state is "completed"
	nvidia-smi --query-gpu=OPTION,... [--format=csv[,noheader]]

	timestamp - YYYY/MM/DD HH:MM:SS.msec
	driver_version
	count
	name or gpu_name
	serial or gpu_serial
	uuid or gpu_uuid
	pci.bus_id or gpu_bus_id - domain:bus:device.function, in hex
	pci.domain
	pci.bus
	pci.device
	pci.device_id
	pci.sub_device_id
	pcie.link.gen.gpucurrent
	pcie.link.gen.max
	pcie.link.gen.gpumax
	pcie.link.gen.hostmax
	pcie.link.width.current
	pcie.link.width.max
	index - Zero based index of the GPU. Can change at each boot.
	display_mode - "Enabled" - display connected
	display_active - "Enabled" - display is active
	driver_model.current - always N/A on linux
	driver_model.pending - always N/A on linux
	vbios_version
	inforom.img / inforom.image - version
	inforom.oem - version
	inforom.ecc - version
	inforom.pwr / inforom.power - version
	temperature.gpu


	persistence_mode - Enabled/Disabled
	addressing_mode - HMM/ATS/None
	accounting.mode - See --help-query-accounted-apps
	accounting.buffer_size
	reset_status.reset_required
	reset_status.drain_and_reset_recommended
	fan.speed - fan speed value is the percent of the product's maximum noise tolerance fan speed that the device's fan is currently intended to run at
	pstate


	vgpu_driver_capability.heterogenous_multivGPU - 1/0
	vgpu_device_capability.fractional_multiVgpu - Fractional vGPU profiles on this GPU can be used in multi-vGPU configurations
	vgpu_device_capability.heterogeneous_timeSlice_profile - Supports concurrent execution of timesliced vGPU profiles of differing types
	vgpu_device_capability.heterogeneous_timeSlice_sizes - Supports concurrent execution of timesliced vGPU profiles of differing framebuffer sizes


	gom.current / gpu_operation_mode.current - actual GOM
	gom.pending / gpu_operation_mode.pending - GOM that will be used on the next reboot.


	clocks_event_reasons.supported / clocks_throttle_reasons.supported - Bitmask of supported clock event reasons. See nvml.h for more details.
	clocks_event_reasons.active / clocks_throttle_reasons.active - Bitmask of active clock event reasons
	clocks_event_reasons.gpu_idle / clocks_throttle_reasons.gpu_idle
	clocks_event_reasons.applications_clocks_setting / clocks_throttle_reasons.applications_clocks_setting - GPU clocks are limited by applications clocks setting. E.g. can be changed by nvidia-smi --applications-clocks=
	clocks_event_reasons.sw_power_cap / clocks_throttle_reasons.sw_power_cap - SW Power Scaling algorithm is reducing the clocks below requested clocks because the GPU is consuming too much power. E.g. SW power cap limit can be changed with nvidia-smi --power-limit=
	clocks_event_reasons.hw_slowdown / clocks_throttle_reasons.hw_slowdown - HW Slowdown (reducing the core clocks by a factor of 2 or more) is engaged. This is an indicator of:
	  * HW Thermal Slowdown: temperature being too high
	  * HW Power Brake Slowdown: External Power Brake Assertion is triggered (e.g. by the system power supply)
	  * Power draw is too high and Fast Trigger protection is reducing the clocks
	  * May also be reported during PState or clock change
	  * This behavior may be removed in a later release
	clocks_event_reasons.hw_thermal_slowdown / clocks_throttle_reasons.hw_thermal_slowdown - HW Thermal Slowdown (reducing the core clocks by a factor of 2 or more) is engaged. This is an indicator of temperature being too high
	clocks_event_reasons.hw_power_brake_slowdown / clocks_throttle_reasons.hw_power_brake_slowdown - HW Power Brake Slowdown (reducing the core clocks by a factor of 2 or more) is engaged. This is an indicator of External Power Brake Assertion being triggered (e.g. by the system power supply)
	clocks_event_reasons.sw_thermal_slowdown / clocks_throttle_reasons.sw_thermal_slowdown - SW Thermal capping algorithm is reducing clocks below requested clocks because GPU temperature is higher than Max Operating Temp.
	clocks_event_reasons.sync_boost / clocks_throttle_reasons.sync_boost - Sync Boost. This GPU has been added to a Sync boost group with nvidia-smi or DCGM in
	  order to maximize performance per watt. All GPUs in the sync boost group
	  will boost to the minimum possible clocks across the entire group. Look at
	  the event reasons for other GPUs in the system to see why those GPUs are
	  holding this one at lower clocks.


	memory.total
	memory.reserved
	memory.used
	memory.free
	compute_mode - The compute mode flag indicates whether individual or multiple compute applications may run on the GPU.
	   "0: Default" means multiple contexts are allowed per device.
	   "1: Exclusive_Thread", deprecated, use Exclusive_Process instead
	   "2: Prohibited" means no contexts are allowed per device (no compute apps).
	   "3: Exclusive_Process" means only one context is allowed per device, usable from multiple threads at a time.
	compute_cap - The CUDA Compute Capability, represented as Major DOT Minor.
	utilization.gpu - %
	utilization.memory - %
	utilization.encoder - %
	utilization.decoder - %
	utilization.jpeg - %
	utilization.ofa - %
	encoder.stats.sessionCount - Number of encoder sessions running on the GPU.
	encoder.stats.averageFps - Average FPS of all sessions running on the GPU.
	encoder.stats.averageLatency



	ecc.mode.current
	ecc.mode.pending
	ecc.errors.corrected.volatile.device_memory - Errors detected in global device memory.
	ecc.errors.corrected.volatile.dram - Errors detected in global device memory.
	ecc.errors.corrected.volatile.register_file - Errors detected in register file memory.
	ecc.errors.corrected.volatile.l1_cache - Errors detected in the L1 cache.
	ecc.errors.corrected.volatile.l2_cache - Errors detected in the L2 cache.
	ecc.errors.corrected.volatile.texture_memory - Parity errors detected in texture memory.
	ecc.errors.corrected.volatile.cbu - Parity errors detected in CBU.
	ecc.errors.corrected.volatile.sram - Errors detected in global SRAMs.
	ecc.errors.corrected.volatile.total - Total errors detected across entire chip.
	ecc.errors.corrected.aggregate.device_memory - Errors detected in global device memory.
	ecc.errors.corrected.aggregate.dram - Errors detected in global device memory.
	ecc.errors.corrected.aggregate.register_file - Errors detected in register file memory.
	ecc.errors.corrected.aggregate.l1_cache - Errors detected in the L1 cache.
	ecc.errors.corrected.aggregate.l2_cache - Errors detected in the L2 cache.
	ecc.errors.corrected.aggregate.texture_memory - Parity errors detected in texture memory.
	ecc.errors.corrected.aggregate.cbu - Parity errors detected in CBU.
	ecc.errors.corrected.aggregate.sram - Errors detected in global SRAMs.
	ecc.errors.corrected.aggregate.total - Total errors detected across entire chip.
	ecc.errors.uncorrected.volatile.device_memory - Errors detected in global device memory.
	ecc.errors.uncorrected.volatile.dram - Errors detected in global device memory.
	ecc.errors.uncorrected.volatile.register_file - Errors detected in register file memory.
	ecc.errors.uncorrected.volatile.l1_cache - Errors detected in the L1 cache.
	ecc.errors.uncorrected.volatile.l2_cache - Errors detected in the L2 cache.
	ecc.errors.uncorrected.volatile.texture_memory - Parity errors detected in texture memory.
	ecc.errors.uncorrected.volatile.cbu - Parity errors detected in CBU.
	ecc.errors.uncorrected.volatile.sram - Errors detected in global SRAMs.
	ecc.errors.uncorrected.volatile.total - Total errors detected across entire chip.
	ecc.errors.uncorrected.aggregate.device_memory - Errors detected in global device memory.
	ecc.errors.uncorrected.aggregate.dram - Errors detected in global device memory.
	ecc.errors.uncorrected.aggregate.register_file - Errors detected in register file memory.
	ecc.errors.uncorrected.aggregate.l1_cache - Errors detected in the L1 cache.
	ecc.errors.uncorrected.aggregate.l2_cache - Errors detected in the L2 cache.
	ecc.errors.uncorrected.aggregate.texture_memory - Parity errors detected in texture memory.
	ecc.errors.uncorrected.aggregate.cbu - Parity errors detected in CBU.
	ecc.errors.uncorrected.aggregate.sram - Errors detected in global SRAMs.
	ecc.errors.uncorrected.aggregate.total - Total errors detected across entire chip.
	retired_pages.single_bit_ecc.count / retired_pages.sbe - The number of GPU device memory pages that have been retired due to multiple single bit ECC errors.
	retired_pages.double_bit.count / retired_pages.dbe - The number of GPU device memory pages that have been retired due to a double bit ECC error.
	retired_pages.pending - Checks if any GPU device memory pages are pending retirement on the next reboot. Pages that are pending retirement can still be allocated, and may cause further reliability issues.

	temperature.gpu.tlimit
	temperature.memory
	power.management - Supported / [Not Supported]
	power.draw - The last measured power draw for the entire board, in watts
	power.draw.average - The last measured average power draw for the entire board, in watts
	power.draw.instant - The last measured instant power draw for the entire board, in watts
	power.limit - The software power limit in watts
	enforced.power.limit - The power management algorithm's power ceiling, in watts
	power.default_limit - The default power management algorithm's power ceiling, in watts
	power.min_limit - The minimum value in watts that power limit can be set to.
	power.max_limit - The maximum value in watts that power limit can be set to.
	clocks.current.graphics / clocks.gr - Current frequency of graphics (shader) clock.
	clocks.current.sm / clocks.sm - Current frequency of SM (Streaming Multiprocessor) clock.
	clocks.current.memory / clocks.mem - Current frequency of memory clock.
	clocks.current.video / clocks.video - Current frequency of video encoder/decoder clock.
	clocks.applications.graphics / clocks.applications.gr - User specified frequency of graphics (shader) clock.
	clocks.applications.memory / clocks.applications.mem - User specified frequency of memory clock.
	clocks.default_applications.graphics / clocks.default_applications.gr
	clocks.default_applications.memory / clocks.default_applications.mem
	clocks.max.graphics / clocks.max.gr - Maximum frequency of graphics (shader) clock.
	clocks.max.sm - Maximum frequency of SM (Streaming Multiprocessor) clock.
	clocks.max.memory / clocks.max.mem - Maximum frequency of memory clock.


	mig.mode.current
	mig.mode.pending
	gsp.mode.current
	gsp.mode.default


	protected_memory.total - Total installed GPU conf compute protected memory.
	protected_memory.used - Total conf compute protected memory allocated by active contexts.
	protected_memory.free - Total free conf compute protected memory.
	fabric.state - Current state of GPU fabric registration process.
	fabric.status - Error status, valid only if gpu fabric registration state is "completed"