Skip to content

Instantly share code, notes, and snippets.

@leuc
Last active April 22, 2025 21:32
Show Gist options
  • Save leuc/e45f4dc64dc1db870e4bad1c436228bb to your computer and use it in GitHub Desktop.
Save leuc/e45f4dc64dc1db870e4bad1c436228bb to your computer and use it in GitHub Desktop.
Decode AMD GPU Metrics from SysFS
#!/usr/bin/env python3
#
# SPDX-License-Identifier: GPL-3.0-or-later
#
# amdgpu_metrics.py decode amdgpu metrics from sysfs
# Copyright (C) 2021 leuc
#
# This program is free software: you can redistribute it and/or modify it under the
# terms of the GNU Affero General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any later
# version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
# PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License along
# with this program. If not, see <https://www.gnu.org/licenses/>.
import argparse
import ctypes
from json import dumps
from enum import IntFlag
# Size in bytes of the common table header (one uint16 plus two uint8, see
# MetricsTableHeader) that is read from the file before the metrics payload.
COMMON_HEADER_SIZE = 4
class ThrottleStatus(IntFlag):
    """Named throttler bits of an amdgpu throttle-status bitmask.

    Bit positions follow linux/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h.
    Iterating an instance, or converting it to ``str``, yields the *names*
    of the bits that are set, in declaration order (not member objects).
    """

    # Bits 0-7
    PPT0 = 1 << 0
    PPT1 = 1 << 1
    PPT2 = 1 << 2
    PPT3 = 1 << 3
    SPL = 1 << 4
    FPPT = 1 << 5
    SPPT = 1 << 6
    SPPT_APU = 1 << 7

    # Bits 16-23
    TDC_GFX = 1 << 16
    TDC_SOC = 1 << 17
    TDC_MEM = 1 << 18
    TDC_VDD = 1 << 19
    TDC_CVIP = 1 << 20
    EDC_CPU = 1 << 21
    EDC_GFX = 1 << 22
    APCC = 1 << 23

    # Bits 32-47
    TEMP_GPU = 1 << 32
    TEMP_CORE = 1 << 33
    TEMP_MEM = 1 << 34
    TEMP_EDGE = 1 << 35
    TEMP_HOTSPOT = 1 << 36
    TEMP_SOC = 1 << 37
    TEMP_VR_GFX = 1 << 38
    TEMP_VR_SOC = 1 << 39
    TEMP_VR_MEM0 = 1 << 40
    TEMP_VR_MEM1 = 1 << 41
    TEMP_LIQUID0 = 1 << 42
    TEMP_LIQUID1 = 1 << 43
    VRHOT0 = 1 << 44
    VRHOT1 = 1 << 45
    PROCHOT_CPU = 1 << 46
    PROCHOT_GFX = 1 << 47

    # Bits 56-57
    PPM = 1 << 56
    FIT = 1 << 57

    def active(self):
        """Return a generator over the names of all members whose bit is set.

        Bits of the value that have no named member are silently ignored.
        """
        # Resolve members through the class mapping instead of getattr() on
        # the instance: reaching one member through another raises
        # AttributeError on Python 3.11.
        value = self.value
        return (name
                for name, member in type(self).__members__.items()
                if member.value & value)

    def __iter__(self):
        """Iterate over the names of the set bits (see :meth:`active`)."""
        return self.active()

    def __str__(self):
        """Return the names of the set bits joined by ``', '``."""
        return ', '.join(self.active())
class GpuMetrics(ctypes.Structure):
    """Base class for tables decoded from a raw bytes buffer.

    ``cls(buf)`` copies ``buf`` into a new structure.  ``buf`` may be longer
    than the structure (trailing bytes are ignored); a shorter buffer makes
    ``from_buffer_copy`` raise ``ValueError``.

    Iterating an instance yields ``(field_name, value)`` pairs in field
    order; array fields are yielded as plain lists so the result can be
    serialised to JSON.
    """

    # The metrics payload handed to the subclasses starts right after the
    # 4-byte common header, i.e. at file offset 4.  Natural alignment computed
    # by ctypes relative to the payload start would insert padding that the
    # kernel structure (aligned relative to file offset 0) does not contain,
    # shifting every later field.  Packing disables implicit padding; the
    # subclasses spell out all padding explicitly instead.
    _pack_ = 1
    # Python 3.14+ wants the layout named explicitly whenever _pack_ is set;
    # older versions ignore this attribute.
    _layout_ = 'ms'

    def __new__(cls, buf):
        return cls.from_buffer_copy(buf)

    def __init__(self, data):
        # Swallow the buffer argument: ctypes.Structure.__init__ would
        # otherwise try to use it as a positional field initialiser.
        pass

    def _field_value(self, name):
        """Return field ``name``, with ctypes arrays converted to lists."""
        value = getattr(self, name)
        if isinstance(value, ctypes.Array):
            return list(value)
        return value

    def __iter__(self):
        return ((f[0], self._field_value(f[0])) for f in self._fields_)

    def __str__(self):
        lines = ['{}: {}'.format(name, value) for name, value in self]
        return '> {}\n'.format(type(self).__name__) + '\n'.join(lines)


class MetricsTableHeader(GpuMetrics):
    """Common header at the start of every metrics table.

    ``structure_size`` is the size of the whole table including this header.
    """
    _fields_ = [
        ('structure_size', ctypes.c_uint16),
        ('format_revision', ctypes.c_uint8),
        ('content_revision', ctypes.c_uint8),
    ]


# AMD GPU metrics defined in
# linux/drivers/gpu/drm/amd/include/kgd_pp_interface.h
#
# NOTE(review): the layouts below mirror that header as laid out by a 64-bit
# kernel, minus the leading common header.  Re-check them against the header
# of the kernel in use when adding or changing a revision.

# Array lengths used by the kernel structures.
_NUM_HBM_INSTANCES = 4
_NUM_CORES = 8
_NUM_L3 = 2

# Average and current clock fields shared by all v1.x tables.
_V1_CLOCK_FIELDS = [
    ('average_gfxclk_frequency', ctypes.c_uint16),
    ('average_socclk_frequency', ctypes.c_uint16),
    ('average_uclk_frequency', ctypes.c_uint16),
    ('average_vclk0_frequency', ctypes.c_uint16),
    ('average_dclk0_frequency', ctypes.c_uint16),
    ('average_vclk1_frequency', ctypes.c_uint16),
    ('average_dclk1_frequency', ctypes.c_uint16),
    ('current_gfxclk', ctypes.c_uint16),
    ('current_socclk', ctypes.c_uint16),
    ('current_uclk', ctypes.c_uint16),
    ('current_vclk0', ctypes.c_uint16),
    ('current_dclk0', ctypes.c_uint16),
    ('current_vclk1', ctypes.c_uint16),
    ('current_dclk1', ctypes.c_uint16),
]

# Temperature, activity and socket power fields shared by all v1.x tables.
_V1_SENSOR_FIELDS = [
    ('temperature_edge', ctypes.c_uint16),
    ('temperature_hotspot', ctypes.c_uint16),
    ('temperature_mem', ctypes.c_uint16),
    ('temperature_vrgfx', ctypes.c_uint16),
    ('temperature_vrsoc', ctypes.c_uint16),
    ('temperature_vrmem', ctypes.c_uint16),
    ('average_gfx_activity', ctypes.c_uint16),
    ('average_umc_activity', ctypes.c_uint16),
    ('average_mm_activity', ctypes.c_uint16),
    ('average_socket_power', ctypes.c_uint16),
]

# Complete v1.1 payload; v1.2 and v1.3 only append fields to it.
_V1_1_FIELDS = _V1_SENSOR_FIELDS + [
    ('energy_accumulator', ctypes.c_uint64),
    ('system_clock_counter', ctypes.c_uint64),
] + _V1_CLOCK_FIELDS + [
    ('throttle_status', ctypes.c_uint32),
    ('current_fan_speed', ctypes.c_uint16),
    ('pcie_link_width', ctypes.c_uint16),
    ('pcie_link_speed', ctypes.c_uint16),
    ('padding', ctypes.c_uint16),
    ('gfx_activity_acc', ctypes.c_uint32),
    ('mem_activity_acc', ctypes.c_uint32),
    ('temperature_hbm', ctypes.c_uint16 * _NUM_HBM_INSTANCES),
]

_V1_2_FIELDS = _V1_1_FIELDS + [
    ('firmware_timestamp', ctypes.c_uint64),
]

# Temperature and activity fields shared by all v2.x tables.
_V2_SENSOR_FIELDS = [
    ('temperature_gfx', ctypes.c_uint16),
    ('temperature_soc', ctypes.c_uint16),
    ('temperature_core', ctypes.c_uint16 * _NUM_CORES),
    ('temperature_l3', ctypes.c_uint16 * _NUM_L3),
    ('average_gfx_activity', ctypes.c_uint16),
    ('average_mm_activity', ctypes.c_uint16),
]

# Power, clock, throttle and fan fields shared by all v2.x tables.
_V2_POWER_CLOCK_FIELDS = [
    ('average_socket_power', ctypes.c_uint16),
    ('average_cpu_power', ctypes.c_uint16),
    ('average_soc_power', ctypes.c_uint16),
    ('average_gfx_power', ctypes.c_uint16),
    ('average_core_power', ctypes.c_uint16 * _NUM_CORES),
    ('average_gfxclk_frequency', ctypes.c_uint16),
    ('average_socclk_frequency', ctypes.c_uint16),
    ('average_uclk_frequency', ctypes.c_uint16),
    ('average_fclk_frequency', ctypes.c_uint16),
    ('average_vclk_frequency', ctypes.c_uint16),
    ('average_dclk_frequency', ctypes.c_uint16),
    ('current_gfxclk', ctypes.c_uint16),
    ('current_socclk', ctypes.c_uint16),
    ('current_uclk', ctypes.c_uint16),
    ('current_fclk', ctypes.c_uint16),
    ('current_vclk', ctypes.c_uint16),
    ('current_dclk', ctypes.c_uint16),
    ('current_coreclk', ctypes.c_uint16 * _NUM_CORES),
    ('current_l3clk', ctypes.c_uint16 * _NUM_L3),
    ('throttle_status', ctypes.c_uint32),
    ('fan_pwm', ctypes.c_uint16),
]

# Complete v2.1 payload; v2.2 only appends a field to it.
_V2_1_FIELDS = _V2_SENSOR_FIELDS + [
    ('system_clock_counter', ctypes.c_uint64),
] + _V2_POWER_CLOCK_FIELDS + [
    ('padding', ctypes.c_uint16 * 3),
]


class GpuMetrics_v1_0(GpuMetrics):
    """Payload of a v1.0 table."""
    _fields_ = [
        # The table starts with a 64-bit counter, so the compiler leaves
        # 4 bytes of alignment padding between the header and the counter.
        ('header_padding', ctypes.c_uint32),
        ('system_clock_counter', ctypes.c_uint64),
    ] + _V1_SENSOR_FIELDS + [
        ('energy_accumulator', ctypes.c_uint32),
    ] + _V1_CLOCK_FIELDS + [
        ('throttle_status', ctypes.c_uint32),
        ('current_fan_speed', ctypes.c_uint16),
        ('pcie_link_width', ctypes.c_uint8),
        ('pcie_link_speed', ctypes.c_uint8),
    ]


class GpuMetrics_v1_1(GpuMetrics):
    """Payload of a v1.1 table."""
    _fields_ = list(_V1_1_FIELDS)


class GpuMetrics_v1_2(GpuMetrics):
    """Payload of a v1.2 table: v1.1 plus ``firmware_timestamp``."""
    _fields_ = list(_V1_2_FIELDS)


class GpuMetrics_v1_3(GpuMetrics):
    """Payload of a v1.3 table: v1.2 plus voltages and the 64-bit
    ``indep_throttle_status``.
    """
    _fields_ = _V1_2_FIELDS + [
        ('voltage_soc', ctypes.c_uint16),
        ('voltage_gfx', ctypes.c_uint16),
        ('voltage_mem', ctypes.c_uint16),
        ('padding1', ctypes.c_uint16),
        # With the naturally aligned layout used before, this field pushed
        # the structure past the end of the payload; with the packed layout
        # it ends exactly at the end of the 116 payload bytes.
        ('indep_throttle_status', ctypes.c_uint64),
    ]


class GpuMetrics_v2_0(GpuMetrics):
    """Payload of a v2.0 table."""
    _fields_ = [
        # Alignment padding in front of the leading 64-bit counter, as in
        # GpuMetrics_v1_0.
        ('header_padding', ctypes.c_uint32),
        ('system_clock_counter', ctypes.c_uint64),
    ] + _V2_SENSOR_FIELDS + _V2_POWER_CLOCK_FIELDS + [
        ('padding', ctypes.c_uint16),
    ]


class GpuMetrics_v2_1(GpuMetrics):
    """Payload of a v2.1 table."""
    _fields_ = list(_V2_1_FIELDS)


class GpuMetrics_v2_2(GpuMetrics):
    """Payload of a v2.2 table: v2.1 plus the 64-bit
    ``indep_throttle_status``.
    """
    _fields_ = _V2_1_FIELDS + [
        ('indep_throttle_status', ctypes.c_uint64),
    ]
# Maps (format_revision, content_revision) to the structure decoding it.
_METRICS_BY_REVISION = {
    (1, 0): GpuMetrics_v1_0,
    (1, 1): GpuMetrics_v1_1,
    (1, 2): GpuMetrics_v1_2,
    (1, 3): GpuMetrics_v1_3,
    (2, 0): GpuMetrics_v2_0,
    (2, 1): GpuMetrics_v2_1,
    (2, 2): GpuMetrics_v2_2,
}


def _read_metrics(filename):
    """Read one gpu_metrics file and return ``(header, metrics)``.

    Raises ``ValueError`` when the file is truncated, when its length does
    not match the size announced in the header, or when the revision is not
    supported.
    """
    with open(filename, mode='rb') as fh:
        raw_header = fh.read(COMMON_HEADER_SIZE)
        payload = fh.read()

    # Explicit checks instead of assert: asserts vanish under ``python -O``.
    if len(raw_header) != COMMON_HEADER_SIZE:
        raise ValueError("{}: file too short for a metrics header".format(
            filename))
    header = MetricsTableHeader(raw_header)
    # structure_size counts the header too, and the file must contain
    # exactly one table.
    if len(payload) + COMMON_HEADER_SIZE != header.structure_size:
        raise ValueError("{}: header announces {} bytes, file has {}".format(
            filename, header.structure_size,
            len(payload) + COMMON_HEADER_SIZE))

    revision = (header.format_revision, header.content_revision)
    try:
        metrics_type = _METRICS_BY_REVISION[revision]
    except KeyError:
        raise ValueError("Unsupported metrics v{}.{}".format(
            header.format_revision, header.content_revision)) from None
    return header, metrics_type(payload)


def _throttle_flags(metrics):
    """Return the ThrottleStatus decoded from a metrics table.

    ThrottleStatus defines bits up to 57, which the 32-bit throttle_status
    field cannot carry, so the 64-bit indep_throttle_status is used when
    the table has one.  Tables without it fall back to throttle_status.
    """
    raw = getattr(metrics, 'indep_throttle_status', metrics.throttle_status)
    return ThrottleStatus(raw)


def main():
    """Decode every file given on the command line and print the result."""
    parser = argparse.ArgumentParser()
    parser.add_argument('files', nargs='+',
                        help='Path to gpu_metrics file under /sys')
    parser.add_argument('-j', '--json',
                        help='Format output as JSON', action="store_true")
    args = parser.parse_args()

    for filename in args.files:
        header, metrics = _read_metrics(filename)
        ts = _throttle_flags(metrics)

        if args.json:
            # One JSON object per file: path, header fields, metric fields,
            # then the decoded flag names.
            print(dumps(dict([
                ("path", filename),
                *header,
                *metrics,
                ("throttle_status_flags", list(ts)),
            ])))
        else:
            print(filename)
            print(header)
            print(metrics)
            print("throttle_status_flags:", ts)


if __name__ == "__main__":
    main()
@Umio-Yasuno
Copy link

@leuc amdgpu_metrics.py will give partially incorrect results because _pack_ = 1 is not set.

@shmerl
Copy link

shmerl commented Mar 18, 2025

Does AMD document the layout of this metrics file anywhere, or is reading the driver code the only way to analyze it?

@Umio-Yasuno
Copy link

@shmerl
The gpu_metrics structure is defined in drivers/gpu/drm/amd/include/kgd_pp_interface.h.
However, the units of the fields may vary depending on the device and firmware.

https://github.com/torvalds/linux/blob/master/drivers/gpu/drm/amd/include/kgd_pp_interface.h

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment