Skip to content

Instantly share code, notes, and snippets.

@leuc
Last active July 16, 2024 18:28
Show Gist options
  • Save leuc/e45f4dc64dc1db870e4bad1c436228bb to your computer and use it in GitHub Desktop.
Save leuc/e45f4dc64dc1db870e4bad1c436228bb to your computer and use it in GitHub Desktop.
Decode AMD GPU Metrics from SysFS
#!/usr/bin/env python3
#
# SPDX-License-Identifier: GPL-3.0-or-later
#
# amdgpu_metrics.py decode amdgpu metrics from sysfs
# Copyright (C) 2021 leuc
#
# This program is free software: you can redistribute it and/or modify it under the
# terms of the GNU Affero General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any later
# version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
# PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License along
# with this program. If not, see <https://www.gnu.org/licenses/>.
import argparse
import ctypes
from json import dumps
from enum import IntFlag
# Number of bytes read from the start of each gpu_metrics file and decoded as
# MetricsTableHeader (uint16 + uint8 + uint8) before the payload is parsed.
COMMON_HEADER_SIZE = 4
class ThrottleStatus(IntFlag):
    """Throttler bit flags that can be decoded from a raw status integer.

    Iterating an instance, or converting it to ``str``, yields the *names* of
    the flags whose bit is set in the instance's value; bits without a named
    flag are ignored.
    """

    # linux/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
    PPT0 = 1 << 0
    PPT1 = 1 << 1
    PPT2 = 1 << 2
    PPT3 = 1 << 3
    SPL = 1 << 4
    FPPT = 1 << 5
    SPPT = 1 << 6
    SPPT_APU = 1 << 7

    TDC_GFX = 1 << 16
    TDC_SOC = 1 << 17
    TDC_MEM = 1 << 18
    TDC_VDD = 1 << 19
    TDC_CVIP = 1 << 20
    EDC_CPU = 1 << 21
    EDC_GFX = 1 << 22
    APCC = 1 << 23

    TEMP_GPU = 1 << 32
    TEMP_CORE = 1 << 33
    TEMP_MEM = 1 << 34
    TEMP_EDGE = 1 << 35
    TEMP_HOTSPOT = 1 << 36
    TEMP_SOC = 1 << 37
    TEMP_VR_GFX = 1 << 38
    TEMP_VR_SOC = 1 << 39
    TEMP_VR_MEM0 = 1 << 40
    TEMP_VR_MEM1 = 1 << 41
    TEMP_LIQUID0 = 1 << 42
    TEMP_LIQUID1 = 1 << 43
    VRHOT0 = 1 << 44
    VRHOT1 = 1 << 45
    PROCHOT_CPU = 1 << 46
    PROCHOT_GFX = 1 << 47

    PPM = 1 << 56
    FIT = 1 << 57

    def active(self):
        """Return a generator over the names of all flags set in this value."""
        flag_table = type(self).__members__.items()
        return (
            flag_name
            for flag_name, flag in flag_table
            if (flag._value_ & self.value) != 0
        )

    def __iter__(self):
        """Iterate over the names (not the members) of the active flags."""
        return self.active()

    def __str__(self):
        """Return the active flag names joined by ``', '``."""
        names = self.active()
        return ', '.join(names)
# Array lengths used by the kernel's gpu_metrics structures
# (linux/drivers/gpu/drm/amd/include/kgd_pp_interface.h).
_NUM_HBM_INSTANCES = 4
_NUM_CPU_CORES = 8
_NUM_L3_CACHES = 2


def _to_python(value):
    """Return *value* as a plain Python object (ctypes arrays become lists)."""
    if isinstance(value, ctypes.Array):
        return list(value)
    return value


class GpuMetrics(ctypes.Structure):
    """Base class for binary tables decoded from a bytes-like buffer.

    ``SomeTable(buf)`` copies ``ctypes.sizeof(SomeTable)`` bytes from the start
    of *buf*; ``from_buffer_copy`` raises ``ValueError`` when *buf* is shorter
    than that. Iterating an instance yields ``(field_name, value)`` pairs with
    array fields converted to lists, so ``dict(instance)`` is JSON-friendly.
    """

    # The payload structures are parsed from a buffer that starts *after* the
    # 4-byte common header. ctypes would otherwise insert alignment padding
    # relative to that shifted origin (e.g. in front of every uint64), moving
    # all following fields by 4 bytes. Packing disables the implicit padding;
    # the padding the kernel layout needs is spelled out as explicit fields.
    _pack_ = 1
    # Python 3.14+ wants the layout named explicitly whenever _pack_ is used;
    # older versions ignore this attribute.
    _layout_ = "ms"

    def __new__(cls, buf):
        return cls.from_buffer_copy(buf)

    def __init__(self, data):
        # Swallow the constructor argument: the instance was already filled
        # from the buffer in __new__, and Structure.__init__ would otherwise
        # try to assign the buffer to the first field.
        pass

    def __iter__(self):
        return (
            (field[0], _to_python(getattr(self, field[0])))
            for field in self._fields_
        )

    def __str__(self):
        lines = ['{}: {}'.format(name, value) for name, value in self]
        return '> {}\n'.format(type(self).__name__) + '\n'.join(lines)


class MetricsTableHeader(GpuMetrics):
    """Common 4-byte header found at the start of every gpu_metrics table."""

    _fields_ = [
        ('structure_size', ctypes.c_uint16),
        ('format_revision', ctypes.c_uint8),
        ('content_revision', ctypes.c_uint8),
    ]


# AMD GPU metrics defined in
# linux/drivers/gpu/drm/amd/include/kgd_pp_interface.h
#
# Every structure below describes the bytes that FOLLOW the common header.
class GpuMetrics_v1_0(GpuMetrics):
    """gpu_metrics v1.0 payload.

    NOTE(review): the kernel header describes v1.0 as not naturally aligned.
    On ABIs that align uint64_t to 8 bytes the compiler presumably inserts
    4 bytes of padding between the common header and system_clock_counter,
    which this layout does not model - confirm against real v1.0 output.
    """

    _fields_ = [
        ('system_clock_counter', ctypes.c_uint64),
        ('temperature_edge', ctypes.c_uint16),
        ('temperature_hotspot', ctypes.c_uint16),
        ('temperature_mem', ctypes.c_uint16),
        ('temperature_vrgfx', ctypes.c_uint16),
        ('temperature_vrsoc', ctypes.c_uint16),
        ('temperature_vrmem', ctypes.c_uint16),
        ('average_gfx_activity', ctypes.c_uint16),
        ('average_umc_activity', ctypes.c_uint16),
        ('average_mm_activity', ctypes.c_uint16),
        ('average_socket_power', ctypes.c_uint16),
        ('energy_accumulator', ctypes.c_uint32),
        ('average_gfxclk_frequency', ctypes.c_uint16),
        ('average_socclk_frequency', ctypes.c_uint16),
        ('average_uclk_frequency', ctypes.c_uint16),
        ('average_vclk0_frequency', ctypes.c_uint16),
        ('average_dclk0_frequency', ctypes.c_uint16),
        ('average_vclk1_frequency', ctypes.c_uint16),
        ('average_dclk1_frequency', ctypes.c_uint16),
        ('current_gfxclk', ctypes.c_uint16),
        ('current_socclk', ctypes.c_uint16),
        ('current_uclk', ctypes.c_uint16),
        ('current_vclk0', ctypes.c_uint16),
        ('current_dclk0', ctypes.c_uint16),
        ('current_vclk1', ctypes.c_uint16),
        ('current_dclk1', ctypes.c_uint16),
        ('throttle_status', ctypes.c_uint32),
        ('current_fan_speed', ctypes.c_uint16),
        ('pcie_link_width', ctypes.c_uint8),
        ('pcie_link_speed', ctypes.c_uint8),
    ]


class GpuMetrics_v1_1(GpuMetrics):
    """gpu_metrics v1.1 payload (92 bytes after the 4-byte header)."""

    _fields_ = [
        ('temperature_edge', ctypes.c_uint16),
        ('temperature_hotspot', ctypes.c_uint16),
        ('temperature_mem', ctypes.c_uint16),
        ('temperature_vrgfx', ctypes.c_uint16),
        ('temperature_vrsoc', ctypes.c_uint16),
        ('temperature_vrmem', ctypes.c_uint16),
        ('average_gfx_activity', ctypes.c_uint16),
        ('average_umc_activity', ctypes.c_uint16),
        ('average_mm_activity', ctypes.c_uint16),
        ('average_socket_power', ctypes.c_uint16),
        ('energy_accumulator', ctypes.c_uint64),
        ('system_clock_counter', ctypes.c_uint64),
        ('average_gfxclk_frequency', ctypes.c_uint16),
        ('average_socclk_frequency', ctypes.c_uint16),
        ('average_uclk_frequency', ctypes.c_uint16),
        ('average_vclk0_frequency', ctypes.c_uint16),
        ('average_dclk0_frequency', ctypes.c_uint16),
        ('average_vclk1_frequency', ctypes.c_uint16),
        ('average_dclk1_frequency', ctypes.c_uint16),
        ('current_gfxclk', ctypes.c_uint16),
        ('current_socclk', ctypes.c_uint16),
        ('current_uclk', ctypes.c_uint16),
        ('current_vclk0', ctypes.c_uint16),
        ('current_dclk0', ctypes.c_uint16),
        ('current_vclk1', ctypes.c_uint16),
        ('current_dclk1', ctypes.c_uint16),
        ('throttle_status', ctypes.c_uint32),
        ('current_fan_speed', ctypes.c_uint16),
        ('pcie_link_width', ctypes.c_uint16),
        ('pcie_link_speed', ctypes.c_uint16),
        ('padding', ctypes.c_uint16),
        ('gfx_activity_acc', ctypes.c_uint32),
        ('mem_activity_acc', ctypes.c_uint32),
        # One entry per HBM instance, not a single scalar.
        ('temperature_hbm', ctypes.c_uint16 * _NUM_HBM_INSTANCES),
    ]


class GpuMetrics_v1_2(GpuMetrics):
    """gpu_metrics v1.2 payload: v1.1 plus the firmware timestamp."""

    _fields_ = GpuMetrics_v1_1._fields_ + [
        ('firmware_timestamp', ctypes.c_uint64),
    ]


class GpuMetrics_v1_3(GpuMetrics):
    """gpu_metrics v1.3 payload: v1.2 plus voltages and indep_throttle_status."""

    _fields_ = GpuMetrics_v1_2._fields_ + [
        ('voltage_soc', ctypes.c_uint16),
        ('voltage_gfx', ctypes.c_uint16),
        ('voltage_mem', ctypes.c_uint16),
        # 16-bit pad so the following uint64 lands on an 8-byte boundary of
        # the full (header-inclusive) structure.
        ('padding1', ctypes.c_uint16),
        ('indep_throttle_status', ctypes.c_uint64),
    ]


class GpuMetrics_v2_0(GpuMetrics):
    """gpu_metrics v2.0 (APU) payload.

    NOTE(review): like v1.0, this revision starts with a uint64 directly
    after the 4-byte header; on ABIs that align uint64_t to 8 bytes there are
    presumably 4 padding bytes in front of system_clock_counter that this
    layout does not model - confirm against real v2.0 output.
    """

    _fields_ = [
        ('system_clock_counter', ctypes.c_uint64),
        ('temperature_gfx', ctypes.c_uint16),
        ('temperature_soc', ctypes.c_uint16),
        ('temperature_core', ctypes.c_uint16 * _NUM_CPU_CORES),
        ('temperature_l3', ctypes.c_uint16 * _NUM_L3_CACHES),
        ('average_gfx_activity', ctypes.c_uint16),
        ('average_mm_activity', ctypes.c_uint16),
        ('average_socket_power', ctypes.c_uint16),
        ('average_cpu_power', ctypes.c_uint16),
        ('average_soc_power', ctypes.c_uint16),
        ('average_gfx_power', ctypes.c_uint16),
        ('average_core_power', ctypes.c_uint16 * _NUM_CPU_CORES),
        ('average_gfxclk_frequency', ctypes.c_uint16),
        ('average_socclk_frequency', ctypes.c_uint16),
        ('average_uclk_frequency', ctypes.c_uint16),
        ('average_fclk_frequency', ctypes.c_uint16),
        ('average_vclk_frequency', ctypes.c_uint16),
        ('average_dclk_frequency', ctypes.c_uint16),
        ('current_gfxclk', ctypes.c_uint16),
        ('current_socclk', ctypes.c_uint16),
        ('current_uclk', ctypes.c_uint16),
        ('current_fclk', ctypes.c_uint16),
        ('current_vclk', ctypes.c_uint16),
        ('current_dclk', ctypes.c_uint16),
        ('current_coreclk', ctypes.c_uint16 * _NUM_CPU_CORES),
        ('current_l3clk', ctypes.c_uint16 * _NUM_L3_CACHES),
        ('throttle_status', ctypes.c_uint32),
        ('fan_pwm', ctypes.c_uint16),
        ('padding', ctypes.c_uint16),
    ]


class GpuMetrics_v2_1(GpuMetrics):
    """gpu_metrics v2.1 (APU) payload (116 bytes after the 4-byte header)."""

    _fields_ = [
        ('temperature_gfx', ctypes.c_uint16),
        ('temperature_soc', ctypes.c_uint16),
        ('temperature_core', ctypes.c_uint16 * _NUM_CPU_CORES),
        ('temperature_l3', ctypes.c_uint16 * _NUM_L3_CACHES),
        ('average_gfx_activity', ctypes.c_uint16),
        ('average_mm_activity', ctypes.c_uint16),
        ('system_clock_counter', ctypes.c_uint64),
        ('average_socket_power', ctypes.c_uint16),
        ('average_cpu_power', ctypes.c_uint16),
        ('average_soc_power', ctypes.c_uint16),
        ('average_gfx_power', ctypes.c_uint16),
        ('average_core_power', ctypes.c_uint16 * _NUM_CPU_CORES),
        ('average_gfxclk_frequency', ctypes.c_uint16),
        ('average_socclk_frequency', ctypes.c_uint16),
        ('average_uclk_frequency', ctypes.c_uint16),
        ('average_fclk_frequency', ctypes.c_uint16),
        ('average_vclk_frequency', ctypes.c_uint16),
        ('average_dclk_frequency', ctypes.c_uint16),
        ('current_gfxclk', ctypes.c_uint16),
        ('current_socclk', ctypes.c_uint16),
        ('current_uclk', ctypes.c_uint16),
        ('current_fclk', ctypes.c_uint16),
        ('current_vclk', ctypes.c_uint16),
        ('current_dclk', ctypes.c_uint16),
        ('current_coreclk', ctypes.c_uint16 * _NUM_CPU_CORES),
        ('current_l3clk', ctypes.c_uint16 * _NUM_L3_CACHES),
        ('throttle_status', ctypes.c_uint32),
        ('fan_pwm', ctypes.c_uint16),
        # Three 16-bit pads round the full structure up to a multiple of 8.
        ('padding', ctypes.c_uint16 * 3),
    ]


class GpuMetrics_v2_2(GpuMetrics):
    """gpu_metrics v2.2 (APU) payload: v2.1 plus indep_throttle_status."""

    _fields_ = GpuMetrics_v2_1._fields_ + [
        ('indep_throttle_status', ctypes.c_uint64),
    ]
def main():
    """Decode every gpu_metrics file named on the command line and print it.

    Each file must consist of the 4-byte common header followed by exactly
    ``structure_size - COMMON_HEADER_SIZE`` payload bytes.

    Raises:
        ValueError: if a file is truncated, has trailing data, or reports a
            format/content revision this script has no layout for.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('files', nargs='+',
                        help='Path to gpu_metrics file under /sys')
    parser.add_argument('-j', '--json',
                        help='Format output as JSON', action="store_true")
    args = parser.parse_args()

    # (format_revision, content_revision) -> payload layout
    layouts = {
        (1, 0): GpuMetrics_v1_0,
        (1, 1): GpuMetrics_v1_1,
        (1, 2): GpuMetrics_v1_2,
        (1, 3): GpuMetrics_v1_3,
        (2, 0): GpuMetrics_v2_0,
        (2, 1): GpuMetrics_v2_1,
        (2, 2): GpuMetrics_v2_2,
    }

    for filename in args.files:
        with open(filename, mode='rb') as fh:
            raw_header = fh.read(COMMON_HEADER_SIZE)
            buf = fh.read()

        # Explicit checks instead of assert: assertions vanish under -O.
        if len(raw_header) != COMMON_HEADER_SIZE:
            raise ValueError("{}: file too short for metrics header".format(
                filename))
        header = MetricsTableHeader(raw_header)
        if header.structure_size <= 0:
            raise ValueError("{}: header reports empty structure".format(
                filename))
        if len(buf) + COMMON_HEADER_SIZE != header.structure_size:
            raise ValueError(
                "{}: header reports {} bytes but file holds {}".format(
                    filename, header.structure_size,
                    len(buf) + COMMON_HEADER_SIZE))

        revision = (header.format_revision, header.content_revision)
        if revision not in layouts:
            raise ValueError("Unsupported metrics v{}.{}".format(
                header.format_revision, header.content_revision))
        metrics = layouts[revision](buf)

        # ThrottleStatus lists the 64-bit throttler bits from amdgpu_smu.h, so
        # decode indep_throttle_status whenever the revision provides it.
        # NOTE(review): for older revisions we fall back to the 32-bit
        # throttle_status as before; those bits are presumably ASIC specific
        # and may not correspond to the ThrottleStatus names - confirm.
        ts = ThrottleStatus(
            getattr(metrics, 'indep_throttle_status', metrics.throttle_status))

        if args.json:
            record = dict([
                ("path", filename),
                *header,
                *metrics,
                ('throttle_status_flags', list(ts)),
            ])
            print(dumps(record))
        else:
            print(filename)
            print(header)
            print(metrics)
            print("throttle_status_flags:", ts)


if __name__ == "__main__":
    main()
@dlm21
Copy link

dlm21 commented Jan 5, 2023

Thanks. I used the only benchmark I have installed at the moment, FFXIV Endwalker, and it seemed to bump the score a little, though it's mostly CPU-bottlenecked. The main thing I noticed is that the GPU stayed at higher clocks and pulled maximum watts most of the time rather than dropping off as soon as possible. It still goes back to normal during idle / at the desktop, so I'll set this as the default for a while. Thanks!
I still see the ever-changing "throttle_status_flags", like PPT0, PPT1, PPT2, FPPT, TDC_CVIP, but I'm pretty sure those are red herrings and not really indicative of a real problem or of any actual throttling.

@Umio-Yasuno
Copy link

temperature_hbm, temperature_core, temperature_l3, average_core_power, current_coreclk, current_l3clk, padding are arrays.
Therefore, it looks like the data is misaligned.

> MetricsTableHeader
structure_size: 128
format_revision: 2
content_revision: 2
> GpuMetrics_v2_2
temperature_gfx: 3850
temperature_soc: 3925
temperature_core: 3850
temperature_l3: 3975
average_gfx_activity: 3875
average_mm_activity: 5250
system_clock_counter: 17287498960675
average_socket_power: 0
average_socket_power: 0
average_cpu_power: 62303
average_soc_power: 17492
average_gfx_power: 19098
average_core_power: 0
average_gfxclk_frequency: 11
average_socclk_frequency: 6978
average_uclk_frequency: 1744
average_fclk_frequency: 65535
average_vclk_frequency: 0
average_dclk_frequency: 353
current_gfxclk: 0
current_socclk: 8886
current_uclk: 351
current_fclk: 350
current_vclk: 341
current_dclk: 343
current_coreclk: 400
current_l3clk: 400
throttle_status: 104857599
fan_pwm: 400
padding: 65535
indep_throttle_status: 450359988533068176
throttle_status_flags: PPT0, PPT1, PPT2, PPT3, SPL, FPPT, SPPT, SPPT_APU, TDC_GFX, TDC_SOC, TDC_MEM, TDC_VDD, TDC_CVIP, EDC_CPU
V2_2(
    gpu_metrics_v2_2 {
        common_header: metrics_table_header {
            structure_size: 128,
            format_revision: 2,
            content_revision: 2,
        },
        temperature_gfx: 4050,
        temperature_soc: 3950,
        temperature_core: [
            3875,
            3925,
            3900,
            4000,
            4200,
            4000,
            5225,
            4050,
        ],
        temperature_l3: [
            4125,
            0,
        ],
        average_gfx_activity: 2,
        average_mm_activity: 0,
        system_clock_counter: 82267147835201,
        average_socket_power: 14,
        average_cpu_power: 8300,
        average_soc_power: 2587,
        average_gfx_power: 65535,
        average_core_power: [
            0,
            396,
            0,
            403,
            380,
            339,
            4438,
            471,
        ],
        average_gfxclk_frequency: 401,
        average_socclk_frequency: 401,
        average_uclk_frequency: 65535,
        average_fclk_frequency: 1599,
        average_vclk_frequency: 400,
        average_dclk_frequency: 65535,
        current_gfxclk: 1900,
        current_socclk: 975,
        current_uclk: 6,
        current_fclk: 1600,
        current_vclk: 400,
        current_dclk: 400,
        current_coreclk: [
            0,
            3560,
            0,
            3560,
            3560,
            3560,
            4450,
            3560,
        ],
        current_l3clk: [
            4450,
            0,
        ],
        throttle_status: 0,
        fan_pwm: 0,
        padding: [
            65535,
            65535,
            65535,
        ],
        indep_throttle_status: 0,
    },
)

https://github.com/Umio-Yasuno/libdrm-amdgpu-sys-rs/blob/main/examples/gpu_metrics.rs

@Umio-Yasuno
Copy link

@leuc amdgpu_metrics.py will give partially incorrect results because _pack_ = 1 is not set.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment