Skip to content

Instantly share code, notes, and snippets.

@leuc
Last active July 16, 2024 18:28
Show Gist options
  • Save leuc/e45f4dc64dc1db870e4bad1c436228bb to your computer and use it in GitHub Desktop.
Save leuc/e45f4dc64dc1db870e4bad1c436228bb to your computer and use it in GitHub Desktop.
Decode AMD GPU Metrics from SysFS
#!/usr/bin/env python3
#
# SPDX-License-Identifier: GPL-3.0-or-later
#
# amdgpu_metrics.py decode amdgpu metrics from sysfs
# Copyright (C) 2021 leuc
#
# This program is free software: you can redistribute it and/or modify it under the
# terms of the GNU Affero General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any later
# version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
# PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License along
# with this program. If not, see <https://www.gnu.org/licenses/>.
import argparse
import ctypes
from json import dumps
from enum import IntFlag
# Number of bytes read from the start of a gpu_metrics file to build the
# MetricsTableHeader (uint16 + uint8 + uint8).
COMMON_HEADER_SIZE = 4
class ThrottleStatus(IntFlag):
    """Named throttler bits, one flag per bit position.

    Bit positions are taken from
    linux/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h

    Iterating an instance (or calling ``str()`` on it) yields the *names*
    of the flags that are set, in definition order.
    """

    # bits 0-7
    PPT0 = 1 << 0
    PPT1 = 1 << 1
    PPT2 = 1 << 2
    PPT3 = 1 << 3
    SPL = 1 << 4
    FPPT = 1 << 5
    SPPT = 1 << 6
    SPPT_APU = 1 << 7

    # bits 16-23
    TDC_GFX = 1 << 16
    TDC_SOC = 1 << 17
    TDC_MEM = 1 << 18
    TDC_VDD = 1 << 19
    TDC_CVIP = 1 << 20
    EDC_CPU = 1 << 21
    EDC_GFX = 1 << 22
    APCC = 1 << 23

    # bits 32-47
    TEMP_GPU = 1 << 32
    TEMP_CORE = 1 << 33
    TEMP_MEM = 1 << 34
    TEMP_EDGE = 1 << 35
    TEMP_HOTSPOT = 1 << 36
    TEMP_SOC = 1 << 37
    TEMP_VR_GFX = 1 << 38
    TEMP_VR_SOC = 1 << 39
    TEMP_VR_MEM0 = 1 << 40
    TEMP_VR_MEM1 = 1 << 41
    TEMP_LIQUID0 = 1 << 42
    TEMP_LIQUID1 = 1 << 43
    VRHOT0 = 1 << 44
    VRHOT1 = 1 << 45
    PROCHOT_CPU = 1 << 46
    PROCHOT_GFX = 1 << 47

    # bits 56-57
    PPM = 1 << 56
    FIT = 1 << 57

    def active(self):
        """Return a generator over the names of all flags set in this value.

        Members are looked up through the class-level ``__members__``
        mapping rather than via ``getattr(self, name)``: reaching one enum
        member through another triggers a DeprecationWarning on
        Python 3.11.
        """
        mask = self.value
        members = type(self).__members__
        return (name for name, member in members.items()
                if member.value & mask)

    def __iter__(self):
        """Iterate over the names of the set flags (see ``active``)."""
        return self.active()

    def __str__(self):
        """Return the names of the set flags joined by ``', '``."""
        return ', '.join(self.active())
class GpuMetrics(ctypes.Structure):
    """Base class for structures decoded from a raw bytes buffer.

    Subclasses only declare ``_fields_``.  An instance is created by
    passing the raw bytes: ``SomeMetrics(buf)``.  Iterating an instance
    yields ``(field_name, value)`` pairs in declaration order and
    ``str()`` renders one ``name: value`` line per field.
    """

    def __new__(cls, buf):
        # from_buffer_copy raises ValueError when buf is shorter than the
        # structure; surplus trailing bytes are ignored.
        return cls.from_buffer_copy(buf)

    def __init__(self, data):
        # Deliberately empty: ctypes.Structure.__init__ would otherwise
        # treat the positional buffer argument as the value of the first
        # field.
        pass

    def _field_value(self, name):
        """Return the field *name* as a plain Python value.

        ctypes array fields are converted to lists so that they print
        readably and can be serialised (e.g. by ``json.dumps``); every
        other field is returned as ctypes hands it out.
        """
        value = getattr(self, name)
        if isinstance(value, ctypes.Array):
            return list(value)
        return value

    def __iter__(self):
        """Yield ``(name, value)`` for every declared field."""
        return ((f[0], self._field_value(f[0])) for f in self._fields_)

    def __str__(self):
        """Return a ``> ClassName`` heading followed by one line per field."""
        lines = ['{}: {}'.format(name, value) for name, value in self]
        return '> {}\n'.format(type(self).__name__) + '\n'.join(lines)
class MetricsTableHeader(GpuMetrics):
    """Four-byte header found at the start of every gpu_metrics file.

    The two revision numbers together identify which GpuMetrics_v*
    structure describes the bytes that follow.
    """

    _fields_ = [
        # Size in bytes of the whole table, header included.
        ("structure_size", ctypes.c_uint16),
        # Major revision of the table layout.
        ("format_revision", ctypes.c_uint8),
        # Minor revision of the table layout.
        ("content_revision", ctypes.c_uint8),
    ]
# AMD GPU metrics defined in
# linux/drivers/gpu/drm/amd/include/kgd_pp_interface.h
class GpuMetrics_v1_0(GpuMetrics):
    """Body of a v1.0 metrics table (struct gpu_metrics_v1_0).

    Decoded from the bytes that follow the 4-byte table header.

    In the kernel structure the first member after the header is a
    64-bit counter, so the compiler inserts alignment bytes between the
    header and that counter.  Those bytes are part of the data and are
    represented by ``header_padding``.
    NOTE(review): this assumes a kernel ABI that aligns uint64_t to
    8 bytes (any 64-bit kernel) - confirm if 32-bit kernels matter.
    """

    # Fields are laid out back to back exactly as listed; the buffer does
    # not start on an 8-byte boundary of the original structure, so ctypes
    # must not insert alignment gaps of its own.
    _pack_ = 1
    # Explicit layout name for Python 3.14+, ignored by older versions.
    _layout_ = 'ms'
    _fields_ = [
        ('header_padding', ctypes.c_uint32),
        ('system_clock_counter', ctypes.c_uint64),
        ('temperature_edge', ctypes.c_uint16),
        ('temperature_hotspot', ctypes.c_uint16),
        ('temperature_mem', ctypes.c_uint16),
        ('temperature_vrgfx', ctypes.c_uint16),
        ('temperature_vrsoc', ctypes.c_uint16),
        ('temperature_vrmem', ctypes.c_uint16),
        ('average_gfx_activity', ctypes.c_uint16),
        ('average_umc_activity', ctypes.c_uint16),
        ('average_mm_activity', ctypes.c_uint16),
        ('average_socket_power', ctypes.c_uint16),
        ('energy_accumulator', ctypes.c_uint32),
        ('average_gfxclk_frequency', ctypes.c_uint16),
        ('average_socclk_frequency', ctypes.c_uint16),
        ('average_uclk_frequency', ctypes.c_uint16),
        ('average_vclk0_frequency', ctypes.c_uint16),
        ('average_dclk0_frequency', ctypes.c_uint16),
        ('average_vclk1_frequency', ctypes.c_uint16),
        ('average_dclk1_frequency', ctypes.c_uint16),
        ('current_gfxclk', ctypes.c_uint16),
        ('current_socclk', ctypes.c_uint16),
        ('current_uclk', ctypes.c_uint16),
        ('current_vclk0', ctypes.c_uint16),
        ('current_dclk0', ctypes.c_uint16),
        ('current_vclk1', ctypes.c_uint16),
        ('current_dclk1', ctypes.c_uint16),
        ('throttle_status', ctypes.c_uint32),
        ('current_fan_speed', ctypes.c_uint16),
        ('pcie_link_width', ctypes.c_uint8),
        ('pcie_link_speed', ctypes.c_uint8),
    ]
class GpuMetrics_v1_1(GpuMetrics):
    """Body of a v1.1 metrics table (struct gpu_metrics_v1_1).

    Decoded from the bytes that follow the 4-byte table header.  The
    kernel's ``temperature_hbm`` array (4 entries) is exposed as
    ``temperature_hbm_0`` .. ``temperature_hbm_3``.
    """

    # The buffer starts 4 bytes into the kernel structure, so its 64-bit
    # members are not 8-byte aligned relative to the buffer.  Packing stops
    # ctypes from inserting alignment gaps that the data does not contain.
    _pack_ = 1
    # Explicit layout name for Python 3.14+, ignored by older versions.
    _layout_ = 'ms'
    _fields_ = (
        [
            ('temperature_edge', ctypes.c_uint16),
            ('temperature_hotspot', ctypes.c_uint16),
            ('temperature_mem', ctypes.c_uint16),
            ('temperature_vrgfx', ctypes.c_uint16),
            ('temperature_vrsoc', ctypes.c_uint16),
            ('temperature_vrmem', ctypes.c_uint16),
            ('average_gfx_activity', ctypes.c_uint16),
            ('average_umc_activity', ctypes.c_uint16),
            ('average_mm_activity', ctypes.c_uint16),
            ('average_socket_power', ctypes.c_uint16),
            ('energy_accumulator', ctypes.c_uint64),
            ('system_clock_counter', ctypes.c_uint64),
            ('average_gfxclk_frequency', ctypes.c_uint16),
            ('average_socclk_frequency', ctypes.c_uint16),
            ('average_uclk_frequency', ctypes.c_uint16),
            ('average_vclk0_frequency', ctypes.c_uint16),
            ('average_dclk0_frequency', ctypes.c_uint16),
            ('average_vclk1_frequency', ctypes.c_uint16),
            ('average_dclk1_frequency', ctypes.c_uint16),
            ('current_gfxclk', ctypes.c_uint16),
            ('current_socclk', ctypes.c_uint16),
            ('current_uclk', ctypes.c_uint16),
            ('current_vclk0', ctypes.c_uint16),
            ('current_dclk0', ctypes.c_uint16),
            ('current_vclk1', ctypes.c_uint16),
            ('current_dclk1', ctypes.c_uint16),
            ('throttle_status', ctypes.c_uint32),
            ('current_fan_speed', ctypes.c_uint16),
            ('pcie_link_width', ctypes.c_uint16),
            ('pcie_link_speed', ctypes.c_uint16),
            ('padding', ctypes.c_uint16),
            ('gfx_activity_acc', ctypes.c_uint32),
            ('mem_activity_acc', ctypes.c_uint32),
        ]
        + [('temperature_hbm_{}'.format(i), ctypes.c_uint16)
           for i in range(4)]
    )
class GpuMetrics_v1_2(GpuMetrics):
    """Body of a v1.2 metrics table (struct gpu_metrics_v1_2).

    Decoded from the bytes that follow the 4-byte table header.  Same as
    v1.1 with ``firmware_timestamp`` appended.  The kernel's
    ``temperature_hbm`` array (4 entries) is exposed as
    ``temperature_hbm_0`` .. ``temperature_hbm_3``.
    """

    # The buffer starts 4 bytes into the kernel structure, so its 64-bit
    # members are not 8-byte aligned relative to the buffer.  Packing stops
    # ctypes from inserting alignment gaps that the data does not contain.
    _pack_ = 1
    # Explicit layout name for Python 3.14+, ignored by older versions.
    _layout_ = 'ms'
    _fields_ = (
        [
            ('temperature_edge', ctypes.c_uint16),
            ('temperature_hotspot', ctypes.c_uint16),
            ('temperature_mem', ctypes.c_uint16),
            ('temperature_vrgfx', ctypes.c_uint16),
            ('temperature_vrsoc', ctypes.c_uint16),
            ('temperature_vrmem', ctypes.c_uint16),
            ('average_gfx_activity', ctypes.c_uint16),
            ('average_umc_activity', ctypes.c_uint16),
            ('average_mm_activity', ctypes.c_uint16),
            ('average_socket_power', ctypes.c_uint16),
            ('energy_accumulator', ctypes.c_uint64),
            ('system_clock_counter', ctypes.c_uint64),
            ('average_gfxclk_frequency', ctypes.c_uint16),
            ('average_socclk_frequency', ctypes.c_uint16),
            ('average_uclk_frequency', ctypes.c_uint16),
            ('average_vclk0_frequency', ctypes.c_uint16),
            ('average_dclk0_frequency', ctypes.c_uint16),
            ('average_vclk1_frequency', ctypes.c_uint16),
            ('average_dclk1_frequency', ctypes.c_uint16),
            ('current_gfxclk', ctypes.c_uint16),
            ('current_socclk', ctypes.c_uint16),
            ('current_uclk', ctypes.c_uint16),
            ('current_vclk0', ctypes.c_uint16),
            ('current_dclk0', ctypes.c_uint16),
            ('current_vclk1', ctypes.c_uint16),
            ('current_dclk1', ctypes.c_uint16),
            ('throttle_status', ctypes.c_uint32),
            ('current_fan_speed', ctypes.c_uint16),
            ('pcie_link_width', ctypes.c_uint16),
            ('pcie_link_speed', ctypes.c_uint16),
            ('padding', ctypes.c_uint16),
            ('gfx_activity_acc', ctypes.c_uint32),
            ('mem_activity_acc', ctypes.c_uint32),
        ]
        + [('temperature_hbm_{}'.format(i), ctypes.c_uint16)
           for i in range(4)]
        + [
            ('firmware_timestamp', ctypes.c_uint64),
        ]
    )
class GpuMetrics_v1_3(GpuMetrics):
    """Body of a v1.3 metrics table (struct gpu_metrics_v1_3).

    Decoded from the bytes that follow the 4-byte table header.  Same as
    v1.2 with the voltages, ``padding1`` and ``indep_throttle_status``
    appended.  The kernel's ``temperature_hbm`` array (4 entries) is
    exposed as ``temperature_hbm_0`` .. ``temperature_hbm_3``.

    The declared fields add up to 116 bytes, i.e. a structure_size of 120
    once the 4-byte header is counted.
    """

    # The buffer starts 4 bytes into the kernel structure, so its 64-bit
    # members are not 8-byte aligned relative to the buffer.  Packing stops
    # ctypes from inserting alignment gaps that the data does not contain
    # (such gaps are what previously shifted every field from
    # energy_accumulator onwards).
    _pack_ = 1
    # Explicit layout name for Python 3.14+, ignored by older versions.
    _layout_ = 'ms'
    _fields_ = (
        [
            ('temperature_edge', ctypes.c_uint16),
            ('temperature_hotspot', ctypes.c_uint16),
            ('temperature_mem', ctypes.c_uint16),
            ('temperature_vrgfx', ctypes.c_uint16),
            ('temperature_vrsoc', ctypes.c_uint16),
            ('temperature_vrmem', ctypes.c_uint16),
            ('average_gfx_activity', ctypes.c_uint16),
            ('average_umc_activity', ctypes.c_uint16),
            ('average_mm_activity', ctypes.c_uint16),
            ('average_socket_power', ctypes.c_uint16),
            ('energy_accumulator', ctypes.c_uint64),
            ('system_clock_counter', ctypes.c_uint64),
            ('average_gfxclk_frequency', ctypes.c_uint16),
            ('average_socclk_frequency', ctypes.c_uint16),
            ('average_uclk_frequency', ctypes.c_uint16),
            ('average_vclk0_frequency', ctypes.c_uint16),
            ('average_dclk0_frequency', ctypes.c_uint16),
            ('average_vclk1_frequency', ctypes.c_uint16),
            ('average_dclk1_frequency', ctypes.c_uint16),
            ('current_gfxclk', ctypes.c_uint16),
            ('current_socclk', ctypes.c_uint16),
            ('current_uclk', ctypes.c_uint16),
            ('current_vclk0', ctypes.c_uint16),
            ('current_dclk0', ctypes.c_uint16),
            ('current_vclk1', ctypes.c_uint16),
            ('current_dclk1', ctypes.c_uint16),
            ('throttle_status', ctypes.c_uint32),
            ('current_fan_speed', ctypes.c_uint16),
            ('pcie_link_width', ctypes.c_uint16),
            ('pcie_link_speed', ctypes.c_uint16),
            ('padding', ctypes.c_uint16),
            ('gfx_activity_acc', ctypes.c_uint32),
            ('mem_activity_acc', ctypes.c_uint32),
        ]
        + [('temperature_hbm_{}'.format(i), ctypes.c_uint16)
           for i in range(4)]
        + [
            ('firmware_timestamp', ctypes.c_uint64),
            ('voltage_soc', ctypes.c_uint16),
            ('voltage_gfx', ctypes.c_uint16),
            ('voltage_mem', ctypes.c_uint16),
            ('padding1', ctypes.c_uint16),
            ('indep_throttle_status', ctypes.c_uint64),
        ]
    )
class GpuMetrics_v2_0(GpuMetrics):
    """Body of a v2.0 metrics table (struct gpu_metrics_v2_0).

    Decoded from the bytes that follow the 4-byte table header.  Array
    members of the kernel structure are exposed as numbered fields, e.g.
    ``temperature_core_0`` .. ``temperature_core_7``.

    In the kernel structure the first member after the header is a
    64-bit counter, so the compiler inserts alignment bytes between the
    header and that counter.  Those bytes are part of the data and are
    represented by ``header_padding``.
    NOTE(review): this assumes a kernel ABI that aligns uint64_t to
    8 bytes (any 64-bit kernel) - confirm if 32-bit kernels matter.
    """

    # Fields are laid out back to back exactly as listed; the buffer does
    # not start on an 8-byte boundary of the original structure, so ctypes
    # must not insert alignment gaps of its own.
    _pack_ = 1
    # Explicit layout name for Python 3.14+, ignored by older versions.
    _layout_ = 'ms'
    _fields_ = (
        [
            ('header_padding', ctypes.c_uint32),
            ('system_clock_counter', ctypes.c_uint64),
            ('temperature_gfx', ctypes.c_uint16),
            ('temperature_soc', ctypes.c_uint16),
        ]
        + [('temperature_core_{}'.format(i), ctypes.c_uint16)
           for i in range(8)]
        + [('temperature_l3_{}'.format(i), ctypes.c_uint16)
           for i in range(2)]
        + [
            ('average_gfx_activity', ctypes.c_uint16),
            ('average_mm_activity', ctypes.c_uint16),
            ('average_socket_power', ctypes.c_uint16),
            ('average_cpu_power', ctypes.c_uint16),
            ('average_soc_power', ctypes.c_uint16),
            ('average_gfx_power', ctypes.c_uint16),
        ]
        + [('average_core_power_{}'.format(i), ctypes.c_uint16)
           for i in range(8)]
        + [
            ('average_gfxclk_frequency', ctypes.c_uint16),
            ('average_socclk_frequency', ctypes.c_uint16),
            ('average_uclk_frequency', ctypes.c_uint16),
            ('average_fclk_frequency', ctypes.c_uint16),
            ('average_vclk_frequency', ctypes.c_uint16),
            ('average_dclk_frequency', ctypes.c_uint16),
            ('current_gfxclk', ctypes.c_uint16),
            ('current_socclk', ctypes.c_uint16),
            ('current_uclk', ctypes.c_uint16),
            ('current_fclk', ctypes.c_uint16),
            ('current_vclk', ctypes.c_uint16),
            ('current_dclk', ctypes.c_uint16),
        ]
        + [('current_coreclk_{}'.format(i), ctypes.c_uint16)
           for i in range(8)]
        + [('current_l3clk_{}'.format(i), ctypes.c_uint16)
           for i in range(2)]
        + [
            ('throttle_status', ctypes.c_uint32),
            ('fan_pwm', ctypes.c_uint16),
            ('padding', ctypes.c_uint16),
        ]
    )
class GpuMetrics_v2_1(GpuMetrics):
    """Body of a v2.1 metrics table (struct gpu_metrics_v2_1).

    Decoded from the bytes that follow the 4-byte table header.  Array
    members of the kernel structure are exposed as numbered fields, e.g.
    ``temperature_core_0`` .. ``temperature_core_7`` and
    ``padding_0`` .. ``padding_2``.
    """

    # The buffer starts 4 bytes into the kernel structure, so its 64-bit
    # members are not 8-byte aligned relative to the buffer.  Packing stops
    # ctypes from inserting alignment gaps that the data does not contain.
    _pack_ = 1
    # Explicit layout name for Python 3.14+, ignored by older versions.
    _layout_ = 'ms'
    _fields_ = (
        [
            ('temperature_gfx', ctypes.c_uint16),
            ('temperature_soc', ctypes.c_uint16),
        ]
        + [('temperature_core_{}'.format(i), ctypes.c_uint16)
           for i in range(8)]
        + [('temperature_l3_{}'.format(i), ctypes.c_uint16)
           for i in range(2)]
        + [
            ('average_gfx_activity', ctypes.c_uint16),
            ('average_mm_activity', ctypes.c_uint16),
            ('system_clock_counter', ctypes.c_uint64),
            ('average_socket_power', ctypes.c_uint16),
            ('average_cpu_power', ctypes.c_uint16),
            ('average_soc_power', ctypes.c_uint16),
            ('average_gfx_power', ctypes.c_uint16),
        ]
        + [('average_core_power_{}'.format(i), ctypes.c_uint16)
           for i in range(8)]
        + [
            ('average_gfxclk_frequency', ctypes.c_uint16),
            ('average_socclk_frequency', ctypes.c_uint16),
            ('average_uclk_frequency', ctypes.c_uint16),
            ('average_fclk_frequency', ctypes.c_uint16),
            ('average_vclk_frequency', ctypes.c_uint16),
            ('average_dclk_frequency', ctypes.c_uint16),
            ('current_gfxclk', ctypes.c_uint16),
            ('current_socclk', ctypes.c_uint16),
            ('current_uclk', ctypes.c_uint16),
            ('current_fclk', ctypes.c_uint16),
            ('current_vclk', ctypes.c_uint16),
            ('current_dclk', ctypes.c_uint16),
        ]
        + [('current_coreclk_{}'.format(i), ctypes.c_uint16)
           for i in range(8)]
        + [('current_l3clk_{}'.format(i), ctypes.c_uint16)
           for i in range(2)]
        + [
            ('throttle_status', ctypes.c_uint32),
            ('fan_pwm', ctypes.c_uint16),
        ]
        + [('padding_{}'.format(i), ctypes.c_uint16) for i in range(3)]
    )
class GpuMetrics_v2_2(GpuMetrics):
    """Body of a v2.2 metrics table (struct gpu_metrics_v2_2).

    Decoded from the bytes that follow the 4-byte table header.  Same as
    v2.1 with ``indep_throttle_status`` appended.  Array members of the
    kernel structure are exposed as numbered fields, e.g.
    ``temperature_core_0`` .. ``temperature_core_7`` and
    ``padding_0`` .. ``padding_2``.

    The declared fields add up to 124 bytes, i.e. a structure_size of 128
    once the 4-byte header is counted.
    """

    # The buffer starts 4 bytes into the kernel structure, so its 64-bit
    # members are not 8-byte aligned relative to the buffer.  Packing stops
    # ctypes from inserting alignment gaps that the data does not contain.
    _pack_ = 1
    # Explicit layout name for Python 3.14+, ignored by older versions.
    _layout_ = 'ms'
    _fields_ = (
        [
            ('temperature_gfx', ctypes.c_uint16),
            ('temperature_soc', ctypes.c_uint16),
        ]
        + [('temperature_core_{}'.format(i), ctypes.c_uint16)
           for i in range(8)]
        + [('temperature_l3_{}'.format(i), ctypes.c_uint16)
           for i in range(2)]
        + [
            ('average_gfx_activity', ctypes.c_uint16),
            ('average_mm_activity', ctypes.c_uint16),
            ('system_clock_counter', ctypes.c_uint64),
            ('average_socket_power', ctypes.c_uint16),
            ('average_cpu_power', ctypes.c_uint16),
            ('average_soc_power', ctypes.c_uint16),
            ('average_gfx_power', ctypes.c_uint16),
        ]
        + [('average_core_power_{}'.format(i), ctypes.c_uint16)
           for i in range(8)]
        + [
            ('average_gfxclk_frequency', ctypes.c_uint16),
            ('average_socclk_frequency', ctypes.c_uint16),
            ('average_uclk_frequency', ctypes.c_uint16),
            ('average_fclk_frequency', ctypes.c_uint16),
            ('average_vclk_frequency', ctypes.c_uint16),
            ('average_dclk_frequency', ctypes.c_uint16),
            ('current_gfxclk', ctypes.c_uint16),
            ('current_socclk', ctypes.c_uint16),
            ('current_uclk', ctypes.c_uint16),
            ('current_fclk', ctypes.c_uint16),
            ('current_vclk', ctypes.c_uint16),
            ('current_dclk', ctypes.c_uint16),
        ]
        + [('current_coreclk_{}'.format(i), ctypes.c_uint16)
           for i in range(8)]
        + [('current_l3clk_{}'.format(i), ctypes.c_uint16)
           for i in range(2)]
        + [
            ('throttle_status', ctypes.c_uint32),
            ('fan_pwm', ctypes.c_uint16),
        ]
        + [('padding_{}'.format(i), ctypes.c_uint16) for i in range(3)]
        + [
            ('indep_throttle_status', ctypes.c_uint64),
        ]
    )
# Maps (format_revision, content_revision) from the table header to the
# structure class that decodes the bytes following the header.
_METRICS_CLASSES = {
    (1, 0): GpuMetrics_v1_0,
    (1, 1): GpuMetrics_v1_1,
    (1, 2): GpuMetrics_v1_2,
    (1, 3): GpuMetrics_v1_3,
    (2, 0): GpuMetrics_v2_0,
    (2, 1): GpuMetrics_v2_1,
    (2, 2): GpuMetrics_v2_2,
}

# A 64-bit field with every bit set.
_ALL_ONES_U64 = 0xFFFFFFFFFFFFFFFF


def _load_metrics(filename):
    """Read one gpu_metrics file and return ``(header, metrics)``.

    Raises ValueError when the file is shorter than the table header, when
    its length disagrees with the header's ``structure_size``, when the
    revision is not supported, or when the body is too short for the
    selected structure.
    """
    with open(filename, mode='rb') as fh:
        data = fh.read()

    # Explicit checks instead of assert: assertions vanish under "python -O".
    if len(data) < COMMON_HEADER_SIZE:
        raise ValueError("{}: file too short for a metrics header".format(
            filename))
    header = MetricsTableHeader(data[:COMMON_HEADER_SIZE])
    if header.structure_size != len(data):
        raise ValueError(
            "{}: header announces {} bytes but file has {}".format(
                filename, header.structure_size, len(data)))

    revision = (header.format_revision, header.content_revision)
    try:
        metrics_class = _METRICS_CLASSES[revision]
    except KeyError:
        raise ValueError(
            "Unsupported metrics v{}.{}".format(*revision)) from None
    return header, metrics_class(data[COMMON_HEADER_SIZE:])


def _throttle_flags(metrics):
    """Return the ThrottleStatus decoded from *metrics*.

    ThrottleStatus defines bits up to position 57, which only fit the
    64-bit ``indep_throttle_status`` field, so that field is preferred
    whenever the structure has it.  Structures without it fall back to the
    32-bit ``throttle_status``, as before.
    """
    indep = getattr(metrics, 'indep_throttle_status', None)
    # NOTE(review): an all-ones value is assumed to mean the driver left
    # the field unpopulated - confirm against the kernel's gpu_metrics
    # initialisation.
    if indep is None or indep == _ALL_ONES_U64:
        return ThrottleStatus(metrics.throttle_status)
    return ThrottleStatus(indep)


def main():
    """Decode every gpu_metrics file named on the command line and print it."""
    parser = argparse.ArgumentParser()
    parser.add_argument('files', nargs='+',
                        help='Path to gpu_metrics file under /sys')
    parser.add_argument('-j', '--json',
                        help='Format output as JSON', action="store_true")
    args = parser.parse_args()

    for filename in args.files:
        header, metrics = _load_metrics(filename)
        flags = _throttle_flags(metrics)
        if args.json:
            # Key order: path, header fields, metrics fields, decoded flags.
            record = {"path": filename}
            record.update(list(header))
            record.update(list(metrics))
            record['throttle_status_flags'] = list(flags)
            print(dumps(record))
        else:
            print(filename)
            print(header)
            print(metrics)
            print("throttle_status_flags:", flags)


if __name__ == "__main__":
    main()
@leuc
Copy link
Author

leuc commented Oct 30, 2021

Output with Renoir and Navi 10

Kernel: 5.15.0-051500rc7-generic

Device ID: {'device': '0x1636', 'subsystem_device': '0x09f5', 'subsystem_vendor': '0x1028', 'vendor': '0x1002'}
Decoded Device ID: Renoir
Card Model: Advanced Micro Devices, Inc. [AMD/ATI] Renoir (rev c6)
Display Card Model: Renoir

/sys/devices/pci0000:00/0000:00:08.1/0000:07:00.0/gpu_metrics
> MetricsTableHeader
structure_size: 128
format_revision: 2
content_revision: 2
> GpuMetrics_v2_2
temperature_gfx: 5550
temperature_soc: 5775
temperature_core: 5675
temperature_l3: 6275
average_gfx_activity: 5575
average_mm_activity: 5750
system_clock_counter: 1660726951650924100
average_socket_power: 0
average_socket_power: 0
average_cpu_power: 58822
average_soc_power: 22352
average_gfx_power: 11932
average_core_power: 0
average_gfxclk_frequency: 5
average_socclk_frequency: 1681
average_uclk_frequency: 1580
average_fclk_frequency: 65535
average_vclk_frequency: 6
average_dclk_frequency: 756
current_gfxclk: 21
current_socclk: 7
current_uclk: 7
current_fclk: 7
current_vclk: 6
current_dclk: 72
current_coreclk: 574
current_l3clk: 482
throttle_status: 92667903
fan_pwm: 400
padding: 65535
indep_throttle_status: 450359988533068176
ThrottleStatus.
67108864|16777216|8388608|262144|65536|32768|16384|8192|4096
|EDC_GFX|EDC_CPU|PROCHOT_GFX|PROCHOT_CPU|TDC_SOC|TDC_VDD|THM_SOC|THM_GFX|THM_CORE|SPPT_APU|SPPT|FPPT

Device ID: {'device': '0x731f', 'subsystem_device': '0x09f5', 'subsystem_vendor': '0x1028', 'vendor': '0x1002'}
Decoded Device ID: Navi 10 [Radeon RX 5600 OEM/5600 XT / 5700/5700 XT]
Card Model: Advanced Micro Devices, Inc. [AMD/ATI] Navi 10 [Radeon RX 5600 OEM/5600 XT / 5700/5700 XT] (rev c2)
Display Card Model: Navi 10 RX 5600

/sys/devices/pci0000:00/0000:00:01.1/0000:01:00.0/0000:02:00.0/0000:03:00.0/gpu_metrics
> MetricsTableHeader
structure_size: 120
format_revision: 1
content_revision: 3
> GpuMetrics_v1_3
temperature_edge: 53
temperature_hotspot: 53
temperature_mem: 58
temperature_vrgfx: 0
temperature_vrsoc: 0
temperature_vrmem: 0
average_gfx_activity: 0
average_umc_activity: 0
average_mm_activity: 65535
average_socket_power: 12
energy_accumulator: 6294297477348589567
system_clock_counter: 142426363985407644
average_gfxclk_frequency: 100
average_socclk_frequency: 65535
average_uclk_frequency: 65535
average_vclk0_frequency: 65535
average_dclk0_frequency: 65535
average_vclk1_frequency: 300
average_dclk1_frequency: 506
current_gfxclk: 100
current_socclk: 1266
current_uclk: 1085
current_vclk0: 65535
current_dclk0: 65535
current_vclk1: 0
current_dclk1: 0
throttle_status: 589823
current_fan_speed: 50
pcie_link_width: 65535
pcie_link_speed: 65535
padding: 65535
gfx_activity_acc: 4294967295
mem_activity_acc: 4294967295
temperature_hbm: 65535
firmware_timestamp: 18446744073709551615
voltage_soc: 65535
voltage_gfx: 65535
voltage_mem: 0
padding1: 0
ThrottleStatus.
524288|32768|16384|8192|4096|
EDC_GFX|EDC_CPU|PROCHOT_GFX|PROCHOT_CPU|TDC_SOC|TDC_VDD|THM_SOC|THM_GFX|THM_CORE|SPPT_APU|SPPT|FPPT

@leuc
Copy link
Author

leuc commented Oct 30, 2021

Please help to test and improve this script!

Run it against a supported graphics card and comment here with kernel version and program output.

Supported amdgpu asic families as of kernel 5.15. Minimum kernel version seems to be 5.14.

  • arcturus
  • navi10
  • sienna cichlid
  • aldebaran
  • yellow carp
  • cyan skillfish
  • vangogh
  • renoir

Usage:

wget https://gist.github.com/leuc/e45f4dc64dc1db870e4bad1c436228bb/raw/7ef897f16a5f49656fb752ddba26b8f878a168a0/amdgpu_metrics.py
chmod +x amdgpu_metrics.py
./amdgpu_metrics.py $(find /sys/devices -name gpu_metrics)

Thanks!

@leuc
Copy link
Author

leuc commented Nov 2, 2021

Now with JSON output and proper throttle status flags

$ amdgpu_metrics.py -j $(find /sys/devices -name gpu_metrics) | jq '.throttle_status_flags'
[
  "PPT0",
  "PPT1",
  "PPT2",
  "PPT3",
  "SPL",
  "FPPT",
  "SPPT",
  "SPPT_APU",
  "TDC_GFX",
  "TDC_SOC",
  "TDC_MEM",
  "TDC_CVIP",
  "EDC_CPU",
  "APCC"
]
[
  "PPT0",
  "PPT1",
  "PPT2",
  "PPT3",
  "SPL",
  "FPPT",
  "SPPT",
  "SPPT_APU",
  "TDC_VDD"
]

@dbekatli
Copy link

dbekatli commented Dec 3, 2021

Hey, thanks for the script. I stumbled upon it while searching for a way to see the utilization of the GPU's video decoder/encoder. This is the output with an RX6800(sienna cichlid) on Linux 5.15.5.
`/sys/class/drm/card0/device/gpu_metrics

MetricsTableHeader
structure_size: 120
format_revision: 1
content_revision: 3
GpuMetrics_v1_3
temperature_edge: 47
temperature_hotspot: 51
temperature_mem: 52
temperature_vrgfx: 46
temperature_vrsoc: 47
temperature_vrmem: 0
average_gfx_activity: 56
average_umc_activity: 6
average_mm_activity: 0
average_socket_power: 37
energy_accumulator: 13253972602453491712
system_clock_counter: 18446464746216491851
average_gfxclk_frequency: 989
average_socclk_frequency: 354
average_uclk_frequency: 308
average_vclk0_frequency: 354
average_dclk0_frequency: 308
average_vclk1_frequency: 500
average_dclk1_frequency: 800
current_gfxclk: 1000
current_socclk: 354
current_uclk: 308
current_vclk0: 354
current_dclk0: 308
current_vclk1: 0
current_dclk1: 0
throttle_status: 1049112
current_fan_speed: 160
pcie_link_width: 65535
pcie_link_speed: 65535
padding: 65535
gfx_activity_acc: 4294967295
mem_activity_acc: 4294967295
temperature_hbm: 65535
firmware_timestamp: 18446744073709551615
voltage_soc: 65535
voltage_gfx: 65535
voltage_mem: 0
padding1: 0
throttle_status_flags: PPT3, SPL, TDC_CVIP
`

For some reason average_mm_activity is always at 0 even when mpv is using vaapi. Do you know if that might not be getting read correctly?

@leuc
Copy link
Author

leuc commented Dec 6, 2021

Thank you for testing!

For some reason average_mm_activity is always at 0 even when mpv is using vaapi. Do you know if that might not be getting read correctly?

Let's have a look:

average_mm_activity is set from "VcnActivityPercentage" for sienna cichlid

gpu_metrics->average_mm_activity =
		use_metrics_v2 ? metrics_v2->VcnActivityPercentage : metrics->VcnActivityPercentage;

https://github.com/torvalds/linux/blob/master/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c#L3614

initialized here, with the note that it is indeed a placeholder.

  uint16_t VcnActivityPercentage  ; //place holder, David N. to provide full sequence

https://github.com/torvalds/linux/blob/master/drivers/gpu/drm/amd/pm/inc/smu11_driver_if_sienna_cichlid.h#L1401

It is either UVD or VCN depending on GPU

smu11/vangogh_ppt.c:	gpu_metrics->average_mm_activity = metrics.UvdActivity;
smu11/vangogh_ppt.c:	gpu_metrics->average_mm_activity = metrics.Current.UvdActivity;
smu11/navi10_ppt.c:	gpu_metrics->average_mm_activity = metrics.VcnActivityPercentage;
smu11/navi10_ppt.c:	gpu_metrics->average_mm_activity = metrics.VcnActivityPercentage;
smu11/sienna_cichlid_ppt.c:	gpu_metrics->average_mm_activity = use_metrics_v2 ? metrics_v2->VcnActivityPercentage : metrics->VcnActivityPercentage;
smu11/arcturus_ppt.c:	gpu_metrics->average_mm_activity = metrics.VcnActivityPercentage;
smu13/yellow_carp_ppt.c:	gpu_metrics->average_mm_activity = metrics.UvdActivity;
smu13/aldebaran_ppt.c:	gpu_metrics->average_mm_activity = 0;
smu12/renoir_ppt.c:	gpu_metrics->average_mm_activity = metrics.AverageUvdActivity;

As far as I can tell, the actual reading from the hardware happens in the firmware blobs?

@leuc
Copy link
Author

leuc commented Nov 18, 2022

Output from a Steam Deck

Linux steamdeck 5.13.0-valve21.3-1-neptune #1 SMP PREEMPT Mon, 03 Oct 2022 23:17:36 +0000 x86_64 GNU/Linux
Vendor: AMD (0x1002)
Device: AMD Custom GPU 0405 (vangogh, LLVM 13.0.0, DRM 3.45, 5.13.0-valve21.3-1-neptune) (0x163f)
Version: 22.0.2
Video memory: 1024MB
04:00.0 VGA compatible controller: Advanced Micro Devices, Inc. [AMD/ATI] VanGogh (rev ae) (prog-if 00 [VGA controller])
	Subsystem: Advanced Micro Devices, Inc. [AMD/ATI] Device 0123
	Flags: bus master, fast devsel, latency 0, IRQ 55
	Memory at f8e0000000 (64-bit, prefetchable) [size=256M]
	Memory at f8f0000000 (64-bit, prefetchable) [size=2M]
	I/O ports at 1000 [size=256]
	Memory at 80300000 (32-bit, non-prefetchable) [size=512K]
	Capabilities: [48] Vendor Specific Information: Len=08 <?>
	Capabilities: [50] Power Management version 3
	Capabilities: [64] Express Legacy Endpoint, MSI 00
	Capabilities: [a0] MSI: Enable- Count=1/4 Maskable- 64bit+
	Capabilities: [c0] MSI-X: Enable+ Count=4 Masked-
	Capabilities: [100] Vendor Specific Information: ID=0001 Rev=1 Len=010 <?>
	Capabilities: [270] Secondary PCI Express
	Capabilities: [2b0] Address Translation Service (ATS)
	Capabilities: [2c0] Page Request Interface (PRI)
	Capabilities: [2d0] Process Address Space ID (PASID)
	Capabilities: [410] Physical Layer 16.0 GT/s <?>
	Capabilities: [440] Lane Margining at the Receiver <?>
	Kernel driver in use: amdgpu
	Kernel modules: amdgpu
$ amdgpu_metrics.py /sys/devices/pci0000:00/0000:00:08.1/0000:04:00.0/gpu_metrics
> MetricsTableHeader
structure_size: 128
format_revision: 2
content_revision: 2
> GpuMetrics_v2_2
temperature_gfx: 3375
temperature_soc: 3300
temperature_core: 3275
temperature_l3: 3225
average_gfx_activity: 3350
average_mm_activity: 3225
system_clock_counter: 18446476561671520255
average_socket_power: 0
average_socket_power: 0
average_cpu_power: 18614
average_soc_power: 60124
average_gfx_power: 24820
average_core_power: 3
average_gfxclk_frequency: 2809
average_socclk_frequency: 350
average_uclk_frequency: 565
average_fclk_frequency: 84
average_vclk_frequency: 0
average_dclk_frequency: 0
current_gfxclk: 0
current_socclk: 0
current_uclk: 65535
current_fclk: 65535
current_vclk: 65535
current_dclk: 65535
current_coreclk: 0
current_l3clk: 0
throttle_status: 0
fan_pwm: 0
padding: 0
indep_throttle_status: 193376259668967624
throttle_status_flags:

@dlm21
Copy link

dlm21 commented Dec 19, 2022

I know this is kind of old, but figured I'd drop some info here for the 7900 XTX / Navi 31 (XFX Merc 310 to be specific). I'm searching out answers to why the driver is claiming my card is being throttled (according to Mangohud, and seemingly this script).

	Subsystem: XFX Limited Device 7901
	Flags: bus master, fast devsel, latency 0, IRQ 139, IOMMU group 27
	Memory at 7000000000 (64-bit, prefetchable) [size=32G]
	Memory at 7800000000 (64-bit, prefetchable) [size=256M]
	I/O ports at e000 [size=256]
	Memory at fcc00000 (32-bit, non-prefetchable) [size=1M]
	Expansion ROM at fcd00000 [disabled] [size=128K]
	Capabilities: [48] Vendor Specific Information: Len=08 <?>
	Capabilities: [50] Power Management version 3
	Capabilities: [64] Express Legacy Endpoint, MSI 00
	Capabilities: [a0] MSI: Enable+ Count=1/1 Maskable- 64bit+
	Capabilities: [100] Vendor Specific Information: ID=0001 Rev=1 Len=010 <?>
	Capabilities: [150] Advanced Error Reporting
	Capabilities: [200] Physical Resizable BAR
	Capabilities: [240] Power Budgeting <?>
	Capabilities: [270] Secondary PCI Express
	Capabilities: [2a0] Access Control Services
	Capabilities: [2d0] Process Address Space ID (PASID)
	Capabilities: [320] Latency Tolerance Reporting
	Capabilities: [410] Physical Layer 16.0 GT/s <?>
	Capabilities: [450] Lane Margining at the Receiver <?>
	Kernel driver in use: amdgpu
	Kernel modules: amdgpu
/sys/devices/pci0000:00/0000:00:03.1/0000:2d:00.0/0000:2e:00.0/0000:2f:00.0/gpu_metrics
> MetricsTableHeader
structure_size: 120
format_revision: 1
content_revision: 3
> GpuMetrics_v1_3
temperature_edge: 32
temperature_hotspot: 36
temperature_mem: 52
temperature_vrgfx: 35
temperature_vrsoc: 34
temperature_vrmem: 36
average_gfx_activity: 1
average_umc_activity: 1
average_mm_activity: 0
average_socket_power: 19
energy_accumulator: 17604753092954292224
system_clock_counter: 18446462633092584127
average_gfxclk_frequency: 42
average_socclk_frequency: 25
average_uclk_frequency: 25
average_vclk0_frequency: 25
average_dclk0_frequency: 25
average_vclk1_frequency: 1200
average_dclk1_frequency: 600
current_gfxclk: 96
current_socclk: 512
current_uclk: 512
current_vclk0: 512
current_dclk0: 512
current_vclk1: 2
current_dclk1: 0
throttle_status: 1049133
current_fan_speed: 4
pcie_link_width: 65535
pcie_link_speed: 65535
padding: 65535
gfx_activity_acc: 4294967295
mem_activity_acc: 4294967295
temperature_hbm: 65535
firmware_timestamp: 83038146081587199
voltage_soc: 698
voltage_gfx: 65535
voltage_mem: 0
padding1: 0
throttle_status_flags: PPT0, PPT2, PPT3, FPPT, TDC_CVIP

And running this script every second and grepping for throttle_status_flags shows some pretty wild results:

throttle_status_flags: PPT1, PPT2, PPT3, FPPT, TDC_CVIP
throttle_status_flags: PPT1, PPT2, PPT3, FPPT, TDC_CVIP
throttle_status_flags: PPT0, PPT2, PPT3, FPPT, TDC_CVIP
throttle_status_flags: PPT1, PPT2, PPT3, FPPT, TDC_CVIP
throttle_status_flags: PPT1, PPT2, PPT3, FPPT, TDC_CVIP
throttle_status_flags: PPT1, PPT2, PPT3, FPPT, TDC_CVIP
throttle_status_flags: PPT1, PPT2, PPT3, FPPT, TDC_CVIP
throttle_status_flags: PPT0, PPT1, PPT2, PPT3, FPPT, TDC_CVIP
throttle_status_flags: PPT1, PPT2, PPT3, FPPT, TDC_CVIP
throttle_status_flags: PPT0, PPT1, PPT2, PPT3, FPPT, TDC_CVIP
throttle_status_flags: PPT1, PPT2, PPT3, FPPT, TDC_CVIP
throttle_status_flags: PPT1, PPT2, PPT3, FPPT, TDC_CVIP
throttle_status_flags: PPT1, PPT2, PPT3, FPPT, TDC_CVIP
throttle_status_flags: PPT0, PPT2, PPT3, FPPT, TDC_CVIP
throttle_status_flags: PPT1, PPT2, PPT3, FPPT, TDC_CVIP
throttle_status_flags: PPT0, PPT1, PPT2, PPT3, FPPT, TDC_CVIP
throttle_status_flags: PPT1, PPT2, PPT3, FPPT, TDC_CVIP
throttle_status_flags: PPT1, PPT2, PPT3, FPPT, TDC_CVIP
throttle_status_flags: PPT1, PPT2, PPT3, FPPT, TDC_CVIP
throttle_status_flags: PPT0, PPT2, PPT3, FPPT, TDC_CVIP
throttle_status_flags: PPT1, PPT2, PPT3, FPPT, TDC_CVIP
throttle_status_flags: PPT1, PPT2, PPT3, FPPT, TDC_CVIP
throttle_status_flags: PPT0, PPT2, PPT3, FPPT, TDC_CVIP
throttle_status_flags: PPT0, PPT2, PPT3, FPPT, TDC_CVIP
throttle_status_flags: PPT0, PPT1, PPT2, PPT3, FPPT, TDC_CVIP
throttle_status_flags: PPT1, PPT2, PPT3, FPPT, TDC_CVIP
throttle_status_flags: PPT1, PPT2, PPT3, FPPT, TDC_CVIP
throttle_status_flags: PPT1, PPT2, PPT3, FPPT, TDC_CVIP
throttle_status_flags: PPT0, PPT2, PPT3, FPPT, TDC_CVIP
throttle_status_flags: PPT1, PPT2, PPT3, FPPT, TDC_CVIP
throttle_status_flags: PPT1, PPT2, PPT3, FPPT, TDC_CVIP

^ This is while the system is sitting idle with temps and power usage wellllll within their limits:

Adapter: PCI adapter
vddgfx:      635.00 mV 
fan1:         557 RPM  (min =    0 RPM, max = 3300 RPM)
edge:         +32.0°C  (crit = +100.0°C, hyst = -273.1°C)
                       (emerg = +105.0°C)
junction:     +36.0°C  (crit = +110.0°C, hyst = -273.1°C)
                       (emerg = +115.0°C)
mem:          +52.0°C  (crit = +108.0°C, hyst = -273.1°C)
                       (emerg = +113.0°C)
PPT:          25.00 W  (cap = 339.00 W)
Adapter: PCI adapter
SVI2_Core:     1.10 V  
SVI2_SoC:      1.08 V  
Tdie:         +46.5°C  (high = +95.0°C)
Tctl:         +46.5°C  
Tccd1:        +44.2°C  
Tccd2:        +44.0°C  
SVI2_P_Core:  13.77 W  
SVI2_P_SoC:   17.83 W  
SVI2_C_Core:  12.52 A  
SVI2_C_SoC:   16.48 A  

From what I understand, the GPU is throttling due to code in the binary blobs, like maybe it's misreading the sensors on the card? This is a problem b/c sometimes the card runs full speed and is great, but most of the time it's running at like 50% (clock speeds and power draw) while the CPU (5900X) is not stressed at all (no bottleneck that I can see), which is why I suspect a buggy throttling system. The fans on the card are very quiet/slow, so it seems like if it really thought it was thermal throttling, that it would spin those faster?

@leuc
Copy link
Author

leuc commented Jan 5, 2023

@dlm21
thanks for testing!

what kernel version?

check the active power profile in more recent kernels

cat /sys/class/drm/card?/device/pp_power_profile_mode

The profile can also be changed:
https://kernel.org/doc/html/latest/gpu/amdgpu/thermal.html#pp-power-profile-mode

and this GPU monitoring tool is pretty useful too
https://github.com/Ricks-Lab/gpu-utils

@dlm21
Copy link

dlm21 commented Jan 5, 2023

Hi, sorry, I should have provided that bit, kernel is 6.1.2-3-cachyos (the default kernel here uses the BORE scheduler iirc)

The power profile is 0-BOOTUP_DEFAULT, which appears to have the max values the card supports, and looks like:

❯ cat /sys/class/drm/card?/device/pp_power_profile_mode
PROFILE_INDEX(NAME) CLOCK_TYPE(NAME) FPS MinActiveFreqType MinActiveFreq BoosterFreqType BoosterFreq PD_Data_limit_c PD_Data_error_coeff PD_Data_error_rate_coeff
 0 BOOTUP_DEFAULT*:
                    0(       GFXCLK)       0       1       0       4     800 4587520  -65536       0
                    1(         FCLK)       0       3       0       1       0 3276800  -65536   -6553

I've come to the conclusion that the lack of "maximum GO" on the GPU was simply due to specific CPU cores being maxed out, so while stats may have shown "15% CPU load", it was really saying "most of your cores are idle but holy hell one is going to need a medic" . There are some bits of data that seem off, even from the other repo you linked (thanks for that by the way, very good tools!)
I'm still seeing the throttling messages bounce around, but it doesn't appear to actually be throttling the card when forcing it into a high GPU/low CPU load state, so probably just a red herring. Anywho, if there's any other info you want for S&Gs, lemme know. Thanks for the tool ^_^

@leuc
Copy link
Author

leuc commented Jan 5, 2023

I suspect the "full" power profiles for 7900 XTX are simply not in the kernel tree ... yet.

For comparison, here is the power profile of a Navi 10 Radeon RX 5600 with kernel v6.0.9:

cat /sys/class/drm/card0/device/pp_power_profile_mode
PROFILE_INDEX(NAME) CLOCK_TYPE(NAME) FPS MinFreqType MinActiveFreqType MinActiveFreq BoosterFreqType BoosterFreq PD_Data_limit_c PD_Data_error_coeff PD_Data_error_rate_coeff
 0 BOOTUP_DEFAULT*:
                    0(       GFXCLK)       0       5       1       0       4     800 4587520  -65536       0
                    1(       SOCCLK)       0       5       1       0       3     800 1310720   -6553       0
                    2(        MEMLK)       0       5       1       0       4     800  327680  -65536       0
 1 3D_FULL_SCREEN :
                    0(       GFXCLK)       0       5       1       0       4     650 3932160   -6553  -65536
                    1(       SOCCLK)       0       5       1     850       4     800 1310720   -6553       0
                    2(        MEMLK)       0       5       4     850       4     800  327680  -65536       0
 2   POWER_SAVING :
                    0(       GFXCLK)       0       5       1       0       3       0 5898240  -65536       0
                    1(       SOCCLK)       0       5       1       0       3       0 1310720   -6553       0
                    2(        MEMLK)       0       5       1       0       3       0 1966080  -65536       0
 3          VIDEO :
                    0(       GFXCLK)       0       5       1       0       4     500 4587520  -65536       0
                    1(       SOCCLK)       0       5       1       0       4     500 1310720   -6553       0
                    2(        MEMLK)       0       5       1       0       4     500 1966080  -65536       0
 4             VR :
                    0(       GFXCLK)       0       5       4    1000       4     800 4587520  -65536       0
                    1(       SOCCLK)       0       5       1       0       4     800  327680  -65536       0
                    2(        MEMLK)       0       5       1       0       4     800  327680  -65536       0
 5        COMPUTE :
                    0(       GFXCLK)       0       5       4    1000       3       0 3932160  -65536  -65536
                    1(       SOCCLK)       0       5       4     850       3       0  327680  -65536  -32768
                    2(        MEMLK)       0       5       4     850       3       0  327680  -65536  -32768
 6         CUSTOM :
                    0(       GFXCLK)       0       5       1       0       4     800 4587520  -65536       0
                    1(       SOCCLK)       0       5       1       0       3     800 1310720   -6553       0
                    2(        MEMLK)       0       5       1       0       4     800  327680  -65536       0

@dlm21
Copy link

dlm21 commented Jan 5, 2023

Sorry, I was trying to save some space and only pasted the one that was in use, here's the full thing:

❯ cat /sys/class/drm/card0/device/pp_power_profile_mode

PROFILE_INDEX(NAME) CLOCK_TYPE(NAME) FPS MinActiveFreqType MinActiveFreq BoosterFreqType BoosterFreq PD_Data_limit_c PD_Data_error_coeff PD_Data_error_rate_coeff
 0 BOOTUP_DEFAULT*:
                    0(       GFXCLK)       0       1       0       4     800 4587520  -65536       0
                    1(         FCLK)       0       3       0       1       0 3276800  -65536   -6553
 1 3D_FULL_SCREEN :
                    0(       GFXCLK)       0       0    1200       4     650 3932160   -3276  -65536
                    1(         FCLK)       0       3       0       3       0 1310720   -6553   -6553
 2   POWER_SAVING :
                    0(       GFXCLK)       0       1       0       3       0 5898240  -65536       0
                    1(         FCLK)       0       1       0       1       0 3407872  -65536   -6553
 3          VIDEO :
                    0(       GFXCLK)       0       1       0       4     500 4587520  -65536       0
                    1(         FCLK)       0       1       0       1       0 3473408  -65536   -6553
 4             VR :
                    0(       GFXCLK)       0       2    1000       1       0 3276800       0       0
                    1(         FCLK)       0       3       0       3       0 1310720   -6553   -6553
 5        COMPUTE :
                    0(       GFXCLK)       0       2    1000       1       0 3932160       0       0
                    1(         FCLK)       0       3       0       3       0 1310720   -6553   -6553
 6         CUSTOM :
                    0(       GFXCLK)       0       1       0       4     800 4587520  -65536       0
                    1(         FCLK)       0       3       0       1       0 3276800  -65536   -6553

Not sure if the missing SOCCLK is expected.
It's "good enough for now", so I'll probably just wait for some more driver updates to come down the pipe and see what shakes out.

@leuc
Copy link
Author

leuc commented Jan 5, 2023

try manually setting the power profile to 3D_FULL_SCREEN

as root

echo manual > /sys/class/drm/card0/device/power_dpm_force_performance_level
echo "1" > /sys/class/drm/card0/device/pp_power_profile_mode

performance difference might only show with a real benchmark...

@dlm21
Copy link

dlm21 commented Jan 5, 2023

Thanks, I used the only benchmark I have installed atm, ffxiv endwalker, and it seemed to bump the score by a little bit, though it's mostly CPU bottlenecked. The main thing I noticed is that the GPU stayed at higher clocks and pulled max watts most of the time rather than dropping off as soon as possible. It still goes back to normal during idle / at the desktop, so I'll set this as the default for a while. Thanks!
I still see the ever-changing "throttle_status_flags", like PPT0, PPT1, PPT2, FPPT, TDC_CVIP, but I'm pretty sure those are red herrings and not really indicative of a real problem, or any actual throttling.

@Umio-Yasuno
Copy link

temperature_hbm, temperature_core, temperature_l3, average_core_power, current_coreclk, current_l3clk, and padding are arrays, but they show up as single values in the first output below.
Therefore, it looks like the data is misaligned.

> MetricsTableHeader
structure_size: 128
format_revision: 2
content_revision: 2
> GpuMetrics_v2_2
temperature_gfx: 3850
temperature_soc: 3925
temperature_core: 3850
temperature_l3: 3975
average_gfx_activity: 3875
average_mm_activity: 5250
system_clock_counter: 17287498960675
average_socket_power: 0
average_socket_power: 0
average_cpu_power: 62303
average_soc_power: 17492
average_gfx_power: 19098
average_core_power: 0
average_gfxclk_frequency: 11
average_socclk_frequency: 6978
average_uclk_frequency: 1744
average_fclk_frequency: 65535
average_vclk_frequency: 0
average_dclk_frequency: 353
current_gfxclk: 0
current_socclk: 8886
current_uclk: 351
current_fclk: 350
current_vclk: 341
current_dclk: 343
current_coreclk: 400
current_l3clk: 400
throttle_status: 104857599
fan_pwm: 400
padding: 65535
indep_throttle_status: 450359988533068176
throttle_status_flags: PPT0, PPT1, PPT2, PPT3, SPL, FPPT, SPPT, SPPT_APU, TDC_GFX, TDC_SOC, TDC_MEM, TDC_VDD, TDC_CVIP, EDC_CPU
V2_2(
    gpu_metrics_v2_2 {
        common_header: metrics_table_header {
            structure_size: 128,
            format_revision: 2,
            content_revision: 2,
        },
        temperature_gfx: 4050,
        temperature_soc: 3950,
        temperature_core: [
            3875,
            3925,
            3900,
            4000,
            4200,
            4000,
            5225,
            4050,
        ],
        temperature_l3: [
            4125,
            0,
        ],
        average_gfx_activity: 2,
        average_mm_activity: 0,
        system_clock_counter: 82267147835201,
        average_socket_power: 14,
        average_cpu_power: 8300,
        average_soc_power: 2587,
        average_gfx_power: 65535,
        average_core_power: [
            0,
            396,
            0,
            403,
            380,
            339,
            4438,
            471,
        ],
        average_gfxclk_frequency: 401,
        average_socclk_frequency: 401,
        average_uclk_frequency: 65535,
        average_fclk_frequency: 1599,
        average_vclk_frequency: 400,
        average_dclk_frequency: 65535,
        current_gfxclk: 1900,
        current_socclk: 975,
        current_uclk: 6,
        current_fclk: 1600,
        current_vclk: 400,
        current_dclk: 400,
        current_coreclk: [
            0,
            3560,
            0,
            3560,
            3560,
            3560,
            4450,
            3560,
        ],
        current_l3clk: [
            4450,
            0,
        ],
        throttle_status: 0,
        fan_pwm: 0,
        padding: [
            65535,
            65535,
            65535,
        ],
        indep_throttle_status: 0,
    },
)

https://github.com/Umio-Yasuno/libdrm-amdgpu-sys-rs/blob/main/examples/gpu_metrics.rs

@Umio-Yasuno
Copy link

@leuc amdgpu_metrics.py will give partially incorrect results because _pack_ = 1 is not set.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment