Skip to content

Instantly share code, notes, and snippets.

@bluecmd
Created May 7, 2024 17:32
Show Gist options
  • Save bluecmd/4ae181225a25367ce5187672c999e0f0 to your computer and use it in GitHub Desktop.
Save bluecmd/4ae181225a25367ce5187672c999e0f0 to your computer and use it in GitHub Desktop.
Some SONIX SONiC Prometheus mods
#!/usr/bin/env python3
import collections
from redis.client import StrictRedis
import os
import time
# SAI counters that are exported one-to-one under a Prometheus series name.
SIMPLE_METRICS = {
    'SAI_PORT_STAT_ETHER_RX_OVERSIZE_PKTS': 'sai_port_rx_oversize_packets_total',
    'SAI_PORT_STAT_ETHER_STATS_FRAGMENTS': 'sai_port_fragment_packets_total',
    'SAI_PORT_STAT_ETHER_STATS_JABBERS': 'sai_port_jabbers_packets_total',
    'SAI_PORT_STAT_ETHER_STATS_TX_NO_ERRORS': 'sai_port_tx_packets_no_errors_total',
    'SAI_PORT_STAT_ETHER_STATS_UNDERSIZE_PKTS': 'sai_port_undersized_packets_total',
    'SAI_PORT_STAT_ETHER_TX_OVERSIZE_PKTS': 'sai_port_tx_oversized_packets_total',
    'SAI_PORT_STAT_IF_IN_BROADCAST_PKTS': 'sai_port_in_broadcast_packets_total',
    'SAI_PORT_STAT_IF_IN_DISCARDS': 'sai_port_in_discarded_packets_total',
    'SAI_PORT_STAT_IF_IN_ERRORS': 'sai_port_in_errored_packets_total',
    'SAI_PORT_STAT_IF_IN_MULTICAST_PKTS': 'sai_port_in_multicast_packets_total',
    'SAI_PORT_STAT_IF_IN_NON_UCAST_PKTS': 'sai_port_in_non_unicast_packets_total',
    'SAI_PORT_STAT_IF_IN_UCAST_PKTS': 'sai_port_in_unicast_packets_total',
    'SAI_PORT_STAT_IF_IN_UNKNOWN_PROTOS': 'sai_port_in_unknown_protocols_total',
    'SAI_PORT_STAT_IF_OUT_BROADCAST_PKTS': 'sai_port_out_broadcast_packets_total',
    'SAI_PORT_STAT_IF_OUT_DISCARDS': 'sai_port_out_discarded_packets_total',
    'SAI_PORT_STAT_IF_OUT_ERRORS': 'sai_port_out_errored_packets_total',
    'SAI_PORT_STAT_IF_OUT_MULTICAST_PKTS': 'sai_port_out_multicast_packets_total',
    'SAI_PORT_STAT_IF_OUT_NON_UCAST_PKTS': 'sai_port_out_non_unicast_packets_total',
    'SAI_PORT_STAT_IF_OUT_QLEN': 'sai_port_out_queue_length',
    'SAI_PORT_STAT_IF_OUT_UCAST_PKTS': 'sai_port_out_unicast_packets_total',
    'SAI_PORT_STAT_IP_IN_RECEIVES': 'sai_port_ip_in_packets_total',
    'SAI_PORT_STAT_IP_IN_UCAST_PKTS': 'sai_port_ip_in_unicast_packets_total',
    'SAI_PORT_STAT_PAUSE_RX_PKTS': 'sai_port_rx_pause_frames_total',
    'SAI_PORT_STAT_PAUSE_TX_PKTS': 'sai_port_tx_pause_frames_total',
    'SAI_PORT_STAT_OUT_DROPPED_PKTS': 'sai_port_out_dropped_packets_total',
    'SAI_PORT_STAT_IN_DROPPED_PKTS': 'sai_port_in_dropped_packets_total',
    'SAI_PORT_STAT_IF_IN_FEC_SYMBOL_ERRORS': 'sai_port_in_fec_symbol_errors_total',
    'SAI_PORT_STAT_IF_IN_FEC_NOT_CORRECTABLE_FRAMES': 'sai_port_in_fec_not_correctable_frames_total',
    'SAI_PORT_STAT_IF_IN_FEC_CORRECTABLE_FRAMES': 'sai_port_in_fec_correctable_frames_total',
}

# Per-size-range packet counters: SAI counter prefix -> histogram base name.
HISTOGRAM_METRICS = {
    'SAI_PORT_STAT_ETHER_IN_PKTS_': 'sai_port_in_packet_size_bytes',
    'SAI_PORT_STAT_ETHER_OUT_PKTS_': 'sai_port_out_packet_size_bytes',
}

# Histogram buckets as (size-range key, Prometheus `le` bound), smallest first.
# The largest range is exported as the mandatory "+Inf" bucket.
PACKET_SIZE_BUCKETS = (
    ('64', 64),
    ('65:127', 127),
    ('128:255', 255),
    ('256:511', 511),
    ('512:1023', 1023),
    ('1024:1518', 1518),
    ('1519:2047', 2047),
    ('2048:4095', 4095),
    ('4096:9216', 9216),
    ('9217:16383', '+Inf'),
)

# Aggregated PFC counters: internal key -> (series name, HELP counter pattern).
PFC_METRICS = {
    'SAI_PORT_STAT_PFC_RX_PKTS': ('sai_port_rx_pfc_packets_total', 'SAI_PORT_STAT_PFC_*_RX_PKTS'),
    'SAI_PORT_STAT_PFC_TX_PKTS': ('sai_port_tx_pfc_packets_total', 'SAI_PORT_STAT_PFC_*_TX_PKTS'),
}

# Octet counters double as the `_sum` series of the packet-size histograms.
OCTET_SUM_METRICS = {
    'SAI_PORT_STAT_IF_IN_OCTETS': 'sai_port_in_packet_size_bytes_sum',
    'SAI_PORT_STAT_IF_OUT_OCTETS': 'sai_port_out_packet_size_bytes_sum',
}

PFC_PREFIX = 'SAI_PORT_STAT_PFC_'


def classify_counter(cntr):
    """Map a raw SAI counter name to its ``(metric, sub-key)`` slot.

    * Size-range counters (``..._PKTS_65_TO_127_OCTETS``) are grouped under
      their prefix with the range as sub-key (``'65:127'``).
    * PFC counters are grouped into an RX or a TX metric, with the character
      following the ``SAI_PORT_STAT_PFC_`` prefix as sub-key (the priority).
    * Every other counter keeps its own name and uses sub-key ``0``.
    """
    for prefix in HISTOGRAM_METRICS:
        if cntr.startswith(prefix):
            size_range = cntr[len(prefix):-len('_OCTETS')]
            return prefix, size_range.replace('_TO_', ':')
    if cntr.startswith(PFC_PREFIX):
        direction = 'RX' if '_RX_' in cntr else 'TX'
        return f'SAI_PORT_STAT_PFC_{direction}_PKTS', cntr[len(PFC_PREFIX)]
    return cntr, 0


def collect_metrics(counters_db, appl_db):
    """Read all port counters and return ``metrics[metric][sub-key][interface]``.

    ``counters_db`` must provide ``hgetall`` and ``appl_db`` must provide
    ``keys`` (the script passes Redis clients for db 2 and db 0).
    """
    metrics = collections.defaultdict(lambda: collections.defaultdict(collections.Counter))
    port_oids = counters_db.hgetall('COUNTERS_PORT_NAME_MAP')
    lag_oids = counters_db.hgetall('COUNTERS_LAG_NAME_MAP')
    for intf, oid in list(port_oids.items()) + list(lag_oids.items()):
        if not intf:
            continue
        if intf.startswith('PortChannel'):
            # TODO: There seem to be no SAI counters for LAGs, so we emulate
            # them by summing the counters of all member interfaces. This
            # breaks if a member resets to zero without the others doing the
            # same.
            oids = []
            for member_rec in appl_db.keys(f'LAG_MEMBER_TABLE:{intf}:Ethernet*'):
                member_oid = port_oids.get(member_rec.split(':')[2])
                # A member missing from the port map is skipped rather than
                # aborting the whole export with a KeyError.
                if member_oid is not None:
                    oids.append(member_oid)
        else:
            oids = [oid]
        for counter_oid in oids:
            for cntr, value in counters_db.hgetall('COUNTERS:' + counter_oid).items():
                metric, sub_key = classify_counter(cntr)
                metrics[metric][sub_key][intf] += int(value)
    return metrics


def format_samples(name, values, **labels):
    """Return one exposition line per interface in ``values``.

    ``labels`` are rendered first, followed by the ``interface`` label.
    """
    lines = []
    for interface, val in values.items():
        pairs = dict(labels, interface=interface)
        rendered = ','.join(f'{k}="{v}"' for k, v in pairs.items())
        lines.append(f'{name}{{{rendered}}} {val}')
    return lines


def render_metrics(metrics):
    """Render ``metrics`` (as built by ``collect_metrics``) to exposition text.

    Counters without a known mapping are reported on stdout and left out of
    the returned text.
    """
    lines = []
    for metric, values in metrics.items():
        lines.append('')
        if metric in SIMPLE_METRICS:
            name = SIMPLE_METRICS[metric]
            lines.append(f'# HELP {name} SAI metric {metric}')
            lines.extend(format_samples(name, values[0]))
        elif metric in PFC_METRICS:
            name, pattern = PFC_METRICS[metric]
            lines.append(f'# HELP {name} SAI metric for {pattern}')
            for prio, per_interface in values.items():
                lines.extend(format_samples(name, per_interface, priority=prio))
        elif metric in HISTOGRAM_METRICS:
            name = HISTOGRAM_METRICS[metric]
            lines.append(f'# HELP {name} SAI metric for {metric}*_OCTETS')
            lines.append(f'# TYPE {name} histogram')
            # Seed every interface at zero and accumulate with update():
            # `Counter + Counter` drops entries whose count is not positive,
            # which made idle interfaces vanish from the bucket series.
            running = collections.Counter(
                {intf: 0 for per_range in values.values() for intf in per_range})
            for size_range, upper in PACKET_SIZE_BUCKETS:
                running.update(values.get(size_range, {}))
                lines.extend(format_samples(f'{name}_bucket', running, le=upper))
            lines.extend(format_samples(f'{name}_count', running))
        elif metric in OCTET_SUM_METRICS:
            lines.extend(format_samples(OCTET_SUM_METRICS[metric], values[0]))
        else:
            print('Unknown metric:', metric)
    return ''.join(line + '\n' for line in lines)


def export_forever(interval=5):
    """Rewrite ``sai.prom`` from the counters database every ``interval`` seconds."""
    # The clients are created once and reused for every refresh.
    counters_db = StrictRedis(db=2, decode_responses=True)
    appl_db = StrictRedis(db=0, decode_responses=True)
    while True:
        text = render_metrics(collect_metrics(counters_db, appl_db))
        # Write to a scratch file and rename it over the target so a reader
        # never sees a partially written file.
        with open('.sai.prom.new', 'w') as f:
            f.write(text)
        os.rename('.sai.prom.new', 'sai.prom')
        time.sleep(interval)


if __name__ == '__main__':
    export_forever()
#!/usr/bin/env python3
import collections
import json
import re
from natsort import natsorted
from utilities_common import db as sonic_db
# Threshold properties as (key, unit).
THRESHOLD_FIELDS = (
    ('temphighalarm', 'C'),
    ('temphighwarning', 'C'),
    ('rxpowerhighalarm', 'dBm'),
    ('rxpowerlowalarm', 'dBm'),
    ('rxpowerhighwarning', 'dBm'),
    ('rxpowerlowwarning', 'dBm'),
)

# DOM sensor properties as (key, unit, lane); lane is None for module-wide
# readings. On older SONiC the thresholds are in TRANSCEIVER_DOM_SENSOR too,
# so they are looked up there as well.
DOM_SENSOR_FIELDS = tuple(
    [('temperature', 'C', None), ('voltage', 'V', None)]
    + [(f'rx{lane}power', 'dBm', lane) for lane in range(1, 9)]
    + [(f'tx{lane}power', 'dBm', lane) for lane in range(1, 9)]
    + [(f'tx{lane}bias', 'mA', lane) for lane in range(1, 9)]
    + [(key, unit, None) for key, unit in THRESHOLD_FIELDS]
)

# Strings that are treated as "no reading available".
MISSING_VALUES = frozenset(('None', 'N/A', 'Unknown', 'Off'))


def escape_label_value(value):
    """Escape ``value`` for use inside a quoted Prometheus label value."""
    text = str(value)
    return text.replace('\\', '\\\\').replace('"', '\\"').replace('\n', '\\n')


def parse_reading(raw):
    """Return ``raw`` as a float, or None when it is missing or not numeric."""
    if raw is None or raw in MISSING_VALUES:
        return None
    try:
        return float(raw)
    except (TypeError, ValueError):
        return None


def compliance_type(sfp_info):
    """Extract the compliance string from ``specification_compliance``.

    The field is expected to hold a dict rendered with single quotes; None is
    returned when it cannot be decoded into a JSON object.
    """
    spec_raw = sfp_info.get('specification_compliance', '').replace('\'', '"')
    try:
        spec = json.loads(spec_raw)
    except json.JSONDecodeError:
        return None
    if not isinstance(spec, dict):
        return None
    return (
        spec.get('Extended Specification Compliance') or
        spec.get('10/40G Ethernet Compliance Code'))


def media_lane_count(sfp_info, sfp_dom):
    """Return the number of active media lanes as a float, or None if unknown."""
    raw = None
    # CMIS modules publish how many optical lanes they have, or 0 if they are
    # a DAC. Sadly, sometimes SONiC garbles data and believes the modules are
    # CMIS when they are not, so also check if the module is a QSFP-DD for now.
    if sfp_info and sfp_info.get('type', '').startswith('QSFP-DD'):
        raw = sfp_info.get('media_lane_count', None)
        if raw == "0":
            # CMIS DAC cable, use the number of host lanes as the media lanes
            raw = sfp_info.get('host_lane_count', None)
    if sfp_dom and raw is None:
        # Let's guess!
        # In order to do this "correctly" we'd have to implement the table 4-6
        # in SFF-8024 and keep it up-to-date. That's annoying, so count the RX
        # power readings that carry a value instead.
        raw = len([k for k in sfp_dom
                   if k.startswith('rx') and k.endswith('power') and sfp_dom.get(k) != 'N/A'])
    if not raw:
        # None, an empty string and a guessed count of 0 all mean "unknown".
        return None
    try:
        return float(raw)
    except (TypeError, ValueError):
        return None


def sfp_metric_lines(db, port_keys):
    """Yield Prometheus exposition lines for the transceivers of ``port_keys``.

    ``db`` must provide ``APPL_DB``, ``STATE_DB``, ``get`` and ``get_all``;
    ``port_keys`` are ``PORT_TABLE:<interface>`` keys in the order in which the
    ports should be reported.
    """
    # Lane heuristic for multi-lane optics that are broken out:
    # 1) Group all ports by "index" (i.e. the SFP port index).
    # 2) Keep each group in the (numerically sorted) order of port_keys.
    # 3) The position of a port inside its group selects its lane.
    port_index = {key: db.get(db.APPL_DB, key, 'index') for key in port_keys}
    port_to_interfaces = collections.defaultdict(list)
    for key in port_keys:
        port_to_interfaces[port_index[key]].append(key)

    yield '# HELP sfp_model SFP model number'
    yield '# TYPE sfp_model gauge'
    yield '# HELP sfp_eeprom_value SFP metric'
    yield '# TYPE sfp_eeprom_value gauge'
    yield '# HELP sfp_media_lane_count Number of media (e.g. optical) lanes active'
    yield '# TYPE sfp_media_lane_count gauge'

    for key in port_keys:
        interface = re.split(':', key, maxsplit=1)[-1].strip()
        label = escape_label_value(interface)
        sfp_info = db.get_all(db.STATE_DB, 'TRANSCEIVER_INFO|{}'.format(interface))
        sfp_dom = db.get_all(db.STATE_DB, 'TRANSCEIVER_DOM_SENSOR|{}'.format(interface))
        sfp_thres = db.get_all(db.STATE_DB, 'TRANSCEIVER_DOM_THRESHOLD|{}'.format(interface))

        if sfp_info is not None:
            yield 'sfp_model{interface="%s",manufacturer="%s",model="%s",connector="%s",ethernet="%s"} 1.0' % (
                label,
                escape_label_value(sfp_info.get('manufacturer', '').strip()),
                escape_label_value(sfp_info.get('model', '').strip()),
                escape_label_value(sfp_info.get('connector', 'Unknown')),
                escape_label_value(compliance_type(sfp_info) or 'Unknown'))

        lanes = media_lane_count(sfp_info, sfp_dom)
        yield 'sfp_media_lane_count{interface="%s"} %.1f' % (
            label, float('nan') if lanes is None else lanes)

        if sfp_dom is not None:
            # If there are multiple Ethernet* on the same SFP, we consider it
            # broken out and only mark the lane matching this port's position
            # as active. If we ever need to support breakouts consuming
            # multiple lanes, this will break.
            hw_port = port_to_interfaces[port_index[key]]
            is_breakout = len(hw_port) > 1
            our_lane = hw_port.index(key) + 1 if is_breakout else None
            for domkey, unit, chan in DOM_SENSOR_FIELDS:
                value = parse_reading(sfp_dom.get(domkey, None))
                if value is None:
                    continue
                # Lanes beyond the module's lane count are dropped; nothing is
                # dropped while the lane count is unknown.
                if chan and lanes is not None and chan > lanes:
                    continue
                prop_active = not is_breakout or bool(chan and chan == our_lane)
                yield 'sfp_eeprom_value{interface="%s",property="%s",unit="%s",active="%s"} %f' % (
                    label, domkey, unit, 'true' if prop_active else 'false', value)

        if sfp_thres is not None:
            for domkey, unit in THRESHOLD_FIELDS:
                value = parse_reading(sfp_thres.get(domkey, None))
                if value is None:
                    continue
                yield 'sfp_eeprom_value{interface="%s",property="%s",unit="%s",active="true"} %f' % (
                    label, domkey, unit, value)


if __name__ == '__main__':
    sonic = sonic_db.Db()
    sorted_table_keys = natsorted(sonic.db.keys(sonic.db.APPL_DB, "PORT_TABLE:Ethernet*"))
    for line in sfp_metric_lines(sonic.db, sorted_table_keys):
        print(line)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment