Created
May 7, 2024 17:32
-
-
Save bluecmd/4ae181225a25367ce5187672c999e0f0 to your computer and use it in GitHub Desktop.
Some SONIX SONiC Prometheus mods
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import collections | |
from redis.client import StrictRedis | |
import os | |
import time | |
# SAI counters that map one-to-one onto a Prometheus metric labelled only by
# interface: raw SAI counter name -> exported metric name.
SIMPLE_METRICS = {
    'SAI_PORT_STAT_ETHER_RX_OVERSIZE_PKTS': 'sai_port_rx_oversize_packets_total',
    'SAI_PORT_STAT_ETHER_STATS_FRAGMENTS': 'sai_port_fragment_packets_total',
    'SAI_PORT_STAT_ETHER_STATS_JABBERS': 'sai_port_jabbers_packets_total',
    'SAI_PORT_STAT_ETHER_STATS_TX_NO_ERRORS': 'sai_port_tx_packets_no_errors_total',
    'SAI_PORT_STAT_ETHER_STATS_UNDERSIZE_PKTS': 'sai_port_undersized_packets_total',
    'SAI_PORT_STAT_ETHER_TX_OVERSIZE_PKTS': 'sai_port_tx_oversized_packets_total',
    'SAI_PORT_STAT_IF_IN_BROADCAST_PKTS': 'sai_port_in_broadcast_packets_total',
    'SAI_PORT_STAT_IF_IN_DISCARDS': 'sai_port_in_discarded_packets_total',
    'SAI_PORT_STAT_IF_IN_ERRORS': 'sai_port_in_errored_packets_total',
    'SAI_PORT_STAT_IF_IN_MULTICAST_PKTS': 'sai_port_in_multicast_packets_total',
    'SAI_PORT_STAT_IF_IN_NON_UCAST_PKTS': 'sai_port_in_non_unicast_packets_total',
    'SAI_PORT_STAT_IF_IN_UCAST_PKTS': 'sai_port_in_unicast_packets_total',
    'SAI_PORT_STAT_IF_IN_UNKNOWN_PROTOS': 'sai_port_in_unknown_protocols_total',
    'SAI_PORT_STAT_IF_OUT_BROADCAST_PKTS': 'sai_port_out_broadcast_packets_total',
    'SAI_PORT_STAT_IF_OUT_DISCARDS': 'sai_port_out_discarded_packets_total',
    'SAI_PORT_STAT_IF_OUT_ERRORS': 'sai_port_out_errored_packets_total',
    'SAI_PORT_STAT_IF_OUT_MULTICAST_PKTS': 'sai_port_out_multicast_packets_total',
    'SAI_PORT_STAT_IF_OUT_NON_UCAST_PKTS': 'sai_port_out_non_unicast_packets_total',
    'SAI_PORT_STAT_IF_OUT_QLEN': 'sai_port_out_queue_length',
    'SAI_PORT_STAT_IF_OUT_UCAST_PKTS': 'sai_port_out_unicast_packets_total',
    'SAI_PORT_STAT_IP_IN_RECEIVES': 'sai_port_ip_in_packets_total',
    'SAI_PORT_STAT_IP_IN_UCAST_PKTS': 'sai_port_ip_in_unicast_packets_total',
    'SAI_PORT_STAT_PAUSE_RX_PKTS': 'sai_port_rx_pause_frames_total',
    'SAI_PORT_STAT_PAUSE_TX_PKTS': 'sai_port_tx_pause_frames_total',
    'SAI_PORT_STAT_OUT_DROPPED_PKTS': 'sai_port_out_dropped_packets_total',
    'SAI_PORT_STAT_IN_DROPPED_PKTS': 'sai_port_in_dropped_packets_total',
    'SAI_PORT_STAT_IF_IN_FEC_SYMBOL_ERRORS': 'sai_port_in_fec_symbol_errors_total',
    'SAI_PORT_STAT_IF_IN_FEC_NOT_CORRECTABLE_FRAMES': 'sai_port_in_fec_not_correctable_frames_total',
    'SAI_PORT_STAT_IF_IN_FEC_CORRECTABLE_FRAMES': 'sai_port_in_fec_correctable_frames_total',
}

# Counter-name prefixes that get folded into a labelled metric family.
ETHER_IN_PREFIX = 'SAI_PORT_STAT_ETHER_IN_PKTS_'
ETHER_OUT_PREFIX = 'SAI_PORT_STAT_ETHER_OUT_PKTS_'
PFC_PREFIX = 'SAI_PORT_STAT_PFC_'
OCTETS_SUFFIX = '_OCTETS'

# Packet-size classes in ascending order: (size-class key as stored by
# collect_metrics, value of the histogram "le" label for that class).
PACKET_SIZE_BUCKETS = (
    ('64', 64),
    ('65:127', 127),
    ('128:255', 255),
    ('256:511', 511),
    ('512:1023', 1023),
    ('1024:1518', 1518),
    ('1519:2047', 2047),
    ('2048:4095', 4095),
    ('4096:9216', 9216),
    ('9217:16383', '+Inf'),
)


def collect_metrics(r, appldb):
    """Read all port counters and group them as metrics[family][label][interface].

    Args:
        r: Redis client used for the COUNTERS_PORT_NAME_MAP,
            COUNTERS_LAG_NAME_MAP and COUNTERS:<oid> hashes.
        appldb: Redis client used to list LAG_MEMBER_TABLE keys.

    Returns:
        A nested defaultdict. The first key is either a raw SAI counter name
        (with the single label key 0) or one of the folded families
        (packet-size classes keyed by size range, PFC keyed by priority).
        The innermost value is a Counter mapping interface name to the summed
        integer counter value.

    Raises:
        KeyError: if a LAG member is missing from COUNTERS_PORT_NAME_MAP.
        ValueError: if a counter value is not an integer string.
    """
    metrics = collections.defaultdict(lambda: collections.defaultdict(collections.Counter))
    intf_oid_dict = r.hgetall('COUNTERS_PORT_NAME_MAP')
    for intf, oid in list(intf_oid_dict.items()) + list(r.hgetall('COUNTERS_LAG_NAME_MAP').items()):
        if not intf:
            continue
        if intf.startswith('PortChannel'):
            # TODO: There seems to be no SAI counters for LAGs, so we emulate them by
            #       summing all the underlying interface counters. This will break if
            #       any of the underlying interfaces reset to zero without all of them
            #       doing the same. But hopefully that never happens. lol
            oids = [
                intf_oid_dict[member_rec.split(':')[2]]
                for member_rec in appldb.keys(f'LAG_MEMBER_TABLE:{intf}:Ethernet*')
            ]
        else:
            oids = [oid]
        for source_oid in oids:
            for cntr, value in r.hgetall('COUNTERS:' + source_oid).items():
                count = int(value)
                if cntr.startswith(ETHER_IN_PREFIX):
                    # e.g. ..._65_TO_127_OCTETS -> '65:127'
                    suf = cntr[len(ETHER_IN_PREFIX):-len(OCTETS_SUFFIX)].replace('_TO_', ':')
                    metrics[ETHER_IN_PREFIX][suf][intf] += count
                elif cntr.startswith(ETHER_OUT_PREFIX):
                    suf = cntr[len(ETHER_OUT_PREFIX):-len(OCTETS_SUFFIX)].replace('_TO_', ':')
                    metrics[ETHER_OUT_PREFIX][suf][intf] += count
                elif cntr.startswith(PFC_PREFIX):
                    # The character right after the prefix is used as the priority label.
                    prio = cntr[len(PFC_PREFIX)]
                    # Any PFC counter without '_RX_' in its name is accounted as TX.
                    if '_RX_' in cntr:
                        metrics['SAI_PORT_STAT_PFC_RX_PKTS'][prio][intf] += count
                    else:
                        metrics['SAI_PORT_STAT_PFC_TX_PKTS'][prio][intf] += count
                else:
                    metrics[cntr][0][intf] += count
    return metrics


def emit(f, name, values, **labels):
    """Write one sample line per interface in `values` to the file `f`.

    Extra keyword arguments become labels; the `interface` label is appended
    after them (or overwritten in place if the caller passed one).
    """
    for interface, val in values.items():
        labels['interface'] = interface
        rendered = ','.join('%s="%s"' % (k, str(v)) for k, v in labels.items())
        print('%s{%s} %s' % (name, rendered, val), file=f)


def emit_histogram(f, name, sai_pattern, buckets):
    """Write a cumulative packet-size histogram (`_bucket` and `_count` series).

    Args:
        f: Output file.
        name: Base metric name; `_bucket` / `_count` are appended.
        sai_pattern: SAI counter pattern quoted in the HELP line.
        buckets: Mapping of size-class key -> Counter(interface -> packets).
    """
    print('# HELP %s SAI metric for %s' % (name, sai_pattern), file=f)
    print('# TYPE %s histogram' % name, file=f)
    # Seed every known interface with 0 and accumulate with update():
    # `Counter + Counter` drops entries whose count is not positive, which
    # used to make idle interfaces (or empty low buckets) vanish from the
    # output instead of being reported as 0.
    cumulative = collections.Counter(
        {interface: 0 for per_interface in buckets.values() for interface in per_interface})
    for size_class, upper_bound in PACKET_SIZE_BUCKETS:
        cumulative.update(buckets.get(size_class, {}))
        emit(f, name + '_bucket', cumulative, le=upper_bound)
    emit(f, name + '_count', cumulative)


def write_metrics(metrics, f):
    """Render the structure returned by collect_metrics() into the file `f`.

    Families that are neither in SIMPLE_METRICS nor handled explicitly are
    reported on stdout as 'Unknown metric' and not written to `f`.
    """
    for metric, values in metrics.items():
        print(file=f)
        if metric in SIMPLE_METRICS:
            print('# HELP', SIMPLE_METRICS[metric], 'SAI metric', metric, file=f)
            emit(f, SIMPLE_METRICS[metric], values[0])
        elif metric == 'SAI_PORT_STAT_PFC_RX_PKTS':
            print('# HELP sai_port_rx_pfc_packets_total SAI metric for SAI_PORT_STAT_PFC_*_RX_PKTS', file=f)
            for prio, innervals in values.items():
                emit(f, 'sai_port_rx_pfc_packets_total', innervals, priority=prio)
        elif metric == 'SAI_PORT_STAT_PFC_TX_PKTS':
            print('# HELP sai_port_tx_pfc_packets_total SAI metric for SAI_PORT_STAT_PFC_*_TX_PKTS', file=f)
            for prio, innervals in values.items():
                emit(f, 'sai_port_tx_pfc_packets_total', innervals, priority=prio)
        elif metric == ETHER_IN_PREFIX:
            emit_histogram(f, 'sai_port_in_packet_size_bytes',
                           'SAI_PORT_STAT_ETHER_IN_PKTS_*_OCTETS', values)
        elif metric == ETHER_OUT_PREFIX:
            emit_histogram(f, 'sai_port_out_packet_size_bytes',
                           'SAI_PORT_STAT_ETHER_OUT_PKTS_*_OCTETS', values)
        elif metric == 'SAI_PORT_STAT_IF_IN_OCTETS':
            emit(f, 'sai_port_in_packet_size_bytes_sum', values[0])
        elif metric == 'SAI_PORT_STAT_IF_OUT_OCTETS':
            emit(f, 'sai_port_out_packet_size_bytes_sum', values[0])
        else:
            print('Unknown metric:', metric)


def main():
    """Rewrite sai.prom from the Redis counters every 5 seconds, forever."""
    while True:
        with open('.sai.prom.new', 'w') as f:
            r = StrictRedis(db=2, decode_responses=True)
            appldb = StrictRedis(db=0, decode_responses=True)
            write_metrics(collect_metrics(r, appldb), f)
        # The temp file is closed (and therefore flushed) before it is moved
        # into place, so readers never see a partially written sai.prom.
        os.rename('.sai.prom.new', 'sai.prom')
        time.sleep(5)


if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import collections | |
import json | |
import re | |
from natsort import natsorted | |
from utilities_common import db as sonic_db | |
# Placeholder strings the transceiver tables use instead of a number.
NON_NUMERIC_VALUES = ('None', 'N/A', 'Unknown', 'Off')

# Threshold properties exported as sfp_eeprom_value: (key, unit).
DOM_THRESHOLD_PROPERTIES = (
    ('temphighalarm', 'C'),
    ('temphighwarning', 'C'),
    ('rxpowerhighalarm', 'dBm'),
    ('rxpowerlowalarm', 'dBm'),
    ('rxpowerhighwarning', 'dBm'),
    ('rxpowerlowwarning', 'dBm'),
)

# DOM sensor properties exported as sfp_eeprom_value: (key, unit, lane).
# `lane` is the 1-based media lane the property belongs to, or None for
# module-wide properties. Order: temperature, voltage, rx1..8power,
# tx1..8power, tx1..8bias, then the thresholds.
DOM_SENSOR_PROPERTIES = (
    ('temperature', 'C', None),
    ('voltage', 'V', None),
) + tuple(
    ('%s%d%s' % (direction, lane, kind), unit, lane)
    for direction, kind, unit in (('rx', 'power', 'dBm'), ('tx', 'power', 'dBm'), ('tx', 'bias', 'mA'))
    for lane in range(1, 9)
) + tuple(
    # On older SONiC thresholds are in TRANSCEIVER_DOM_SENSOR
    (domkey, unit, None) for domkey, unit in DOM_THRESHOLD_PROPERTIES
)


def escape_label_value(value):
    """Escape backslash, double quote and newline for a Prometheus label value."""
    return str(value).replace('\\', '\\\\').replace('"', '\\"').replace('\n', '\\n')


def to_float(raw):
    """Return `raw` as a float, or None if it is missing or not numeric."""
    if raw is None or raw in NON_NUMERIC_VALUES:
        return None
    try:
        return float(raw)
    except (TypeError, ValueError):
        # Any other non-numeric placeholder: skip the sample instead of
        # aborting the whole export.
        return None


def parse_spec_type(sfp_info):
    """Extract the Ethernet compliance string from `specification_compliance`.

    The field is expected to hold a dict rendered with single quotes; returns
    None when it cannot be parsed into a dict or holds neither known key.
    """
    spec_raw = sfp_info.get('specification_compliance', '').replace('\'', '"')
    try:
        spec = json.loads(spec_raw)
    except json.decoder.JSONDecodeError:
        return None
    if not isinstance(spec, dict):
        return None
    return (
        spec.get('Extended Specification Compliance') or
        spec.get('10/40G Ethernet Compliance Code'))


def detect_media_lane_count(sfp_info, sfp_dom):
    """Return the number of active media lanes as a float, NaN if unknown."""
    # CMIS modules publish how many optical lanes they have, or 0 if they are a DAC
    # Sadly, sometimes SONiC garbles data and believes the modules are CMIS when they are not, so
    # also check if the module is a QSFP-DD for now
    count = None
    if sfp_info and sfp_info.get('type', '').startswith('QSFP-DD'):
        count = sfp_info.get('media_lane_count', None)
        if count == "0":
            # CMIS DAC cable, use the number of host lanes as the media lanes
            count = sfp_info.get('host_lane_count', None)
    if sfp_dom and count is None:
        # Let's guess!
        # In order to do this "correctly" we'd have to implement the table 4-6 in SFF-8024 and
        # keep it up-to-date. That's annoying, so let's just guess for now.
        count = sum(
            1 for name, value in sfp_dom.items()
            if name.startswith('rx') and name.endswith('power') and value != 'N/A')
    if not count:
        # None, '' and 0 all mean "unknown".
        return float('nan')
    try:
        return float(count)
    except (TypeError, ValueError):
        # A non-numeric lane count in the DB is reported as unknown.
        return float('nan')


def render_interface(interface, key, siblings, sfp_info, sfp_dom, sfp_thres):
    """Build the exposition lines for one interface.

    Args:
        interface: Interface name used for the `interface` label.
        key: The PORT_TABLE key of this interface.
        siblings: Naturally sorted PORT_TABLE keys sharing this interface's
            SFP index; must contain `key`.
        sfp_info: TRANSCEIVER_INFO fields, or None.
        sfp_dom: TRANSCEIVER_DOM_SENSOR fields, or None.
        sfp_thres: TRANSCEIVER_DOM_THRESHOLD fields, or None.

    Returns:
        A list of sample lines, without trailing newlines.
    """
    lines = []
    label = escape_label_value(interface)
    if sfp_info is not None:
        lines.append('sfp_model{interface="%s",manufacturer="%s",model="%s",connector="%s",ethernet="%s"} 1.0' % (
            label,
            escape_label_value(sfp_info.get('manufacturer', '').strip()),
            escape_label_value(sfp_info.get('model', '').strip()),
            escape_label_value(sfp_info.get('connector', 'Unknown')),
            escape_label_value(parse_spec_type(sfp_info) or 'Unknown')))

    lane_count = detect_media_lane_count(sfp_info, sfp_dom)
    lines.append('sfp_media_lane_count{interface="%s"} %.1f' % (label, lane_count))

    if sfp_dom is not None:
        # If there are multiple Ethernet* on the same SFP, we consider it broken out, so we filter what lanes are active
        # for some properties, as best we can, which right now is the algorithm described in main().
        # If we ever need to support things like breakouts consuming multiple lanes and stuff, this will break
        is_breakout = len(siblings) > 1
        our_lane = siblings.index(key) + 1 if is_breakout else None
        for domkey, unit, chan in DOM_SENSOR_PROPERTIES:
            value = to_float(sfp_dom.get(domkey, None))
            if value is None:
                continue
            # Every comparison against NaN is False, so an unknown lane
            # count never filters a lane out.
            if chan and chan > lane_count:
                continue
            prop_active = (not is_breakout) or (chan is not None and chan == our_lane)
            lines.append('sfp_eeprom_value{interface="%s",property="%s",unit="%s",active="%s"} %f' % (
                label, domkey, unit, 'true' if prop_active else 'false', value))

    if sfp_thres is not None:
        for domkey, unit in DOM_THRESHOLD_PROPERTIES:
            value = to_float(sfp_thres.get(domkey, None))
            if value is None:
                continue
            lines.append('sfp_eeprom_value{interface="%s",property="%s",unit="%s",active="true"} %f' % (
                label, domkey, unit, value))
    return lines


def main():
    """Print transceiver metrics for every Ethernet port to stdout."""
    sonic = sonic_db.Db()
    sorted_table_keys = natsorted(sonic.db.keys(sonic.db.APPL_DB, "PORT_TABLE:Ethernet*"))

    # Try to calculate active SFP lanes inside multi-lane optics by using SONIX proprietary
    # secret heuristics.
    # 1) Get all ports, group them by "index" (i.e. the SFP port index).
    # 2) Order lists by port name (numerically)
    # 3) Find the index in the list that our port is at
    # This will surely never ever break ;-)
    port_index = {}
    port_to_interfaces = collections.defaultdict(list)
    for key in sorted_table_keys:
        index = sonic.db.get(sonic.db.APPL_DB, key, 'index')
        port_index[key] = index
        port_to_interfaces[index].append(key)

    print('# HELP sfp_model SFP model number')
    print('# TYPE sfp_model gauge')
    print('# HELP sfp_eeprom_value SFP metric')
    print('# TYPE sfp_eeprom_value gauge')
    print('# HELP sfp_media_lane_count Number of media (e.g. optical) lanes active')
    print('# TYPE sfp_media_lane_count gauge')

    for key in sorted_table_keys:
        interface = re.split(':', key, maxsplit=1)[-1].strip()
        sfp_info = sonic.db.get_all(sonic.db.STATE_DB, 'TRANSCEIVER_INFO|{}'.format(interface))
        sfp_dom = sonic.db.get_all(sonic.db.STATE_DB, 'TRANSCEIVER_DOM_SENSOR|{}'.format(interface))
        sfp_thres = sonic.db.get_all(sonic.db.STATE_DB, 'TRANSCEIVER_DOM_THRESHOLD|{}'.format(interface))
        siblings = port_to_interfaces[port_index[key]]
        for line in render_interface(interface, key, siblings, sfp_info, sfp_dom, sfp_thres):
            print(line)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment