Skip to content

Instantly share code, notes, and snippets.

@joltcan
Created April 3, 2026 08:14
Show Gist options
  • Select an option

  • Save joltcan/94e12a182ac7c987ebf6f4c050604f32 to your computer and use it in GitHub Desktop.

Select an option

Save joltcan/94e12a182ac7c987ebf6f4c050604f32 to your computer and use it in GitHub Desktop.
Prometheus alerting and recording rules (notifications routed via Alertmanager)
groups:
  - name: node_exporters
    rules:
      # Combined in+out traffic across all scraped interfaces above ~99 MB/s
      # sustained for an hour.
      - alert: HighNetworkTraffic
        expr: sum(rate(node_network_receive_bytes_total[1m]) + rate(node_network_transmit_bytes_total[1m])) > 99000000
        for: 60m
        labels:
          severity: warning
        annotations:
          description: "Total bandwidth is at {{ $value|humanize }}"
      # NOTE(review): cpu_idle_percent is recorded below (Converters group)
      # with sum by (instance), so only the `instance` label survives —
      # confirm the `dc` matcher and `$labels.host` below actually resolve.
      - alert: InstanceHighCpu
        expr: (100 - cpu_idle_percent{dc!="titco"}) > 80
        for: 120m
        labels:
          severity: warning
        annotations:
          description: "High CPU usage on {{ $labels.host }} - (currently {{ $value|humanize }}% in use)"
      - alert: InstanceHighCpuNoACS
        expr: (100 - cpu_idle_percent{instance!="somehostwithhighcpuusage"}) > 80
        for: 24h
        labels:
          severity: warning
        annotations:
          description: "High CPU usage on {{ $labels.host }} - (currently {{ $value|humanize }}% in use)"
      # mem_used_percent is recorded in the memory_usage_node_exporter group.
      - alert: InstanceHighMem
        expr: mem_used_percent > 90
        for: 120m
        labels:
          severity: warning
        annotations:
          description: "High memory usage on {{ $labels.host }} - (currently {{ $value|humanize }}% in use)"
      # Fires per dead target, or when no `up` series exists at all.
      - alert: service_down
        expr: up == 0 or absent(up)
        for: 120m
        labels:
          severity: critical
        annotations:
          description: '{{ $labels.instance }} service is down.'
      # Linear 48h projection of disk usage from the last 2h of samples.
      # Fixed: a regex pattern requires the !~ matcher (!= compares the
      # literal string "loop.*"); PromQL regexes are fully anchored, so a
      # leading .* is needed to also catch devices like /dev/loop0.
      - alert: disk_filled_in_X_hours
        expr: predict_linear(disk_used_percent{device!~".*loop.*",device!="overlay"}[2h], 48 * 3600) > 90
        for: 2h
        labels:
          severity: warning
        annotations:
          description: "Disk {{ $labels.device }} on {{ $labels.instance }} will run out of space in the coming X hours, (currently {{ $value|humanize}}% in use)"
      # NOTE(review): disk_used_percent is recorded from node_filesystem_*
      # metrics, which carry `instance` rather than `host` — a positive
      # host="..." matcher on a missing label matches nothing; confirm a
      # `host` label is added via relabeling.
      - alert: disk_almost_full
        expr: disk_used_percent{device!~".*loop.*",device!="overlay", host!="somehost"} > 90
        for: 1h
        labels:
          severity: critical
        annotations:
          description: "Disk {{ $labels.device }} on {{ $labels.instance }} has less than 5% left (currently {{ $value|humanize}}% in use)"
      # Fixed typo in the alert name: somehist -> somehost.
      - alert: disk_almost_full_somehost
        expr: disk_used_percent{device!~".*loop.*",device!="overlay",host="somehost"} > 95
        for: 6h
        labels:
          severity: critical
        annotations:
          description: "Disk {{ $labels.device }} on {{ $labels.instance }} has less than 5% left (currently {{ $value|humanize}}% in use)"
- name: Web (blackbox_exporter)
rules:
- alert: site_not_reachable
expr: absent(probe_success) or probe_success{job!="blackbox_exporter"} == 0
for: 2h
labels:
severity: warning
annotations:
description: "Blackbox on {{ $labels.job }} can't reach {{ $labels.instance }}"
- alert: cert_about_to_expire
expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 15 or time()-x509_cert_expiry < 86400 * 15
for: 240m
labels:
severity: warning
annotations:
description: Cert is about to expire in {{ $value|humanizeDuration }} on {{ $labels.instance }}
- alert: web not success
expr: http_response_result_code{result!="success"}
for: 30m
labels:
severity: warning
annotations:
description: "Looks like {{ $labels.server }} on {{ $labels.host }} is having an error. Please investigate."
- name: Email (needs local config)
rules:
- alert: Exim queue length
expr: absent(exim4_queuelength_value) or exim4_queuelength_value > 10
for: 1h
labels:
severity: warning
annotations:
description: Exim queue counter has been stuck for a long time on {{ $labels.instance }}, currently at {{ $value }}. Investigate with exim -bp
- alert: Postfix queue length
expr: absent(postfix_queue_length) or postfix_queue_length> 10
for: 1h
labels:
severity: warning
annotations:
description: Postfix queue length has been stuck for a long time on {{ $labels.instance }}, currently at {{ $value }}. Investigate with mailq, postsuper -d ALL deferred
- name: Custom pushgateway alerts # cron or custom curl to pushgateway which are then stored in prom
rules:
- alert: email_blocked-live_com
expr: check-ip-blocked > 0
labels:
severity: warning
annotations:
description: "Looks like {{ $labels.instance }} and IP {{ $labels.ip }} is blocked in https://sendersupport.olc.protection.outlook.com/snds/ipStatus.aspx"
# cron once per month to make sure it still works
- alert: alertmanager_test
expr: alertmanager_test == 0
labels:
severity: warning
annotations:
description: "This is a test alert to verify that Alertmanager is working on {{ $labels.instance }} value: {{ $value|humanize }}. This alert runs once per month"
# cron once per month to reset above
- alert: alertmanager_test_crit
expr: alertmanager_test == 0
labels:
severity: critical
annotations:
description: "This is a test alert to verify that Alertmanager is working on {{ $labels.instance }} value: {{ $value|humanize }}. This alert runs once per month"
# used together with https://github.com/joltcan/backup-restic
- alert: Restic
expr: time() - completed{job="restic"} > 3600*24*3
labels:
severity: warning
annotations:
description: restic last succeeded {{humanizeDuration $value}} ago on {{ $labels.instance }}
- name: UPS # I use nut on a linux server
rules:
- alert: ups_offline
expr: network_ups_tools_ups_status{flag="OL"} == 0
labels:
severity: error
annotations:
description: UPS-monitor on {{ $labels.instance }} just went offline.
- alert: ups_charge
expr: network_ups_tools_battery_charge < 30
labels:
severity: warning
annotations:
description: UPS charge on {{ $labels.instance }} is at {{ $value|humanize }}%
- alert: ups_load
expr: network_ups_tools_ups_load > 90
labels:
severity: warning
annotations:
description: UPS load on {{ $labels.instance }} is at {{ $value|humanize }}%
- name: ZFS
rules:
- alert: zfs_pool_used
expr: zfs_pool_PERCENT > 90
for: 6h
labels:
severity: warning
annotations:
summary: zfs pool {{ $labels.NAME }} is at {{ $value|humanize }}% on {{ $labels.instance }}. Is /usr/local/bin/clean_datasets.sh running?
- alert: zfs_pool_full
expr: zfs_pool_PERCENT > 95
labels:
severity: error
annotations:
summary: zfs pool {{ $labels.NAME }} is at {{ $value|humanize }}% on {{ $labels.instance }}
- alert: zfs_pool_health
expr: zfs_pool_HEALTH > 0
labels:
severity: error
annotations:
summary: zfs pool {{ $labels.NAME }} on {{ $labels.instance }} is in a degraded state
# These rules convert from raw prom data to more human readable and are then used above
- name: Converters
rules:
- record: network_ups_tools_ups_power
expr: round(network_ups_tools_ups_load{instance="stor.hemma:9199", job="nut_ups"} * 0.01 * 500 * 0.6)
- record: zfs_pool_PERCENT
expr: "(zfs_pool_SIZE-zfs_pool_FREE)/zfs_pool_SIZE*100"
- record: cpu_idle_percent
expr: (sum(irate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance) / sum(irate(node_cpu_seconds_total[5m])) by (instance)) * 100
- name: disk_usage_node_exporter
rules:
- record: disk_used_percent
expr: |
100 * (1 - (node_filesystem_avail_bytes / node_filesystem_size_bytes))
- name: memory_usage_node_exporter
rules:
- record: mem_used_percent
expr: |
100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment