Created
April 3, 2026 08:14
-
-
Save joltcan/94e12a182ac7c987ebf6f4c050604f32 to your computer and use it in GitHub Desktop.
Prometheus Alertmanager rules
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
groups:
# Host-level alerts. cpu_idle_percent, mem_used_percent and disk_used_percent
# are produced by the recording-rule groups later in this file.
- name: node_exporters
  rules:
  - alert: HighNetworkTraffic
    # Combined RX+TX rate summed over all interfaces of all scraped hosts
    # (no "by (instance)" grouping), alerting at ~99 MB/s sustained for 1h.
    # NOTE(review): confirm a single fleet-wide series is intended here.
    expr: sum(rate(node_network_receive_bytes_total[1m]) + rate(node_network_transmit_bytes_total[1m])) > 99000000
    for: 60m
    labels:
      severity: warning
    annotations:
      description: "Total bandwidth is at {{ $value|humanize }}"
  - alert: InstanceHighCpu
    # NOTE(review): cpu_idle_percent is recorded "by (instance)" below, so it
    # likely carries no "dc" or "host" label; dc!="titco" would then match
    # everything and {{ $labels.host }} render empty — confirm on live data.
    expr: (100 - cpu_idle_percent{dc!="titco"}) > 80
    for: 120m
    labels:
      severity: warning
    annotations:
      description: "High CPU usage on {{ $labels.host }} - (currently {{ $value|humanize }}% in use)"
  - alert: InstanceHighCpuNoACS
    # Same CPU check with a much longer grace period, excluding one
    # known-busy host.
    expr: (100 - cpu_idle_percent{instance!="somehostwithhighcpuusage"}) > 80
    for: 24h
    labels:
      severity: warning
    annotations:
      description: "High CPU usage on {{ $labels.host }} - (currently {{ $value|humanize }}% in use)"
  - alert: InstanceHighMem
    expr: mem_used_percent > 90
    for: 120m
    labels:
      severity: warning
    annotations:
      description: "High memory usage on {{ $labels.host }} - (currently {{ $value|humanize }}% in use)"
  - alert: service_down
    # absent(up) also fires when no target reports "up" at all (e.g. the
    # scrape configuration itself is broken).
    expr: up == 0 or absent(up)
    for: 120m
    labels:
      severity: critical
    annotations:
      description: '{{ $labels.instance }} service is down.'
  - alert: disk_filled_in_X_hours
    # Linear 48h projection of disk usage based on the last 2h trend.
    # FIX: "!=" is an exact-string matcher, so device!="loop.*" never
    # excluded loop0, loop1, ...; the regex matcher "!~" is required.
    expr: predict_linear(disk_used_percent{device!~"loop.*",device!="overlay"}[2h], 48 * 3600) > 90
    for: 2h
    labels:
      severity: warning
    annotations:
      description: "Disk {{ $labels.device }} on {{ $labels.instance }} will run out of space in the coming X hours, (currently {{ $value|humanize}}% in use)"
  - alert: disk_almost_full
    # FIX: regex exclusion needs "!~", not "!=" (see disk_filled_in_X_hours).
    expr: disk_used_percent{device!~"loop.*",device!="overlay", host!="somehost"} > 90
    for: 1h
    labels:
      severity: critical
    annotations:
      description: "Disk {{ $labels.device }} on {{ $labels.instance }} has less than 5% left (currently {{ $value|humanize}}% in use)"
  - alert: disk_almost_full_somehist
    # Looser threshold / longer hold for one specific host.
    # FIX: regex exclusion needs "!~", not "!=".
    expr: disk_used_percent{device!~"loop.*",device!="overlay",host="somehost"} > 95
    for: 6h
    labels:
      severity: critical
    annotations:
      description: "Disk {{ $labels.device }} on {{ $labels.instance }} has less than 5% left (currently {{ $value|humanize}}% in use)"
- name: Web (blackbox_exporter)
  rules:
  - alert: site_not_reachable
    # Fires when a probe result is missing entirely, or a probe reports
    # failure. NOTE(review): job!="blackbox_exporter" excludes the exporter's
    # own self-scrape job — confirm that is the intent.
    expr: absent(probe_success) or probe_success{job!="blackbox_exporter"} == 0
    for: 2h
    labels:
      severity: warning
    annotations:
      description: "Blackbox on {{ $labels.job }} can't reach {{ $labels.instance }}"
  - alert: cert_about_to_expire
    # Covers both blackbox-probed certs and locally exported x509 expiry
    # metrics; 15-day warning window.
    expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 15 or time() - x509_cert_expiry < 86400 * 15
    for: 240m
    labels:
      severity: warning
    annotations:
      description: Cert is about to expire in {{ $value|humanizeDuration }} on {{ $labels.instance }}
  - alert: web_not_success
    # FIX: the original name "web not success" contained spaces; alert names
    # must match [a-zA-Z_:][a-zA-Z0-9_:]* or Prometheus rejects the rule file.
    expr: http_response_result_code{result!="success"}
    for: 30m
    labels:
      severity: warning
    annotations:
      description: "Looks like {{ $labels.server }} on {{ $labels.host }} is having an error. Please investigate."
- name: Email (needs local config)
  rules:
  - alert: exim_queue_length
    # FIX: the original name "Exim queue length" contained spaces; alert
    # names must match [a-zA-Z_:][a-zA-Z0-9_:]* or rule loading fails.
    # absent() also fires when the exporter stops reporting the metric.
    expr: absent(exim4_queuelength_value) or exim4_queuelength_value > 10
    for: 1h
    labels:
      severity: warning
    annotations:
      description: Exim queue counter has been stuck for a long time on {{ $labels.instance }}, currently at {{ $value }}. Investigate with exim -bp
  - alert: postfix_queue_length
    # FIX: the original name "Postfix queue length" contained spaces (see
    # exim_queue_length above); also added the missing space before ">".
    expr: absent(postfix_queue_length) or postfix_queue_length > 10
    for: 1h
    labels:
      severity: warning
    annotations:
      description: Postfix queue length has been stuck for a long time on {{ $labels.instance }}, currently at {{ $value }}. Investigate with mailq, postsuper -d ALL deferred
- name: Custom pushgateway alerts  # cron or custom curl to pushgateway which are then stored in prom
  rules:
  - alert: email_blocked_live_com
    # FIX: the original name "email_blocked-live_com" contained "-", which is
    # not allowed in alert names ([a-zA-Z_:][a-zA-Z0-9_:]*).
    # FIX: "check-ip-blocked" is not a valid metric name either — PromQL would
    # parse it as the subtraction check - ip - blocked. Renamed to underscores;
    # TODO(review): confirm the pushgateway job pushes "check_ip_blocked".
    expr: check_ip_blocked > 0
    labels:
      severity: warning
    annotations:
      description: "Looks like {{ $labels.instance }} and IP {{ $labels.ip }} is blocked in https://sendersupport.olc.protection.outlook.com/snds/ipStatus.aspx"
  # cron once per month to make sure it still works
  - alert: alertmanager_test
    expr: alertmanager_test == 0
    labels:
      severity: warning
    annotations:
      description: "This is a test alert to verify that Alertmanager is working on {{ $labels.instance }} value: {{ $value|humanize }}. This alert runs once per month"
  # cron once per month to reset above
  - alert: alertmanager_test_crit
    # Same probe as alertmanager_test, duplicated at critical severity so the
    # critical routing path is exercised too.
    expr: alertmanager_test == 0
    labels:
      severity: critical
    annotations:
      description: "This is a test alert to verify that Alertmanager is working on {{ $labels.instance }} value: {{ $value|humanize }}. This alert runs once per month"
  # used together with https://github.com/joltcan/backup-restic
  - alert: Restic
    # Fires when the last recorded backup completion is older than 3 days.
    expr: time() - completed{job="restic"} > 3600*24*3
    labels:
      severity: warning
    annotations:
      description: restic last succeeded {{humanizeDuration $value}} ago on {{ $labels.instance }}
- name: UPS  # I use nut on a linux server
  rules:
  - alert: ups_offline
    # The "OL" (on-line) status flag dropping to 0 means the UPS is no longer
    # on mains power.
    # NOTE(review): "error" is not used by the other groups (warning/critical)
    # — confirm the Alertmanager routes actually match this severity.
    expr: network_ups_tools_ups_status{flag="OL"} == 0
    labels:
      severity: error
    annotations:
      description: UPS-monitor on {{ $labels.instance }} just went offline.
  - alert: ups_charge
    # Battery below 30% remaining.
    expr: network_ups_tools_battery_charge < 30
    labels:
      severity: warning
    annotations:
      description: UPS charge on {{ $labels.instance }} is at {{ $value|humanize }}%
  - alert: ups_load
    # Output load above 90% of the UPS's rated capacity.
    expr: network_ups_tools_ups_load > 90
    labels:
      severity: warning
    annotations:
      description: UPS load on {{ $labels.instance }} is at {{ $value|humanize }}%
- name: ZFS
  rules:
  - alert: zfs_pool_used
    # zfs_pool_PERCENT is computed by the Converters group below.
    expr: zfs_pool_PERCENT > 90
    for: 6h
    labels:
      severity: warning
    annotations:
      summary: zfs pool {{ $labels.NAME }} is at {{ $value|humanize }}% on {{ $labels.instance }}. Is /usr/local/bin/clean_datasets.sh running? 
  - alert: zfs_pool_full
    # NOTE(review): severity "error" is inconsistent with the warning/critical
    # scheme used elsewhere in this file — confirm routing matches it.
    expr: zfs_pool_PERCENT > 95
    labels:
      severity: error
    annotations:
      summary: zfs pool {{ $labels.NAME }} is at {{ $value|humanize }}% on {{ $labels.instance }}
  - alert: zfs_pool_health
    # Any non-zero health value reported by the exporter is treated as a
    # degraded pool.
    expr: zfs_pool_HEALTH > 0
    labels:
      severity: error
    annotations:
      summary: zfs pool {{ $labels.NAME }} on {{ $labels.instance }} is in a degraded state
# These rules convert from raw prom data to more human readable and are then used above
- name: Converters
  rules:
  - record: network_ups_tools_ups_power
    # Approximate output watts from the load percentage.
    # NOTE(review): instance and the 500 / 0.6 factors are hard-coded for one
    # specific UPS — presumably rated VA and power factor; verify.
    expr: round(network_ups_tools_ups_load{instance="stor.hemma:9199", job="nut_ups"} * 0.01 * 500 * 0.6)
  - record: zfs_pool_PERCENT
    # Pool usage as a percentage of total size; consumed by the ZFS group.
    expr: "(zfs_pool_SIZE-zfs_pool_FREE)/zfs_pool_SIZE*100"
  - record: cpu_idle_percent
    # Per-instance idle CPU percentage; consumed by the node_exporters group.
    expr: (sum(irate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance) / sum(irate(node_cpu_seconds_total[5m])) by (instance)) * 100
- name: disk_usage_node_exporter
  rules:
  - record: disk_used_percent
    # Filesystem usage percentage per device/mountpoint, derived from
    # node_exporter's available/size gauges; consumed by the disk alerts above.
    expr: |
      100 * (1 - (node_filesystem_avail_bytes / node_filesystem_size_bytes))
- name: memory_usage_node_exporter
  rules:
  - record: mem_used_percent
    # Memory usage percentage based on MemAvailable (accounts for reclaimable
    # cache); consumed by the InstanceHighMem alert above.
    expr: |
      100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment