Skip to content

Instantly share code, notes, and snippets.

@joltcan
Created April 3, 2026 08:14
Show Gist options
  • Select an option

  • Save joltcan/94e12a182ac7c987ebf6f4c050604f32 to your computer and use it in GitHub Desktop.

Select an option

Save joltcan/94e12a182ac7c987ebf6f4c050604f32 to your computer and use it in GitHub Desktop.
Prometheus alerting and recording rules (notifications routed via Alertmanager)
groups:
  - name: node_exporters
    rules:
      # Combined in+out traffic across all scraped interfaces above ~99 MB/s
      # sustained for an hour.
      - alert: HighNetworkTraffic
        expr: sum(rate(node_network_receive_bytes_total[1m]) + rate(node_network_transmit_bytes_total[1m])) > 99000000
        for: 60m
        labels:
          severity: warning
        annotations:
          description: "Total bandwidth is at {{ $value|humanize }}"
      # NOTE(review): cpu_idle_percent is recorded below (Converters group)
      # with sum by (instance), so only the `instance` label survives —
      # confirm the `dc` matcher and `$labels.host` below actually resolve.
      - alert: InstanceHighCpu
        expr: (100 - cpu_idle_percent{dc!="titco"}) > 80
        for: 120m
        labels:
          severity: warning
        annotations:
          description: "High CPU usage on {{ $labels.host }} - (currently {{ $value|humanize }}% in use)"
      - alert: InstanceHighCpuNoACS
        expr: (100 - cpu_idle_percent{instance!="somehostwithhighcpuusage"}) > 80
        for: 24h
        labels:
          severity: warning
        annotations:
          description: "High CPU usage on {{ $labels.host }} - (currently {{ $value|humanize }}% in use)"
      # mem_used_percent is recorded in the memory_usage_node_exporter group.
      - alert: InstanceHighMem
        expr: mem_used_percent > 90
        for: 120m
        labels:
          severity: warning
        annotations:
          description: "High memory usage on {{ $labels.host }} - (currently {{ $value|humanize }}% in use)"
      # Fires per dead target, or when no `up` series exists at all.
      - alert: service_down
        expr: up == 0 or absent(up)
        for: 120m
        labels:
          severity: critical
        annotations:
          description: '{{ $labels.instance }} service is down.'
      # Linear 48h projection of disk usage from the last 2h of samples.
      # Fixed: a regex pattern requires the !~ matcher (!= compares the
      # literal string "loop.*"); PromQL regexes are fully anchored, so a
      # leading .* is needed to also catch devices like /dev/loop0.
      - alert: disk_filled_in_X_hours
        expr: predict_linear(disk_used_percent{device!~".*loop.*",device!="overlay"}[2h], 48 * 3600) > 90
        for: 2h
        labels:
          severity: warning
        annotations:
          description: "Disk {{ $labels.device }} on {{ $labels.instance }} will run out of space in the coming X hours, (currently {{ $value|humanize}}% in use)"
      # NOTE(review): disk_used_percent is recorded from node_filesystem_*
      # metrics, which carry `instance` rather than `host` — a positive
      # host="..." matcher on a missing label matches nothing; confirm a
      # `host` label is added via relabeling.
      - alert: disk_almost_full
        expr: disk_used_percent{device!~".*loop.*",device!="overlay", host!="somehost"} > 90
        for: 1h
        labels:
          severity: critical
        annotations:
          description: "Disk {{ $labels.device }} on {{ $labels.instance }} has less than 5% left (currently {{ $value|humanize}}% in use)"
      # Fixed typo in the alert name: somehist -> somehost.
      - alert: disk_almost_full_somehost
        expr: disk_used_percent{device!~".*loop.*",device!="overlay",host="somehost"} > 95
        for: 6h
        labels:
          severity: critical
        annotations:
          description: "Disk {{ $labels.device }} on {{ $labels.instance }} has less than 5% left (currently {{ $value|humanize}}% in use)"
- name: Web (blackbox_exporter)
rules:
- alert: site_not_reachable
expr: absent(probe_success) or probe_success{job!="blackbox_exporter"} == 0
for: 2h
labels:
severity: warning
annotations:
description: "Blackbox on {{ $labels.job }} can't reach {{ $labels.instance }}"
- alert: cert_about_to_expire
expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 15 or time()-x509_cert_expiry < 86400 * 15
for: 240m
labels:
severity: warning
annotations:
description: Cert is about to expire in {{ $value|humanizeDuration }} on {{ $labels.instance }}
- alert: web not success
expr: http_response_result_code{result!="success"}
for: 30m
labels:
severity: warning
annotations:
description: "Looks like {{ $labels.server }} on {{ $labels.host }} is having an error. Please investigate."
- name: Email (needs local config)
rules:
- alert: Exim queue length
expr: absent(exim4_queuelength_value) or exim4_queuelength_value > 10
for: 1h
labels:
severity: warning
annotations:
description: Exim queue counter has been stuck for a long time on {{ $labels.instance }}, currently at {{ $value }}. Investigate with exim -bp
- alert: Postfix queue length
expr: absent(postfix_queue_length) or postfix_queue_length> 10
for: 1h
labels:
severity: warning
annotations:
description: Postfix queue length has been stuck for a long time on {{ $labels.instance }}, currently at {{ $value }}. Investigate with mailq, postsuper -d ALL deferred
- name: Custom pushgateway alerts # cron or custom curl to pushgateway which are then stored in prom
rules:
- alert: email_blocked-live_com
expr: check-ip-blocked > 0
labels:
severity: warning
annotations:
description: "Looks like {{ $labels.instance }} and IP {{ $labels.ip }} is blocked in https://sendersupport.olc.protection.outlook.com/snds/ipStatus.aspx"
# cron once per month to make sure it still works
- alert: alertmanager_test
expr: alertmanager_test == 0
labels:
severity: warning
annotations:
description: "This is a test alert to verify that Alertmanager is working on {{ $labels.instance }} value: {{ $value|humanize }}. This alert runs once per month"
# cron once per month to reset above
- alert: alertmanager_test_crit
expr: alertmanager_test == 0
labels:
severity: critical
annotations:
description: "This is a test alert to verify that Alertmanager is working on {{ $labels.instance }} value: {{ $value|humanize }}. This alert runs once per month"
# used together with https://github.com/joltcan/backup-restic
- alert: Restic
expr: time() - completed{job="restic"} > 3600*24*3
labels:
severity: warning
annotations:
description: restic last succeeded {{humanizeDuration $value}} ago on {{ $labels.instance }}
- name: UPS # I use nut on a linux server
rules:
- alert: ups_offline
expr: network_ups_tools_ups_status{flag="OL"} == 0
labels:
severity: error
annotations:
description: UPS-monitor on {{ $labels.instance }} just went offline.
- alert: ups_charge
expr: network_ups_tools_battery_charge < 30
labels:
severity: warning
annotations:
description: UPS charge on {{ $labels.instance }} is at {{ $value|humanize }}%
- alert: ups_load
expr: network_ups_tools_ups_load > 90
labels:
severity: warning
annotations:
description: UPS load on {{ $labels.instance }} is at {{ $value|humanize }}%
- name: ZFS
rules:
- alert: zfs_pool_used
expr: zfs_pool_PERCENT > 90
for: 6h
labels:
severity: warning
annotations:
summary: zfs pool {{ $labels.NAME }} is at {{ $value|humanize }}% on {{ $labels.instance }}. Is /usr/local/bin/clean_datasets.sh running?
- alert: zfs_pool_full
expr: zfs_pool_PERCENT > 95
labels:
severity: error
annotations:
summary: zfs pool {{ $labels.NAME }} is at {{ $value|humanize }}% on {{ $labels.instance }}
- alert: zfs_pool_health
expr: zfs_pool_HEALTH > 0
labels:
severity: error
annotations:
summary: zfs pool {{ $labels.NAME }} on {{ $labels.instance }} is in a degraded state
# These rules convert from raw prom data to more human readable and are then used above
- name: Converters
rules:
- record: network_ups_tools_ups_power
expr: round(network_ups_tools_ups_load{instance="stor.hemma:9199", job="nut_ups"} * 0.01 * 500 * 0.6)
- record: zfs_pool_PERCENT
expr: "(zfs_pool_SIZE-zfs_pool_FREE)/zfs_pool_SIZE*100"
- record: cpu_idle_percent
expr: (sum(irate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance) / sum(irate(node_cpu_seconds_total[5m])) by (instance)) * 100
- name: disk_usage_node_exporter
rules:
- record: disk_used_percent
expr: |
100 * (1 - (node_filesystem_avail_bytes / node_filesystem_size_bytes))
- name: memory_usage_node_exporter
rules:
- record: mem_used_percent
expr: |
100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment