@prologic
Created April 11, 2022 22:34
Prometheus AlertRules
---
groups:
  - name: msgbus
    interval: 15s
    rules:
      - alert: zero_subscribers
        expr: |
          msgbus_bus_subscribers == 0
        for: 30s
        annotations:
          summary: "{{ $labels.instance }}"
          description: "{{ $labels.instance }} has zero subscribers"
  - name: probes
    interval: 15s
    rules:
      - alert: probe_failing
        expr: "absent(probe_success == 1)"
        for: 5m
        annotations:
          summary: "{{ $labels.instance }}"
          description: "probe {{ $labels.instance }} is failing"
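  # Container alerts built on cAdvisor-style container_* metrics, which carry the
  # Docker Swarm service/task name labels used below.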
  - name: containers
    interval: 15s
    rules:
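      # Each restart of a Swarm task starts a new container, which shows up as a new
      # container_last_seen series; four or more distinct containers for one service
      # within 15m suggests a crash loop.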
      - alert: container_crashing
        expr: |
          count by (instance, container_label_com_docker_swarm_service_name)
          (
            count_over_time(container_last_seen{container_label_com_docker_swarm_service_name!=""}[15m])
          ) - 1 >= 3
        for: 5m
        annotations:
          summary: "{{ $labels.container_label_com_docker_swarm_service_name }}"
          description: "{{ $labels.container_label_com_docker_swarm_service_name }} on {{ $labels.instance }} is crashing"
      - alert: memory_pressure
        expr: |
          sum(
            (
              container_memory_working_set_bytes{image!="", container_label_com_docker_swarm_task_name!=""}
              /
              container_spec_memory_limit_bytes{image!="", container_label_com_docker_swarm_task_name!=""}
            ) * 100 != +Inf
          ) by (container_label_com_docker_swarm_task_name) > 80
        for: 5m
        annotations:
          summary: "{{ $labels.container_label_com_docker_swarm_task_name }}"
          description: "{{ $labels.container_label_com_docker_swarm_task_name }} on {{ $labels.instance }} is under memory pressure of {{ humanize $value }}%"
      - alert: cpu_throttled
        expr: |
          sum(
            rate(container_cpu_cfs_throttled_seconds_total{image!="", container_label_com_docker_swarm_task_name!=""}[5m]) * 100
          )
          by (container_label_com_docker_swarm_task_name)
          > 120
        for: 5m
        annotations:
          summary: "{{ $labels.container_label_com_docker_swarm_task_name }}"
          description: "{{ $labels.container_label_com_docker_swarm_task_name }} is being cpu throttled at {{ humanize $value }}%"
      - alert: degraded_service
        expr: |
          (swarm_service_desired_replicas - on(service) swarm_service_replicas_state{state="running"}) != 0
        for: 5m
        annotations:
          summary: "{{ $labels.service }}"
          description: "service {{ $labels.service }} on {{ $labels.instance }} is degraded"
  - name: nodes
    interval: 15s
    rules:
      - alert: node_down
        expr: up{job="docker_nodes"} == 0
        for: 5m
        annotations:
          summary: "{{ $labels.instance }}"
          description: "node {{ $labels.instance }} is down"
      - alert: disk_usage
        expr: (sum(((1 - (node_filesystem_avail_bytes{job="docker_nodes", mountpoint="/"} / node_filesystem_size_bytes{job="docker_nodes", mountpoint="/"})) * 100) * on(instance) group_left(node_id, node_name) node_meta) by (node_name)) > 80
        for: 5m
        annotations:
          summary: "{{ $labels.node_name }} {{ $value }}"
          description: "disk usage is at {{ humanize $value }}% on {{ $labels.node_name }}"
      - alert: memory_usage
        expr: (sum(((1 - (node_memory_MemAvailable_bytes{job="docker_nodes"} / node_memory_MemTotal_bytes{job="docker_nodes"})) * 100) * on(instance) group_left(node_id, node_name) node_meta) by (node_name)) > 80
        for: 5m
        annotations:
          summary: "{{ $labels.node_name }} {{ $value }}"
          description: "memory usage is at {{ humanize $value }}% on {{ $labels.node_name }}"
      - alert: cpu_usage
        expr: (sum((100 - (avg by (instance) (irate(node_cpu_seconds_total{job="docker_nodes",mode="idle"}[5m])) * 100)) * on(instance) group_left(node_id, node_name) node_meta) by (node_name)) > 80
        for: 15m
        annotations:
          summary: "{{ $labels.node_name }} {{ $value }}"
          description: "cpu usage is at {{ humanize $value }}% on {{ $labels.node_name }}"
  - name: traefik
    interval: 15s
    rules:
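      # traefik_service_server_up is 1 per healthy backend server; a per-service
      # sum of 0 means every backend for that service is down.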
      - alert: all_services_down
        expr: sum(traefik_service_server_up) by (service) == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "ALL SERVERS DOWN: {{ $labels.service }}"
          description: "all backend servers for {{ $labels.service }} are down"
      - alert: 5xx_errors
        expr: |
          label_replace(
            sum(
              rate(traefik_service_requests_total{protocol="http", code=~"5.*"}[5m])
            ) by (service),
            "short_label", "$1", "service", "(.*)@.*"
          ) > 0.002
        for: 5m
        annotations:
          summary: "{{ $labels.short_label }}"
          description: "error rate for {{ $labels.short_label }} is at {{ $value }}"
      - alert: p95_latency
        expr: |
          label_replace(
            histogram_quantile(
              0.95,
              sum(
                rate(traefik_service_request_duration_seconds_bucket{protocol="http"}[5m])
              ) by (service, le)
            ),
            "short_label", "$1", "service", "(.*)@.*"
          ) > 1.5
        for: 5m
        annotations:
          summary: "{{ $labels.short_label }}"
          description: "p95 latency for {{ $labels.short_label }} is at {{ $value }}s"