Last active
September 23, 2022 15:13
-
-
Save mjf/07dcfa12db1f16228c398f0317266484 to your computer and use it in GitHub Desktop.
Prometheus Recording and Alert Rules Collection
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Prometheus Recording and Alert Rules Collection
# Copyright (C) 2017 Matous Jan Fialka, <http://mjf.cz/>
# Released under the terms of The MIT License
# Top-level list of rule groups; every group below declares its own
# evaluation interval and its own list of recording/alerting rules.
groups:
# Host-level health alerts: CPU saturation, swap and memory pressure, and
# scrape-target availability. Evaluated every 30 seconds.
- name: node_common
  interval: 30s
  rules:
  # Share of non-idle CPU time over the last 5 minutes, per (instance, job).
  # node_cpu is a cumulative per-mode seconds counter, so each sum is taken
  # over rate(); dividing the raw counters would report the average since
  # boot rather than the current load. Regex matchers are fully anchored by
  # Prometheus, so no explicit ^...$ is required.
  - alert: processor_usage_too_high
    expr: |
      (
          sum(rate(node_cpu{mode=~"user|nice|system|irq|softirq|steal|idle|iowait"}[5m])) by (instance, job)
        - sum(rate(node_cpu{mode=~"idle|iowait"}[5m])) by (instance, job)
      )
      / sum(rate(node_cpu{mode=~"user|nice|system|irq|softirq|steal|idle|iowait"}[5m])) by (instance, job)
      * 100 > 95
    for: 5m
    labels:
      severity: critical
    annotations:
      description: 'Instance {{ $labels.instance }} of job {{ $labels.job }} has processor above 95% (current value: {{ printf "%.2f" $value }}%) for over 5 minutes'
      summary: 'Processor usage above 95%'
  # Used swap as a percentage of total swap. The annotations state the same
  # 50% threshold that the expression and the alert name use.
  - alert: swap_usage_above_50_percent
    expr: |
      (((node_memory_SwapTotal - node_memory_SwapFree) / node_memory_SwapTotal) * 100) > 50
    for: 1h
    labels:
      severity: moderate
    annotations:
      description: 'Instance {{ $labels.instance }} of job {{ $labels.job }} has swap usage above 50% (current value: {{ printf "%.2f" $value }}%) for over 1 hour'
      summary: 'Swap usage above 50%'
  # Used memory (total minus free minus page cache) as a percentage of total.
  # The threshold is 90 to agree with the alert name and its annotations.
  - alert: memory_usage_above_90_percent
    expr: |
      (((node_memory_MemTotal - node_memory_MemFree - node_memory_Cached) / (node_memory_MemTotal) * 100)) > 90
    for: 5m
    labels:
      severity: critical
    annotations:
      description: 'Instance {{ $labels.instance }} of job {{ $labels.job }} has memory usage above 90% (current value: {{ printf "%.2f" $value }}%) for over 5 minutes'
      summary: 'Memory usage above 90%'
  # Fires for every series of the "up" metric that has read 0 for one minute.
  - alert: node_down
    expr: |
      up == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      description: 'Instance {{ $labels.instance }} of job {{ $labels.job }} has been down for over 1 minute'
      summary: 'Node down'
# Capacity alerts: linear forecasts for file descriptors and disk space,
# plus a plain low-disk-space threshold. Evaluated every 30 seconds.
- name: node_predictions
  interval: 30s
  rules:
  # Fraction of the process file-descriptor limit currently in use
  # (open descriptors divided by the maximum).
  - record: instance:fd_utilization
    expr: |
      process_open_fds / process_max_fds
  # Extrapolates the last hour of descriptor utilization 4 hours ahead
  # (4 * 3600 seconds); a predicted ratio above 1 means the limit is hit.
  - alert: file_descriptors_exhausted_in_4_hours
    expr: |
      predict_linear(instance:fd_utilization[1h], 4 * 3600) > 1
    for: 10m
    labels:
      severity: critical
    annotations:
      description: 'Instance {{ $labels.instance }} of job {{ $labels.job }} will have file descriptors exhausted in 4 hours'
      summary: 'File descriptors will be exhausted soon'
  # Extrapolates the last hour of node_filesystem_free 8 hours ahead
  # (8 * 3600 seconds); a predicted value below zero triggers the alert.
  - alert: disk_space_exhausted_in_8_hours
    expr: |
      predict_linear(node_filesystem_free[1h], 8 * 3600) < 0
    for: 20m
    labels:
      severity: moderate
    annotations:
      description: 'Instance {{ $labels.instance }} of job {{ $labels.job }} will have disk space exhausted in 8 hours'
      summary: 'Disk space will be exhausted soon'
  # Not a forecast: available space as a percentage of filesystem size,
  # alerting once it has stayed at or below 10% for 15 minutes.
  - alert: disk_space_almost_exhausted
    expr: |
      node_filesystem_avail / node_filesystem_size * 100 <= 10
    for: 15m
    labels:
      severity: critical
    annotations:
      description: 'Instance {{ $labels.instance }} of job {{ $labels.job }} has disk space less than 10% (current value: {{ printf "%.2f" $value }}%) for 15 minutes'
      summary: 'Disk space almost exhausted'
# Availability alerts for any metric whose name has the form "<prefix>_up",
# where <prefix> contains no underscore. Evaluated every 15 seconds.
- name: service
  interval: 15s
  rules:
  # A matching "<prefix>_up" series has read 0 for three minutes.
  - alert: service_down
    expr: |
      {__name__=~"^(?:[^_]+_up)$"} == 0
    for: 3m
    labels:
      severity: critical
    annotations:
      description: 'Instance {{ $labels.instance }} of job {{ $labels.job }} has been down for over 3 minutes'
      summary: 'Service down'
  # Flapping: more than 5 value changes within 5 minutes, or more than 15
  # within the hour unless the last 30 minutes saw fewer than 7 changes.
  # The description identifies the series by its instance and job labels,
  # the same labels the sibling service_down alert reports for this selector.
  - alert: service_flapping
    expr: |
      changes({__name__=~"^(?:[^_]+_up)$"}[5m]) > 5
      or (
        changes({__name__=~"^(?:[^_]+_up)$"}[60m]) > 15
        unless changes({__name__=~"^(?:[^_]+_up)$"}[30m]) < 7
      )
    labels:
      severity: critical
    annotations:
      description: 'Instance {{ $labels.instance }} of job {{ $labels.job }} is flapping'
      summary: 'Service flapping'
# Alerts on systemd unit state as exposed by node_systemd_unit_state.
# Evaluated every 15 seconds.
- name: systemd_unit
  interval: 15s
  rules:
  # A unit's state="failed" series has been above 0 for three minutes.
  - alert: systemd_unit_failed
    expr: |
      node_systemd_unit_state{state="failed"} > 0
    for: 3m
    labels:
      severity: critical
    annotations:
      description: 'Instance {{ $labels.instance }}: Service {{ $labels.name }} failed'
      summary: 'Systemd unit failed'
  # Flapping of the state="active" series: more than 5 changes within
  # 5 minutes, or more than 15 within the hour unless the last 30 minutes
  # saw fewer than 7 changes.
  - alert: systemd_unit_flapping
    expr: |
      changes(node_systemd_unit_state{state="active"}[5m]) > 5
      or (
        changes(node_systemd_unit_state{state="active"}[60m]) > 15
        unless changes(node_systemd_unit_state{state="active"}[30m]) < 7
      )
    labels:
      severity: critical
    annotations:
      description: 'Instance {{ $labels.instance }}: Service {{ $labels.name }} flapping'
      summary: 'Systemd unit flapping'
# MySQL recording rules and alerts built from mysql_global_* metrics.
# Evaluated every 30 seconds.
- name: mysql
  interval: 30s
  rules:
  # Estimated upper bound of MySQL memory use: the global buffers plus
  # max_connections times the sum of the per-connection buffers.
  # "or up * 0" supplies a zero-valued series where the
  # innodb_additional_mem_pool_size variable is not exported, so the sum
  # still yields a result.
  - record: instance:mysql_estimated_max_used_mem_size
    expr: |
      (
          mysql_global_variables_key_buffer_size
        + mysql_global_variables_query_cache_size
        + mysql_global_variables_tmp_table_size
        + mysql_global_variables_innodb_buffer_pool_size
        + (mysql_global_variables_innodb_additional_mem_pool_size or up * 0)
        + mysql_global_variables_innodb_log_buffer_size
        + (
            mysql_global_variables_max_connections
            * (
                mysql_global_variables_sort_buffer_size
              + mysql_global_variables_read_buffer_size
              + mysql_global_variables_read_rnd_buffer_size
              + mysql_global_variables_join_buffer_size
              + mysql_global_variables_thread_stack
              + mysql_global_variables_binlog_cache_size
            )
          )
      )
  # Per-second rate of commit and rollback commands over 5 minutes, summed
  # across the "command" label only (all other labels are kept).
  - record: job:mysql_transactions:rate5m
    expr: |
      sum(rate(mysql_global_status_commands_total{command=~"(commit|rollback)"}[5m])) without(command)
  # InnoDB log waits exceeding 10 per second, averaged over a 5-minute window.
  # NOTE(review): the description says "for over 5 minutes" but the rule has
  # no "for:" clause, so it fires on the first evaluation above the
  # threshold - confirm whether a "for: 5m" was intended.
  - alert: mysql_innodb_log_waits
    expr: |
      rate(mysql_global_status_innodb_log_waits[5m]) > 10
    labels:
      severity: critical
    annotations:
      description: 'The MySQL InnoDB logs are waiting for disk at a rate of {{ printf "%.2f" $value }} per second for over 5 minutes'
      summary: 'MySQL InnoDB log waits'
# Clock-drift alerts based on node_ntp_drift_seconds. Evaluated every
# 15 seconds. Both rules share the alert name "ntp_drifting" and differ only
# in threshold and severity label.
- name: ntp
  interval: 15s
  rules:
  # Critical: drift magnitude above 50 ms for one minute. abs() is applied so
  # that drift in either direction is caught.
  # NOTE(review): assumes the gauge is a signed offset - confirm against the
  # exporter version in use.
  - alert: ntp_drifting
    expr: |
      abs(node_ntp_drift_seconds) > 0.05
    for: 1m
    labels:
      severity: critical
    annotations:
      description: 'The NTP drifting has been too high for over 1 minute'
      summary: 'NTP drifting too high'
  # Moderate: drift magnitude above 10 ms for one minute. This rule also
  # matches whenever the critical rule above does.
  - alert: ntp_drifting
    expr: |
      abs(node_ntp_drift_seconds) > 0.01
    for: 1m
    labels:
      severity: moderate
    annotations:
      description: 'The NTP has been drifting for over 1 minute'
      summary: 'NTP drift'
# vi:ft=yaml:nowrap:
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment