Skip to content

Instantly share code, notes, and snippets.

@liuerfire
Created May 20, 2021 10:45
Show Gist options
  • Save liuerfire/9170c367c95e25a41a344203f979e1c6 to your computer and use it in GitHub Desktop.
Save liuerfire/9170c367c95e25a41a344203f979e1c6 to your computer and use it in GitHub Desktop.
{
prometheusAlerts+:: {
groups+: [
{
name: 'node-exporter',
rules: [
{
  // Warns when available memory stays below 10% of total memory for 2 minutes.
  alert: 'HostOutOfMemory',
  'for': '2m',
  expr: 'node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10',
  labels: { severity: 'warning' },
  annotations: {
    description: 'Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}',
    summary: 'Host out of memory (instance {{ $labels.instance }})',
  },
},
{
  // Warns when the per-second rate of major page faults (1m window) stays
  // above 1000 for 2 minutes.
  alert: 'HostMemoryUnderMemoryPressure',
  'for': '2m',
  expr: 'rate(node_vmstat_pgmajfault[1m]) > 1000',
  labels: { severity: 'warning' },
  annotations: {
    description: 'The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS = {{ $labels }}',
    summary: 'Host memory under memory pressure (instance {{ $labels.instance }})',
  },
},
{
  // Warns when the per-instance receive rate, summed over all series and
  // divided by 1024 twice (bytes -> MiB), stays above 100 for 5 minutes.
  alert: 'HostUnusualNetworkThroughputIn',
  'for': '5m',
  expr: 'sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100',
  labels: { severity: 'warning' },
  annotations: {
    summary: 'Host unusual network throughput in (instance {{ $labels.instance }})',
    // NOTE(review): unlike the other rules in this group, this description has
    // no "\n" separators before VALUE / LABELS - confirm whether that is intended.
    description: 'Host network interfaces are probably receiving too much data (> 100 MB/s) VALUE = {{ $value }} LABELS = {{ $labels }}',
  },
},
{
  // Warns when the per-instance transmit rate, summed over all series and
  // divided by 1024 twice (bytes -> MiB), stays above 100 for 5 minutes.
  alert: 'HostUnusualNetworkThroughputOut',
  'for': '5m',
  expr: 'sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100',
  labels: { severity: 'warning' },
  annotations: {
    description: 'Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}',
    summary: 'Host unusual network throughput out (instance {{ $labels.instance }})',
  },
},
{
  // Warns when the per-instance disk read rate (2m window, bytes divided by
  // 1024 twice) stays above 50 for 5 minutes.
  alert: 'HostUnusualDiskReadRate',
  'for': '5m',
  expr: 'sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50',
  labels: { severity: 'warning' },
  annotations: {
    description: 'Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}',
    summary: 'Host unusual disk read rate (instance {{ $labels.instance }})',
  },
},
{
  // Warns when the per-instance disk write rate (2m window, bytes divided by
  // 1024 twice) stays above 50 for 2 minutes.
  alert: 'HostUnusualDiskWriteRate',
  'for': '2m',
  expr: 'sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50',
  labels: { severity: 'warning' },
  annotations: {
    description: 'Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}',
    summary: 'Host unusual disk write rate (instance {{ $labels.instance }})',
  },
},
{
  // Warns when a filesystem has less than 10% of its size available for
  // 2 minutes; filesystems reported as read-only are excluded by the
  // `node_filesystem_readonly == 0` clause.
  alert: 'HostOutOfDiskSpace',
  'for': '2m',
  expr: '(node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0',
  labels: { severity: 'warning' },
  annotations: {
    description: 'Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}',
    summary: 'Host out of disk space (instance {{ $labels.instance }})',
  },
},
{
  // Warns when all three conditions hold for 2 minutes:
  //   1. less than 10% of the filesystem is available,
  //   2. a linear projection of the last hour of available bytes (non-tmpfs)
  //      goes below zero 24 * 3600 seconds ahead,
  //   3. the filesystem is not read-only.
  alert: 'HostDiskWillFillIn24Hours',
  'for': '2m',
  expr: '(node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0',
  labels: { severity: 'warning' },
  annotations: {
    description: 'Filesystem is predicted to run out of space within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}',
    summary: 'Host disk will fill in 24 hours (instance {{ $labels.instance }})',
  },
},
{
  // Warns when fewer than 10% of inodes are free on the "/rootfs" mountpoint
  // for 2 minutes and that filesystem is not read-only.
  // NOTE(review): the selector is hard-coded to mountpoint="/rootfs" -
  // confirm this matches how node-exporter mounts the host root here.
  alert: 'HostOutOfInodes',
  'for': '2m',
  expr: 'node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0',
  labels: { severity: 'warning' },
  annotations: {
    description: 'Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}',
    summary: 'Host out of inodes (instance {{ $labels.instance }})',
  },
},
{
  // Warns when, on the "/rootfs" mountpoint and for 2 minutes, fewer than 10%
  // of inodes are free, a linear projection of the last hour of free inodes
  // goes below zero 24 * 3600 seconds ahead, and the filesystem is not
  // read-only.
  // NOTE(review): the selector is hard-coded to mountpoint="/rootfs" -
  // confirm this matches how node-exporter mounts the host root here.
  alert: 'HostInodesWillFillIn24Hours',
  'for': '2m',
  expr: 'node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{mountpoint="/rootfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0',
  labels: { severity: 'warning' },
  annotations: {
    description: 'Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}',
    summary: 'Host inodes will fill in 24 hours (instance {{ $labels.instance }})',
  },
},
{
  // Warns when the average time per completed read (read seconds divided by
  // reads completed, both as 1m rates) stays above 0.1 s for 2 minutes.
  // The trailing `> 0` clause restricts the alert to disks that actually
  // completed reads in the window.
  alert: 'HostUnusualDiskReadLatency',
  'for': '2m',
  expr: 'rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0',
  labels: { severity: 'warning' },
  annotations: {
    description: 'Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}',
    summary: 'Host unusual disk read latency (instance {{ $labels.instance }})',
  },
},
{
  // Warns when the average time per completed write (write seconds divided by
  // writes completed, both as 1m rates) stays above 0.1 s for 2 minutes.
  // The trailing `> 0` clause restricts the alert to disks that actually
  // completed writes in the window.
  alert: 'HostUnusualDiskWriteLatency',
  'for': '2m',
  expr: 'rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0',
  labels: { severity: 'warning' },
  annotations: {
    description: 'Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}',
    summary: 'Host unusual disk write latency (instance {{ $labels.instance }})',
  },
},
{
  // Warns when per-instance CPU usage, computed as 100 minus the average
  // idle-mode percentage over a 2m rate window, exceeds 80.
  // 'for' is '0m', so no additional hold time is configured.
  alert: 'HostHighCpuLoad',
  'for': '0m',
  expr: '100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80',
  labels: { severity: 'warning' },
  annotations: {
    description: 'CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}',
    summary: 'Host high CPU load (instance {{ $labels.instance }})',
  },
},
{
  // Warns when the OOM-kill counter increased within the last minute.
  // 'for' is '0m', so no additional hold time is configured.
  alert: 'HostOomKillDetected',
  'for': '0m',
  expr: 'increase(node_vmstat_oom_kill[1m]) > 0',
  labels: { severity: 'warning' },
  annotations: {
    description: 'OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}',
    summary: 'Host OOM kill detected (instance {{ $labels.instance }})',
  },
},
{
  // Warns when, for 2 minutes, more than 1% of received packets on an
  // interface are errors (receive-error rate divided by receive-packet rate,
  // both over a 2m window).
  alert: 'HostNetworkReceiveErrors',
  expr: 'rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01',
  'for': '2m',
  labels: {
    severity: 'warning',
  },
  annotations: {
    summary: 'Host Network Receive Errors (instance {{ $labels.instance }})',
    // $value is an error/packet ratio over a 2m window, not an error count
    // over five minutes, so it is rendered as a percentage and the text
    // states the window actually used by expr.
    description: 'Host {{ $labels.instance }} interface {{ $labels.device }} has a receive error ratio of {{ $value | humanizePercentage }} over the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}',
  },
},
{
  // Warns when, for 2 minutes, more than 1% of transmitted packets on an
  // interface are errors (transmit-error rate divided by transmit-packet
  // rate, both over a 2m window).
  alert: 'HostNetworkTransmitErrors',
  expr: 'rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01',
  'for': '2m',
  labels: {
    severity: 'warning',
  },
  annotations: {
    summary: 'Host Network Transmit Errors (instance {{ $labels.instance }})',
    // $value is an error/packet ratio over a 2m window, not an error count
    // over five minutes, so it is rendered as a percentage and the text
    // states the window actually used by expr.
    description: 'Host {{ $labels.instance }} interface {{ $labels.device }} has a transmit error ratio of {{ $value | humanizePercentage }} over the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}',
  },
},
{
  // Warns when the number of conntrack entries stays above 80% of the
  // configured conntrack limit for 5 minutes.
  alert: 'HostConntrackLimit',
  expr: 'node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8',
  'for': '5m',
  labels: {
    severity: 'warning',
  },
  annotations: {
    summary: 'Host conntrack limit (instance {{ $labels.instance }})',
    // Fixed the "approching" typo in the user-facing message.
    description: 'The number of conntrack entries is approaching the limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}',
  },
},
],
},
],
},
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment