Prometheus Rules and Alerts
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  generation: 1
  labels:
    app: prometheus-operator
    chart: prometheus-operator-0.1.26
    heritage: Tiller
    release: prom
  name: pod-node-rules
  namespace: default
spec:
  groups:
  - name: pod-cpu.rules
    rules:
    - expr: |
        sum(rate(container_cpu_usage_seconds_total{job!="", image!="", container_name!=""}[5m])) by (namespace)
      record: namespace:pod_cpu_usage_seconds_total:sum_rate
    - expr: |
        sum by (namespace, pod_name, container_name) (
          rate(container_cpu_usage_seconds_total{job!="", image!="", container_name!=""}[5m])
        )
      record: namespace_pod_name_container_name:pod_cpu_usage_seconds_total:sum_rate
    - expr: |
        sum(container_memory_usage_bytes{job!="", image!="", container_name!=""}) by (namespace)
      record: namespace:pod_memory_usage_bytes:sum
    - expr: |
        sum by (namespace, label_name) (
          sum(rate(container_cpu_usage_seconds_total{job!="", image!="", container_name!=""}[5m])) by (namespace, pod_name)
          * on (namespace, pod_name) group_left(label_name)
          label_replace(kube_pod_labels{job!=""}, "pod_name", "$1", "pod", "(.*)")
        )
      record: namespace_name:pod_cpu_usage_seconds_total:sum_rate
    - expr: |
        sum by (namespace, label_name) (
          sum(container_memory_usage_bytes{job!="", image!="", container_name!=""}) by (pod_name, namespace)
          * on (namespace, pod_name) group_left(label_name)
          label_replace(kube_pod_labels{job!=""}, "pod_name", "$1", "pod", "(.*)")
        )
      record: namespace_name:pod_memory_usage_bytes:sum
    - expr: |
        sum by (namespace, label_name) (
          sum(kube_pod_container_resource_requests_memory_bytes{job!=""}) by (namespace, pod)
          * on (namespace, pod) group_left(label_name)
          label_replace(kube_pod_labels{job!=""}, "pod_name", "$1", "pod", "(.*)")
        )
      record: namespace_name:pod_container_resource_requests_memory_bytes:sum
    - expr: |
        sum by (namespace, label_name) (
          sum(kube_pod_container_resource_requests_cpu_cores{job!=""} and on(pod) kube_pod_status_scheduled{condition="true"}) by (namespace, pod)
          * on (namespace, pod) group_left(label_name)
          label_replace(kube_pod_labels{job!=""}, "pod_name", "$1", "pod", "(.*)")
        )
      record: namespace_name:pod_container_resource_requests_cpu_cores:sum
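    # The records above precompute namespace- and pod-level CPU/memory
    # aggregates so dashboards can read one cheap series instead of
    # re-aggregating raw cAdvisor metrics on every refresh. A minimal sketch
    # of consumer queries (hypothetical, not part of this manifest):
    #   topk(5, namespace:pod_cpu_usage_seconds_total:sum_rate)
    #   namespace_pod_name_container_name:pod_cpu_usage_seconds_total:sum_rate{namespace="default"}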
  - name: node.rules
    rules:
    - expr: sum(min(kube_pod_info) by (node))
      record: ':kube_pod_info_node_count:'
    - expr: |
        max(label_replace(kube_pod_info{job="kube-state-metrics"}, "pod", "$1", "pod", "(.*)")) by (node, namespace, pod)
      record: 'node_namespace_pod:kube_pod_info:'
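    # node_namespace_pod:kube_pod_info: carries the (node, namespace, pod)
    # mapping from kube-state-metrics. The rules below join it onto
    # node-exporter series with "* on (namespace, pod) group_left(node)",
    # which copies the node label onto each matched sample. Sketch of the
    # pattern, assuming any metric m labelled with namespace and pod:
    #   m * on (namespace, pod) group_left(node) node_namespace_pod:kube_pod_info: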
    - expr: |
        count by (node) (sum by (node, cpu) (
          node_cpu{job="node-exporter"}
          * on (namespace, pod) group_left(node)
          node_namespace_pod:kube_pod_info:
        ))
      record: node:node_num_cpu:sum
    - expr: |
        1 - avg(rate(node_cpu{job="node-exporter",mode="idle"}[1m]))
      record: :node_cpu_utilisation:avg1m
    - expr: |
        1 - avg by (node) (
          rate(node_cpu{job="node-exporter",mode="idle"}[1m])
          * on (namespace, pod) group_left(node)
          node_namespace_pod:kube_pod_info:)
      record: node:node_cpu_utilisation:avg1m
    - expr: |
        sum(node_load1{job="node-exporter"})
        /
        sum(node:node_num_cpu:sum)
      record: ':node_cpu_saturation_load1:'
    - expr: |
        sum by (node) (
          node_load1{job="node-exporter"}
          * on (namespace, pod) group_left(node)
          node_namespace_pod:kube_pod_info:
        )
        /
        node:node_num_cpu:sum
      record: 'node:node_cpu_saturation_load1:'
    - expr: |
        1 -
        sum(node_memory_MemFree{job="node-exporter"} + node_memory_Cached{job="node-exporter"} + node_memory_Buffers{job="node-exporter"})
        /
        sum(node_memory_MemTotal{job="node-exporter"})
      record: ':node_memory_utilisation:'
    - expr: |
        sum(node_memory_MemFree{job="node-exporter"} + node_memory_Cached{job="node-exporter"} + node_memory_Buffers{job="node-exporter"})
      record: :node_memory_MemFreeCachedBuffers:sum
    - expr: |
        sum(node_memory_MemTotal{job="node-exporter"})
      record: :node_memory_MemTotal:sum
    - expr: |
        sum by (node) (
          (node_memory_MemFree{job="node-exporter"} + node_memory_Cached{job="node-exporter"} + node_memory_Buffers{job="node-exporter"})
          * on (namespace, pod) group_left(node)
          node_namespace_pod:kube_pod_info:
        )
      record: node:node_memory_bytes_available:sum
    - expr: |
        sum by (node) (
          node_memory_MemTotal{job="node-exporter"}
          * on (namespace, pod) group_left(node)
          node_namespace_pod:kube_pod_info:
        )
      record: node:node_memory_bytes_total:sum
    - expr: |
        (node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum)
        /
        scalar(sum(node:node_memory_bytes_total:sum))
      record: node:node_memory_utilisation:ratio
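    # Note the scalar() in the rule above: each node's used bytes are divided
    # by the cluster-wide total, so this record is the node's share of all
    # cluster memory, not the node's own utilisation (that is the
    # node:node_memory_utilisation: record further below).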
    - expr: |
        1e3 * sum(
          (rate(node_vmstat_pgpgin{job="node-exporter"}[1m])
          + rate(node_vmstat_pgpgout{job="node-exporter"}[1m]))
        )
      record: :node_memory_swap_io_bytes:sum_rate
    - expr: |
        1 -
        sum by (node) (
          (node_memory_MemFree{job="node-exporter"} + node_memory_Cached{job="node-exporter"} + node_memory_Buffers{job="node-exporter"})
          * on (namespace, pod) group_left(node)
          node_namespace_pod:kube_pod_info:
        )
        /
        sum by (node) (
          node_memory_MemTotal{job="node-exporter"}
          * on (namespace, pod) group_left(node)
          node_namespace_pod:kube_pod_info:
        )
      record: 'node:node_memory_utilisation:'
    - expr: |
        1 - (node:node_memory_bytes_available:sum / node:node_memory_bytes_total:sum)
      record: 'node:node_memory_utilisation_2:'
    - expr: |
        1e3 * sum by (node) (
          (rate(node_vmstat_pgpgin{job="node-exporter"}[1m])
          + rate(node_vmstat_pgpgout{job="node-exporter"}[1m]))
          * on (namespace, pod) group_left(node)
          node_namespace_pod:kube_pod_info:
        )
      record: node:node_memory_swap_io_bytes:sum_rate
    - expr: |
        avg(irate(node_disk_io_time_ms{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m]) / 1e3)
      record: :node_disk_utilisation:avg_irate
    - expr: |
        avg by (node) (
          irate(node_disk_io_time_ms{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m]) / 1e3
          * on (namespace, pod) group_left(node)
          node_namespace_pod:kube_pod_info:
        )
      record: node:node_disk_utilisation:avg_irate
    - expr: |
        avg(irate(node_disk_io_time_weighted{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m]) / 1e3)
      record: :node_disk_saturation:avg_irate
    - expr: |
        avg by (node) (
          irate(node_disk_io_time_weighted{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m]) / 1e3
          * on (namespace, pod) group_left(node)
          node_namespace_pod:kube_pod_info:
        )
      record: node:node_disk_saturation:avg_irate
    - expr: |
        max by (namespace, pod, device) ((node_filesystem_size{fstype=~"ext[234]|btrfs|xfs|zfs"}
        - node_filesystem_avail{fstype=~"ext[234]|btrfs|xfs|zfs"})
        / node_filesystem_size{fstype=~"ext[234]|btrfs|xfs|zfs"})
      record: 'node:node_filesystem_usage:'
    - expr: |
        max by (namespace, pod, device) (node_filesystem_avail{fstype=~"ext[234]|btrfs|xfs|zfs"} / node_filesystem_size{fstype=~"ext[234]|btrfs|xfs|zfs"})
      record: 'node:node_filesystem_avail:'
    - expr: |
        sum(irate(node_network_receive_bytes{job="node-exporter",device="eth0"}[1m])) +
        sum(irate(node_network_transmit_bytes{job="node-exporter",device="eth0"}[1m]))
      record: :node_net_utilisation:sum_irate
    - expr: |
        sum by (node) (
          (irate(node_network_receive_bytes{job="node-exporter",device="eth0"}[1m]) +
          irate(node_network_transmit_bytes{job="node-exporter",device="eth0"}[1m]))
          * on (namespace, pod) group_left(node)
          node_namespace_pod:kube_pod_info:
        )
      record: node:node_net_utilisation:sum_irate
    - expr: |
        sum(irate(node_network_receive_drop{job="node-exporter",device="eth0"}[1m])) +
        sum(irate(node_network_transmit_drop{job="node-exporter",device="eth0"}[1m]))
      record: :node_net_saturation:sum_irate
    - expr: |
        sum by (node) (
          (irate(node_network_receive_drop{job="node-exporter",device="eth0"}[1m]) +
          irate(node_network_transmit_drop{job="node-exporter",device="eth0"}[1m]))
          * on (namespace, pod) group_left(node)
          node_namespace_pod:kube_pod_info:
        )
      record: node:node_net_saturation:sum_irate
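    # The node.rules records follow the utilisation/saturation naming of the
    # kubernetes-mixin. A dashboard panel would typically query a record
    # directly, e.g. (hypothetical panel queries; "worker-1" is a placeholder
    # node name):
    #   node:node_cpu_utilisation:avg1m{node="worker-1"}
    #   sort_desc(node:node_memory_utilisation:)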
  - name: pod-apps
    rules:
    - alert: K8SPodCrashLooping
      annotations:
        message: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container
          }}) is restarting {{ printf "%.2f" $value }} times / second.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping
      expr: |
        rate(kube_pod_container_status_restarts_total{job!=""}[15m]) > 0
      for: 1h
      labels:
        severity: critical
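    # How this fires: the 15m restart rate must stay above zero for the whole
    # "for: 1h" window. Rough worked example with assumed numbers: a pod in
    # CrashLoopBackOff restarting every ~5 minutes produces 12 restarts/hour,
    # i.e. about 0.0033 restarts/second, which "%.2f" renders as "0.00", so a
    # higher printf precision may be worth considering.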
    - alert: K8SPodNotReady
      annotations:
        message: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready
          state for longer than an hour.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready
      expr: |
        sum by (namespace, pod) (kube_pod_status_phase{job!="", phase=~"Pending|Unknown"}) > 0
      for: 1h
      labels:
        severity: critical
    - alert: K8SDeploymentGenerationMismatch
      annotations:
        message: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment
          }} does not match; this indicates that the Deployment has failed but has
          not been rolled back.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentgenerationmismatch
      expr: |
        kube_deployment_status_observed_generation{job!=""}
        !=
        kube_deployment_metadata_generation{job!=""}
      for: 15m
      labels:
        severity: critical
    - alert: K8SDeploymentReplicasMismatch
      annotations:
        message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not
          matched the expected number of replicas for longer than an hour.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch
      expr: |
        kube_deployment_spec_replicas{job!=""}
        !=
        kube_deployment_status_replicas_available{job!=""}
      for: 1h
      labels:
        severity: critical
    - alert: K8SKubeStatefulSetReplicasMismatch
      annotations:
        message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has
          not matched the expected number of replicas for longer than 15 minutes.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch
      expr: |
        kube_statefulset_status_replicas_ready{job!=""}
        !=
        kube_statefulset_status_replicas{job!=""}
      for: 15m
      labels:
        severity: critical
    - alert: K8SStatefulSetGenerationMismatch
      annotations:
        message: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset
          }} does not match; this indicates that the StatefulSet has failed but has
          not been rolled back.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetgenerationmismatch
      expr: |
        kube_statefulset_status_observed_generation{job!=""}
        !=
        kube_statefulset_metadata_generation{job!=""}
      for: 15m
      labels:
        severity: critical
    - alert: K8SStatefulSetUpdateNotRolledOut
      annotations:
        message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update
          has not been rolled out.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetupdatenotrolledout
      expr: |
        max without (revision) (
          kube_statefulset_status_current_revision{job!=""}
            unless
          kube_statefulset_status_update_revision{job!=""}
        )
        *
        (
          kube_statefulset_replicas{job!=""}
          !=
          kube_statefulset_status_replicas_updated{job!=""}
        )
      for: 15m
      labels:
        severity: critical
    - alert: K8SDaemonSetRolloutStuck
      annotations:
        message: Only {{ $value }}% of the desired Pods of DaemonSet {{ $labels.namespace
          }}/{{ $labels.daemonset }} are scheduled and ready.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck
      expr: |
        kube_daemonset_status_number_ready{job!=""}
        /
        kube_daemonset_status_desired_number_scheduled{job!=""} * 100 < 100
      for: 15m
      labels:
        severity: critical
    - alert: K8SDaemonSetNotScheduled
      annotations:
        message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
          }} are not scheduled.'
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetnotscheduled
      expr: |
        kube_daemonset_status_desired_number_scheduled{job!=""}
        -
        kube_daemonset_status_current_number_scheduled{job!=""} > 0
      for: 10m
      labels:
        severity: warning
    - alert: K8SDaemonSetMisScheduled
      annotations:
        message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
          }} are running where they are not supposed to run.'
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled
      expr: |
        kube_daemonset_status_number_misscheduled{job!=""} > 0
      for: 10m
      labels:
        severity: warning
    - alert: K8SCronJobRunning
      annotations:
        message: CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more
          than 1h to complete.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecronjobrunning
      expr: |
        time() - kube_cronjob_next_schedule_time{job!=""} > 3600
      for: 1h
      labels:
        severity: warning
    - alert: K8SJobCompletion
      annotations:
        message: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more
          than one hour to complete.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobcompletion
      expr: |
        kube_job_spec_completions{job!=""} - kube_job_status_succeeded{job!=""} > 0
      for: 1h
      labels:
        severity: warning
    - alert: K8SJobFailed
      annotations:
        message: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobfailed
      expr: |
        kube_job_status_failed{job!=""} > 0
      for: 1h
      labels:
        severity: warning
  - name: k8s-resources
    rules:
    - alert: K8SCPUOvercommit
      annotations:
        message: Cluster has overcommitted CPU resource requests for Pods and cannot
          tolerate node failure.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit
      expr: |
        sum(namespace_name:pod_container_resource_requests_cpu_cores:sum)
        /
        sum(node:node_num_cpu:sum)
        >
        (count(node:node_num_cpu:sum)-1) / count(node:node_num_cpu:sum)
      for: 5m
      labels:
        severity: warning
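    # The threshold (count-1)/count is the share of capacity left after losing
    # one node. Worked example with assumed numbers: on a 4-node cluster the
    # alert fires once requested cores exceed 3/4 = 75% of total cores, i.e.
    # the point where the workload could no longer be rescheduled after a
    # single node failure.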
    - alert: K8SMemOvercommit
      annotations:
        message: Cluster has overcommitted memory resource requests for Pods and cannot
          tolerate node failure.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit
      expr: |
        sum(namespace_name:pod_container_resource_requests_memory_bytes:sum)
        /
        sum(node_memory_MemTotal)
        >
        (count(node:node_num_cpu:sum)-1)
        /
        count(node:node_num_cpu:sum)
      for: 5m
      labels:
        severity: warning
    - alert: K8SCPUOvercommit
      annotations:
        message: Cluster has overcommitted CPU resource requests for Namespaces.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit
      expr: |
        sum(kube_resourcequota{job!="", type="hard", resource="requests.cpu"})
        /
        sum(node:node_num_cpu:sum)
        > 1.5
      for: 5m
      labels:
        severity: warning
    - alert: K8SMemOvercommit
      annotations:
        message: Cluster has overcommitted memory resource requests for Namespaces.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit
      expr: |
        sum(kube_resourcequota{job!="", type="hard", resource="requests.memory"})
        /
        sum(node_memory_MemTotal{job="node-exporter"})
        > 1.5
      for: 5m
      labels:
        severity: warning
    - alert: K8SQuotaExceeded
      annotations:
        message: Namespace {{ $labels.namespace }} is using {{ printf "%0.0f" $value
          }}% of its {{ $labels.resource }} quota.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaexceeded
      expr: |
        100 * kube_resourcequota{job!="", type="used"}
        / ignoring(instance, job, type)
        (kube_resourcequota{job!="", type="hard"} > 0)
        > 90
      for: 15m
      labels:
        severity: warning
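    # The division matches each "used" sample with its "hard" counterpart
    # while ignoring the labels that differ between the two series (instance,
    # job, type). Sketch with hypothetical samples: used{resource="pods"}=19
    # over hard{resource="pods"}=20 yields 95, which clears the > 90 threshold.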
  - name: kubernetes-storage
    rules:
    - alert: K8SPersistentVolumeUsageCritical
      annotations:
        message: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim
          }} in Namespace {{ $labels.namespace }} is only {{ printf "%0.0f" $value
          }}% free.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeusagecritical
      expr: |
        100 * kubelet_volume_stats_available_bytes{job!=""}
        /
        kubelet_volume_stats_capacity_bytes{job!=""}
        < 20
      for: 1m
      labels:
        severity: critical
    - alert: K8SPersistentVolumeFullInSevenDays
      annotations:
        message: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim
          }} in Namespace {{ $labels.namespace }} is expected to fill up within seven
          days. Currently {{ $value }} bytes are available.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefullinfourdays
      expr: |
        kubelet_volume_stats_available_bytes{job!=""} and predict_linear(kubelet_volume_stats_available_bytes{job!=""}[6h], 7 * 24 * 3600) < 0
      for: 5m
      labels:
        severity: critical
    - alert: K8SPersistentVolumeFullInThirtyDays
      annotations:
        message: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim
          }} in Namespace {{ $labels.namespace }} is expected to fill up within thirty
          days. Currently {{ $value }} bytes are available.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefullinfourdays
      expr: |
        kubelet_volume_stats_available_bytes{job!=""} and predict_linear(kubelet_volume_stats_available_bytes{job!=""}[6h], 30 * 24 * 3600) < 0
      for: 5m
      labels:
        severity: warning
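    # predict_linear fits a linear trend to the last 6h of available bytes and
    # extrapolates it 7 or 30 days ahead (7 * 24 * 3600 seconds, and so on);
    # the alert fires while the projected value is below zero. Sketch with
    # assumed numbers: a volume losing ~1 GiB/day with 20 GiB free goes
    # negative in ~20 days, so the thirty-day alert fires while the seven-day
    # one stays quiet.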
    - alert: K8SPersistentVolumeBeingDeleted
      annotations:
        message: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim
          }} appears to have been deleted.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefullinfourdays
      expr: count(kubelet_volume_stats_used_bytes{job!=""}) by (persistentvolumeclaim) < 0
      for: 5m
      labels:
        severity: critical
    - alert: K8SPersistentVolumeNonePresentInNamespace
      annotations:
        message: Based on recent sampling, no PersistentVolumeClaims are found in Namespace {{ $labels.namespace }}.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefullinfourdays
      expr: count(kubelet_volume_stats_used_bytes{job!=""}) by (namespace) <= 0
      for: 5m
      labels:
        severity: warning
  - name: k8s-system
    rules:
    - alert: K8SNodeNotReady
      annotations:
        message: '{{ $labels.node }} has been unready for more than an hour.'
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodenotready
      expr: |
        kube_node_status_condition{job!="",condition="Ready",status="true"} == 0
      for: 1h
      labels:
        severity: warning
    - alert: K8SVersionMismatch
      annotations:
        message: There are {{ $value }} different versions of Kubernetes components
          running.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch
      expr: |
        count(count(kubernetes_build_info{job!="kube-dns"}) by (gitVersion)) > 1
      for: 1h
      labels:
        severity: warning
    - alert: K8SClientErrors
      annotations:
        message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance
          }}' is experiencing {{ printf "%0.0f" $value }}% errors.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors
      expr: |
        (sum(rate(rest_client_requests_total{code!~"2..|404"}[5m])) by (instance, job)
        /
        sum(rate(rest_client_requests_total[5m])) by (instance, job))
        * 100 > 1
      for: 15m
      labels:
        severity: warning
    - alert: K8SClientErrors
      annotations:
        message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance
          }}' is experiencing {{ printf "%0.0f" $value }} errors / second.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors
      expr: |
        sum(rate(ksm_scrape_error_total{job!=""}[5m])) by (instance, job) > 0.1
      for: 15m
      labels:
        severity: warning
    - alert: K8STooManyPods
      annotations:
        message: Kubelet {{ $labels.instance }} is running {{ $value }} Pods, close
          to the limit of 110.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods
      expr: |
        kubelet_running_pod_count{job!=""} > 110 * 0.9
      for: 15m
      labels:
        severity: warning
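# To load these rules, apply the manifest to a cluster running the Prometheus
# Operator (assuming a configured kubectl context and an operator whose
# ruleSelector matches the "release: prom" label):
#   kubectl apply -f pod-node-rules.yaml
# To lint the expressions offline, copy spec.groups into a plain rule file
# (with "groups:" as the top-level key) and check it with promtool, which
# ships with Prometheus:
#   promtool check rules rules.yaml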