Created
April 2, 2019 23:17
-
-
Save Gangareddy/07f984003591f45ae42466b3246b2012 to your computer and use it in GitHub Desktop.
Prometheus Rules and Alerts
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# PrometheusRule custom resource consumed by the Prometheus Operator.
# Bundles recording rules (pod/node resource usage) and alerting rules
# (workload health, overcommit, storage, system) into one object.
# Fix: removed " | |" scrape residue that trailed every line and
# restored YAML indentation.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  generation: 1
  labels:
    app: prometheus-operator
    chart: prometheus-operator-0.1.26
    heritage: Tiller  # installed via Helm v2 (Tiller)
    release: prom
  name: pod-node-rules
  namespace: default
spec:
  groups:
- name: pod-cpu.rules | |
rules: | |
- expr: | | |
sum(rate(container_cpu_usage_seconds_total{job!="", image!="", container_name!=""}[5m])) by (namespace) | |
record: namespace:pod_cpu_usage_seconds_total:sum_rate | |
- expr: | | |
sum by (namespace, pod_name, container_name) ( | |
rate(container_cpu_usage_seconds_total{job!="", image!="", container_name!=""}[5m]) | |
) | |
record: namespace_pod_name_container_name:pod_cpu_usage_seconds_total:sum_rate | |
- expr: | | |
sum(container_memory_usage_bytes{job!="", image!="", container_name!=""}) by (namespace) | |
record: namespace:pod_memory_usage_bytes:sum | |
- expr: | | |
sum by (namespace, label_name) ( | |
sum(rate(container_cpu_usage_seconds_total{job!="", image!="", container_name!=""}[5m])) by (namespace, pod_name) | |
* on (namespace, pod_name) group_left(label_name) | |
label_replace(kube_pod_labels{job!=""}, "pod_name", "$1", "pod", "(.*)") | |
) | |
record: namespace_name:pod_cpu_usage_seconds_total:sum_rate | |
- expr: | | |
sum by (namespace, label_name) ( | |
sum(container_memory_usage_bytes{job!="",image!="", container_name!=""}) by (pod_name, namespace) | |
* on (namespace, pod_name) group_left(label_name) | |
label_replace(kube_pod_labels{job!=""}, "pod_name", "$1", "pod", "(.*)") | |
) | |
record: namespace_name:pod_memory_usage_bytes:sum | |
- expr: | | |
sum by (namespace, label_name) ( | |
sum(kube_pod_container_resource_requests_memory_bytes{job!=""}) by (namespace, pod) | |
* on (namespace, pod) group_left(label_name) | |
label_replace(kube_pod_labels{job!=""}, "pod_name", "$1", "pod", "(.*)") | |
) | |
record: namespace_name:pod_container_resource_requests_memory_bytes:sum | |
- expr: | | |
sum by (namespace, label_name) ( | |
sum(kube_pod_container_resource_requests_cpu_cores{job!=""} and on(pod) kube_pod_status_scheduled{condition="true"}) by (namespace, pod) | |
* on (namespace, pod) group_left(label_name) | |
label_replace(kube_pod_labels{job!=""}, "pod_name", "$1", "pod", "(.*)") | |
) | |
record: namespace_name:pod_container_resource_requests_cpu_cores:sum | |
- name: node.rules | |
rules: | |
- expr: sum(min(kube_pod_info) by (node)) | |
record: ':kube_pod_info_node_count:' | |
- expr: | | |
max(label_replace(kube_pod_info{job="kube-state-metrics"}, "pod", "$1", "pod", "(.*)")) by (node, namespace, pod) | |
record: 'node_namespace_pod:kube_pod_info:' | |
- expr: | | |
count by (node) (sum by (node, cpu) ( | |
node_cpu{job="node-exporter"} | |
* on (namespace, pod) group_left(node) | |
node_namespace_pod:kube_pod_info: | |
)) | |
record: node:node_num_cpu:sum | |
- expr: | | |
1 - avg(rate(node_cpu{job="node-exporter",mode="idle"}[1m])) | |
record: :node_cpu_utilisation:avg1m | |
- expr: | | |
1 - avg by (node) ( | |
rate(node_cpu{job="node-exporter",mode="idle"}[1m]) | |
* on (namespace, pod) group_left(node) | |
node_namespace_pod:kube_pod_info:) | |
record: node:node_cpu_utilisation:avg1m | |
- expr: | | |
sum(node_load1{job="node-exporter"}) | |
/ | |
sum(node:node_num_cpu:sum) | |
record: ':node_cpu_saturation_load1:' | |
- expr: | | |
sum by (node) ( | |
node_load1{job="node-exporter"} | |
* on (namespace, pod) group_left(node) | |
node_namespace_pod:kube_pod_info: | |
) | |
/ | |
node:node_num_cpu:sum | |
record: 'node:node_cpu_saturation_load1:' | |
- expr: | | |
1 - | |
sum(node_memory_MemFree{job="node-exporter"} + node_memory_Cached{job="node-exporter"} + node_memory_Buffers{job="node-exporter"}) | |
/ | |
sum(node_memory_MemTotal{job="node-exporter"}) | |
record: ':node_memory_utilisation:' | |
- expr: | | |
sum(node_memory_MemFree{job="node-exporter"} + node_memory_Cached{job="node-exporter"} + node_memory_Buffers{job="node-exporter"}) | |
record: :node_memory_MemFreeCachedBuffers:sum | |
- expr: | | |
sum(node_memory_MemTotal{job="node-exporter"}) | |
record: :node_memory_MemTotal:sum | |
- expr: | | |
sum by (node) ( | |
(node_memory_MemFree{job="node-exporter"} + node_memory_Cached{job="node-exporter"} + node_memory_Buffers{job="node-exporter"}) | |
* on (namespace, pod) group_left(node) | |
node_namespace_pod:kube_pod_info: | |
) | |
record: node:node_memory_bytes_available:sum | |
- expr: | | |
sum by (node) ( | |
node_memory_MemTotal{job="node-exporter"} | |
* on (namespace, pod) group_left(node) | |
node_namespace_pod:kube_pod_info: | |
) | |
record: node:node_memory_bytes_total:sum | |
- expr: | | |
(node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum) | |
/ | |
scalar(sum(node:node_memory_bytes_total:sum)) | |
record: node:node_memory_utilisation:ratio | |
- expr: | | |
1e3 * sum( | |
(rate(node_vmstat_pgpgin{job="node-exporter"}[1m]) | |
+ rate(node_vmstat_pgpgout{job="node-exporter"}[1m])) | |
) | |
record: :node_memory_swap_io_bytes:sum_rate | |
- expr: | | |
1 - | |
sum by (node) ( | |
(node_memory_MemFree{job="node-exporter"} + node_memory_Cached{job="node-exporter"} + node_memory_Buffers{job="node-exporter"}) | |
* on (namespace, pod) group_left(node) | |
node_namespace_pod:kube_pod_info: | |
) | |
/ | |
sum by (node) ( | |
node_memory_MemTotal{job="node-exporter"} | |
* on (namespace, pod) group_left(node) | |
node_namespace_pod:kube_pod_info: | |
) | |
record: 'node:node_memory_utilisation:' | |
- expr: | | |
1 - (node:node_memory_bytes_available:sum / node:node_memory_bytes_total:sum) | |
record: 'node:node_memory_utilisation_2:' | |
- expr: | | |
1e3 * sum by (node) ( | |
(rate(node_vmstat_pgpgin{job="node-exporter"}[1m]) | |
+ rate(node_vmstat_pgpgout{job="node-exporter"}[1m])) | |
* on (namespace, pod) group_left(node) | |
node_namespace_pod:kube_pod_info: | |
) | |
record: node:node_memory_swap_io_bytes:sum_rate | |
- expr: | | |
avg(irate(node_disk_io_time_ms{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m]) / 1e3) | |
record: :node_disk_utilisation:avg_irate | |
- expr: | | |
avg by (node) ( | |
irate(node_disk_io_time_ms{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m]) / 1e3 | |
* on (namespace, pod) group_left(node) | |
node_namespace_pod:kube_pod_info: | |
) | |
record: node:node_disk_utilisation:avg_irate | |
- expr: | | |
avg(irate(node_disk_io_time_weighted{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m]) / 1e3) | |
record: :node_disk_saturation:avg_irate | |
- expr: | | |
avg by (node) ( | |
irate(node_disk_io_time_weighted{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m]) / 1e3 | |
* on (namespace, pod) group_left(node) | |
node_namespace_pod:kube_pod_info: | |
) | |
record: node:node_disk_saturation:avg_irate | |
- expr: | | |
max by (namespace, pod, device) ((node_filesystem_size{fstype=~"ext[234]|btrfs|xfs|zfs"} | |
- node_filesystem_avail{fstype=~"ext[234]|btrfs|xfs|zfs"}) | |
/ node_filesystem_size{fstype=~"ext[234]|btrfs|xfs|zfs"}) | |
record: 'node:node_filesystem_usage:' | |
- expr: | | |
max by (namespace, pod, device) (node_filesystem_avail{fstype=~"ext[234]|btrfs|xfs|zfs"} / node_filesystem_size{fstype=~"ext[234]|btrfs|xfs|zfs"}) | |
record: 'node:node_filesystem_avail:' | |
- expr: | | |
sum(irate(node_network_receive_bytes{job="node-exporter",device="eth0"}[1m])) + | |
sum(irate(node_network_transmit_bytes{job="node-exporter",device="eth0"}[1m])) | |
record: :node_net_utilisation:sum_irate | |
- expr: | | |
sum by (node) ( | |
(irate(node_network_receive_bytes{job="node-exporter",device="eth0"}[1m]) + | |
irate(node_network_transmit_bytes{job="node-exporter",device="eth0"}[1m])) | |
* on (namespace, pod) group_left(node) | |
node_namespace_pod:kube_pod_info: | |
) | |
record: node:node_net_utilisation:sum_irate | |
- expr: | | |
sum(irate(node_network_receive_drop{job="node-exporter",device="eth0"}[1m])) + | |
sum(irate(node_network_transmit_drop{job="node-exporter",device="eth0"}[1m])) | |
record: :node_net_saturation:sum_irate | |
- expr: | | |
sum by (node) ( | |
(irate(node_network_receive_drop{job="node-exporter",device="eth0"}[1m]) + | |
irate(node_network_transmit_drop{job="node-exporter",device="eth0"}[1m])) | |
* on (namespace, pod) group_left(node) | |
node_namespace_pod:kube_pod_info: | |
) | |
record: node:node_net_saturation:sum_irate | |
- name: pod-apps | |
rules: | |
- alert: K8SPodCrashLooping | |
annotations: | |
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container | |
}}) is restarting {{ printf "%.2f" $value }} times / second. | |
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping | |
expr: | | |
rate(kube_pod_container_status_restarts_total{job!=""}[15m]) > 0 | |
for: 1h | |
labels: | |
severity: critical | |
- alert: K8SPodNotReady | |
annotations: | |
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready | |
state for longer than an hour. | |
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready | |
expr: | | |
sum by (namespace, pod) (kube_pod_status_phase{job!="", phase=~"Pending|Unknown"}) > 0 | |
for: 1h | |
labels: | |
severity: critical | |
- alert: K8SDeploymentGenerationMismatch | |
annotations: | |
message: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment | |
}} does not match, this indicates that the Deployment has failed but has | |
not been rolled back. | |
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentgenerationmismatch | |
expr: | | |
kube_deployment_status_observed_generation{job!=""} | |
!= | |
kube_deployment_metadata_generation{job!=""} | |
for: 15m | |
labels: | |
severity: critical | |
- alert: K8SDeploymentReplicasMismatch | |
annotations: | |
message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not | |
matched the expected number of replicas for longer than an hour. | |
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch | |
expr: | | |
kube_deployment_spec_replicas{job!=""} | |
!= | |
kube_deployment_status_replicas_available{job!=""} | |
for: 1h | |
labels: | |
severity: critical | |
- alert: K8SKubeStatefulSetReplicasMismatch | |
annotations: | |
message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has | |
not matched the expected number of replicas for longer than 15 minutes. | |
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch | |
expr: | | |
kube_statefulset_status_replicas_ready{job=""} | |
!= | |
kube_statefulset_status_replicas{job=""} | |
for: 15m | |
labels: | |
severity: critical | |
- alert: K8SStatefulSetGenerationMismatch | |
annotations: | |
message: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset | |
}} does not match, this indicates that the StatefulSet has failed but has | |
not been rolled back. | |
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetgenerationmismatch | |
expr: | | |
kube_statefulset_status_observed_generation{job!=""} | |
!= | |
kube_statefulset_metadata_generation{job!=""} | |
for: 15m | |
labels: | |
severity: critical | |
- alert: K8SStatefulSetUpdateNotRolledOut | |
annotations: | |
message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update | |
has not been rolled out. | |
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetupdatenotrolledout | |
expr: | | |
max without (revision) ( | |
kube_statefulset_status_current_revision{job!=""} | |
unless | |
kube_statefulset_status_update_revision{job!=""} | |
) | |
* | |
( | |
kube_statefulset_replicas{job!=""} | |
!= | |
kube_statefulset_status_replicas_updated{job!=""} | |
) | |
for: 15m | |
labels: | |
severity: critical | |
- alert: K8SDaemonSetRolloutStuck | |
annotations: | |
message: Only {{ $value }}% of the desired Pods of DaemonSet {{ $labels.namespace | |
}}/{{ $labels.daemonset }} are scheduled and ready. | |
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck | |
expr: | | |
kube_daemonset_status_number_ready{job!=""} | |
/ | |
kube_daemonset_status_desired_number_scheduled{job!=""} * 100 < 100 | |
for: 15m | |
labels: | |
severity: critical | |
- alert: K8SDaemonSetNotScheduled | |
annotations: | |
message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset | |
}} are not scheduled.' | |
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetnotscheduled | |
expr: | | |
kube_daemonset_status_desired_number_scheduled{job!=""} | |
- | |
kube_daemonset_status_current_number_scheduled{job!=""} > 0 | |
for: 10m | |
labels: | |
severity: warning | |
- alert: K8SDaemonSetMisScheduled | |
annotations: | |
message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset | |
}} are running where they are not supposed to run.' | |
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled | |
expr: | | |
kube_daemonset_status_number_misscheduled{job!=""} > 0 | |
for: 10m | |
labels: | |
severity: warning | |
- alert: K8SCronJobRunning | |
annotations: | |
message: CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more | |
than 1h to complete. | |
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecronjobrunning | |
expr: | | |
time() - kube_cronjob_next_schedule_time{job!=""} > 3600 | |
for: 1h | |
labels: | |
severity: warning | |
- alert: K8SJobCompletion | |
annotations: | |
message: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more | |
than one hour to complete. | |
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobcompletion | |
expr: | | |
kube_job_spec_completions{job!=""} - kube_job_status_succeeded{job!=""} > 0 | |
for: 1h | |
labels: | |
severity: warning | |
- alert: K8SJobFailed | |
annotations: | |
message: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete. | |
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobfailed | |
expr: | | |
kube_job_status_failed{job!=""} > 0 | |
for: 1h | |
labels: | |
severity: warning | |
- name: k8s-resources | |
rules: | |
- alert: K8SCPUOvercommit | |
annotations: | |
message: Cluster has overcommitted CPU resource requests for Pods and cannot | |
tolerate node failure. | |
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit | |
expr: | | |
sum(namespace_name:pod_container_resource_requests_cpu_cores:sum) | |
/ | |
sum(node:node_num_cpu:sum) | |
> | |
(count(node:node_num_cpu:sum)-1) / count(node:node_num_cpu:sum) | |
for: 5m | |
labels: | |
severity: warning | |
- alert: K8SMemOvercommit | |
annotations: | |
message: Cluster has overcommitted memory resource requests for Pods and cannot | |
tolerate node failure. | |
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit | |
expr: | | |
sum(namespace_name:pod_container_resource_requests_memory_bytes:sum) | |
/ | |
sum(node_memory_MemTotal) | |
> | |
(count(node:node_num_cpu:sum)-1) | |
/ | |
count(node:node_num_cpu:sum) | |
for: 5m | |
labels: | |
severity: warning | |
- alert: K8SCPUOvercommit | |
annotations: | |
message: Cluster has overcommitted CPU resource requests for Namespaces. | |
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit | |
expr: | | |
sum(kube_resourcequota{job!="", type="hard", resource="requests.cpu"}) | |
/ | |
sum(node:node_num_cpu:sum) | |
> 1.5 | |
for: 5m | |
labels: | |
severity: warning | |
- alert: K8SMemOvercommit | |
annotations: | |
message: Cluster has overcommitted memory resource requests for Namespaces. | |
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit | |
expr: | | |
sum(kube_resourcequota{job!="", type="hard", resource="requests.memory"}) | |
/ | |
sum(node_memory_MemTotal{job="node-exporter"}) | |
> 1.5 | |
for: 5m | |
labels: | |
severity: warning | |
- alert: K8SQuotaExceeded | |
annotations: | |
message: Namespace {{ $labels.namespace }} is using {{ printf "%0.0f" $value | |
}}% of its {{ $labels.resource }} quota. | |
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaexceeded | |
expr: | | |
100 * kube_resourcequota{job!="", type="used"} | |
/ ignoring(instance, job, type) | |
(kube_resourcequota{job!="", type="hard"} > 0) | |
> 90 | |
for: 15m | |
labels: | |
severity: warning | |
- name: kubernetes-storage | |
rules: | |
- alert: K8SPersistentVolumeUsageCritical | |
annotations: | |
message: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim | |
}} in Namespace {{ $labels.namespace }} is only {{ printf "%0.0f" $value | |
}}% free. | |
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeusagecritical | |
expr: | | |
100 * kubelet_volume_stats_available_bytes{job!=""} | |
/ | |
kubelet_volume_stats_capacity_bytes{job!=""} | |
< 20 | |
for: 1m | |
labels: | |
severity: critical | |
- alert: K8SPersistentVolumeFullInSevenDays | |
annotations: | |
message: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim | |
}} in Namespace {{ $labels.namespace }} is expected to fill up within four | |
days. Currently {{ $value }} bytes are available. | |
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefullinfourdays | |
expr: | | |
kubelet_volume_stats_available_bytes{job!=""} and predict_linear(kubelet_volume_stats_available_bytes{job!=""}[6h], 7 * 24 * 3600) < 0 | |
for: 5m | |
labels: | |
severity: critical | |
- alert: K8SPersistentVolumeFullInThirtyDays | |
annotations: | |
message: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim | |
}} in Namespace {{ $labels.namespace }} is expected to fill up within four | |
days. Currently {{ $value }} bytes are available. | |
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefullinfourdays | |
expr: | | |
kubelet_volume_stats_available_bytes{job!=""} and predict_linear(kubelet_volume_stats_available_bytes{job!=""}[6h], 30 * 24 * 3600) < 0 | |
for: 5m | |
labels: | |
severity: warning | |
- alert: K8SPersistentVolumeBeingDeleted | |
annotations: | |
message: Based on recent sampling, the PersistentVolume claimed are deleted in {{ $labels.namespace }} . | |
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefullinfourdays | |
expr: count(kubelet_volume_stats_used_bytes{job!=""}) by (persistentvolumeclaim) < 0 | |
for: 5m | |
labels: | |
severity: critical | |
- alert: K8SPersistentVolumeNonePresentinNamespace | |
annotations: | |
annotations: | |
message: Based on recent sampling, the PersistentVolume claimed are not found in namespace {{ $labels.namespace }} . | |
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefullinfourdays | |
expr: count(kubelet_volume_stats_used_bytes{job!=""}) by (namespace) <= 0 | |
for: 5m | |
labels: | |
severity: warning | |
- name: k8s-system | |
rules: | |
- alert: K8SNodeNotReady | |
annotations: | |
message: '{{ $labels.node }} has been unready for more than an hour.' | |
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodenotready | |
expr: | | |
kube_node_status_condition{job!="",condition="Ready",status="true"} == 0 | |
for: 1h | |
labels: | |
severity: warning | |
- alert: K8SVersionMismatch | |
annotations: | |
message: There are {{ $value }} different versions of Kubernetes components | |
running. | |
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch | |
expr: | | |
count(count(kubernetes_build_info{job!="kube-dns"}) by (gitVersion)) > 1 | |
for: 1h | |
labels: | |
severity: warning | |
- alert: K8SClientErrors | |
annotations: | |
message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance | |
}}' is experiencing {{ printf "%0.0f" $value }}% errors.' | |
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors | |
expr: | | |
(sum(rate(rest_client_requests_total{code!~"2..|404"}[5m])) by (instance, job) | |
/ | |
sum(rate(rest_client_requests_total[5m])) by (instance, job)) | |
* 100 > 1 | |
for: 15m | |
labels: | |
severity: warning | |
- alert: K8SClientErrors | |
annotations: | |
message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance}}' is experiencing {{ printf "%0.0f" $value }} errors / second. | |
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors | |
expr: | | |
sum(rate(ksm_scrape_error_total{job!=""}[5m])) by (instance, job) > 0.1 | |
for: 15m | |
labels: | |
severity: warning | |
- alert: K8STooManyPods | |
annotations: | |
message: Kubelet {{ $labels.instance }} is running {{ $value }} Pods, close | |
to the limit of 110. | |
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods | |
expr: | | |
kubelet_running_pod_count{job!=""} > 110 * 0.9 | |
for: 15m | |
labels: | |
severity: warning |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment