Kubernetes PrometheusRule without the KubeScheduler and KubeController rule groups, and with a 'namespace: monitoring' label added to every alert so that a custom Alertmanager receiver can match and receive the alerts (see the route sketch below).
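For reference, a matching route in the Alertmanager configuration could look like the minimal sketch below; the receiver names and the webhook URL are placeholders and not part of this gist. The route simply keys off the 'namespace: monitoring' label that this rule file puts on every alert.

route:
  receiver: default
  routes:
  - match:
      namespace: monitoring   # label added to every alert in this PrometheusRule
    receiver: custom-receiver
receivers:
- name: default               # null receiver (placeholder)
- name: custom-receiver
  webhook_configs:
  - url: http://alert-handler.example/webhook   # placeholder endpoint (assumption)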
apiVersion: monitoring.coreos.com/v1 | |
kind: PrometheusRule | |
metadata: | |
labels: | |
app.kubernetes.io/name: kube-prometheus | |
app.kubernetes.io/part-of: kube-prometheus | |
prometheus: k8s | |
role: alert-rules | |
name: kubernetes-monitoring-rules | |
namespace: monitoring | |
spec: | |
groups: | |
- name: kubernetes-apps | |
rules: | |
- alert: KubePodCrashLooping | |
annotations: | |
description: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container | |
}}) is restarting {{ printf "%.2f" $value }} times / 10 minutes. | |
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubepodcrashlooping | |
summary: Pod is crash looping. | |
expr: | | |
rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[10m]) * 60 * 5 > 0 | |
for: 15m | |
labels: | |
severity: warning | |
namespace: monitoring | |
- alert: KubePodNotReady | |
annotations: | |
description: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready | |
state for longer than 15 minutes. | |
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubepodnotready | |
summary: Pod has been in a non-ready state for more than 15 minutes. | |
expr: | | |
sum by (namespace, pod) ( | |
max by(namespace, pod) ( | |
kube_pod_status_phase{job="kube-state-metrics", phase=~"Pending|Unknown"} | |
) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) ( | |
1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"}) | |
) | |
) > 0 | |
for: 15m | |
labels: | |
severity: warning | |
namespace: monitoring | |
- alert: KubeDeploymentGenerationMismatch | |
annotations: | |
description: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment | |
          }} does not match; this indicates that the Deployment has failed but has
not been rolled back. | |
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubedeploymentgenerationmismatch | |
summary: Deployment generation mismatch due to possible roll-back | |
expr: | | |
kube_deployment_status_observed_generation{job="kube-state-metrics"} | |
!= | |
kube_deployment_metadata_generation{job="kube-state-metrics"} | |
for: 15m | |
labels: | |
severity: warning | |
namespace: monitoring | |
- alert: KubeDeploymentReplicasMismatch | |
annotations: | |
description: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has | |
not matched the expected number of replicas for longer than 15 minutes. | |
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubedeploymentreplicasmismatch | |
summary: Deployment has not matched the expected number of replicas. | |
expr: | | |
( | |
kube_deployment_spec_replicas{job="kube-state-metrics"} | |
!= | |
kube_deployment_status_replicas_available{job="kube-state-metrics"} | |
) and ( | |
changes(kube_deployment_status_replicas_updated{job="kube-state-metrics"}[10m]) | |
== | |
0 | |
) | |
for: 15m | |
labels: | |
severity: warning | |
namespace: monitoring | |
- alert: KubeStatefulSetReplicasMismatch | |
annotations: | |
description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} | |
has not matched the expected number of replicas for longer than 15 minutes. | |
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubestatefulsetreplicasmismatch | |
        summary: StatefulSet has not matched the expected number of replicas.
expr: | | |
( | |
kube_statefulset_status_replicas_ready{job="kube-state-metrics"} | |
!= | |
kube_statefulset_status_replicas{job="kube-state-metrics"} | |
) and ( | |
changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics"}[10m]) | |
== | |
0 | |
) | |
for: 15m | |
labels: | |
severity: warning | |
namespace: monitoring | |
- alert: KubeStatefulSetGenerationMismatch | |
annotations: | |
description: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset | |
          }} does not match; this indicates that the StatefulSet has failed but has
not been rolled back. | |
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubestatefulsetgenerationmismatch | |
summary: StatefulSet generation mismatch due to possible roll-back | |
expr: | | |
kube_statefulset_status_observed_generation{job="kube-state-metrics"} | |
!= | |
kube_statefulset_metadata_generation{job="kube-state-metrics"} | |
for: 15m | |
labels: | |
severity: warning | |
namespace: monitoring | |
- alert: KubeStatefulSetUpdateNotRolledOut | |
annotations: | |
description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} | |
update has not been rolled out. | |
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubestatefulsetupdatenotrolledout | |
summary: StatefulSet update has not been rolled out. | |
expr: | | |
( | |
max without (revision) ( | |
kube_statefulset_status_current_revision{job="kube-state-metrics"} | |
unless | |
kube_statefulset_status_update_revision{job="kube-state-metrics"} | |
) | |
* | |
( | |
kube_statefulset_replicas{job="kube-state-metrics"} | |
!= | |
kube_statefulset_status_replicas_updated{job="kube-state-metrics"} | |
) | |
) and ( | |
changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics"}[5m]) | |
== | |
0 | |
) | |
for: 15m | |
labels: | |
severity: warning | |
namespace: monitoring | |
- alert: KubeDaemonSetRolloutStuck | |
annotations: | |
description: DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has | |
not finished or progressed for at least 15 minutes. | |
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubedaemonsetrolloutstuck | |
summary: DaemonSet rollout is stuck. | |
expr: | | |
( | |
( | |
kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"} | |
!= | |
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} | |
) or ( | |
kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} | |
!= | |
0 | |
) or ( | |
kube_daemonset_updated_number_scheduled{job="kube-state-metrics"} | |
!= | |
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} | |
) or ( | |
kube_daemonset_status_number_available{job="kube-state-metrics"} | |
!= | |
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} | |
) | |
) and ( | |
changes(kube_daemonset_updated_number_scheduled{job="kube-state-metrics"}[5m]) | |
== | |
0 | |
) | |
for: 15m | |
labels: | |
severity: warning | |
namespace: monitoring | |
- alert: KubeContainerWaiting | |
annotations: | |
        description: Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container }}
has been in waiting state for longer than 1 hour. | |
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubecontainerwaiting | |
summary: Pod container waiting longer than 1 hour | |
expr: | | |
sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics"}) > 0 | |
for: 1h | |
labels: | |
severity: warning | |
namespace: monitoring | |
- alert: KubeDaemonSetNotScheduled | |
annotations: | |
description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset | |
}} are not scheduled.' | |
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubedaemonsetnotscheduled | |
summary: DaemonSet pods are not scheduled. | |
expr: | | |
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} | |
- | |
kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"} > 0 | |
for: 10m | |
labels: | |
severity: warning | |
namespace: monitoring | |
- alert: KubeDaemonSetMisScheduled | |
annotations: | |
description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset | |
}} are running where they are not supposed to run.' | |
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubedaemonsetmisscheduled | |
summary: DaemonSet pods are misscheduled. | |
expr: | | |
kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0 | |
for: 15m | |
labels: | |
severity: warning | |
namespace: monitoring | |
- alert: KubeJobCompletion | |
annotations: | |
description: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking | |
more than 12 hours to complete. | |
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubejobcompletion | |
summary: Job did not complete in time | |
expr: | | |
kube_job_spec_completions{job="kube-state-metrics"} - kube_job_status_succeeded{job="kube-state-metrics"} > 0 | |
for: 12h | |
labels: | |
severity: warning | |
namespace: monitoring | |
- alert: KubeJobFailed | |
annotations: | |
description: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to | |
complete. Removing failed job after investigation should clear this alert. | |
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubejobfailed | |
summary: Job failed to complete. | |
expr: | | |
kube_job_failed{job="kube-state-metrics"} > 0 | |
for: 15m | |
labels: | |
severity: warning | |
namespace: monitoring | |
- alert: KubeHpaReplicasMismatch | |
annotations: | |
description: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has not matched | |
the desired number of replicas for longer than 15 minutes. | |
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubehpareplicasmismatch | |
        summary: HPA has not matched the desired number of replicas.
expr: | | |
(kube_hpa_status_desired_replicas{job="kube-state-metrics"} | |
!= | |
kube_hpa_status_current_replicas{job="kube-state-metrics"}) | |
and | |
(kube_hpa_status_current_replicas{job="kube-state-metrics"} | |
> | |
kube_hpa_spec_min_replicas{job="kube-state-metrics"}) | |
and | |
(kube_hpa_status_current_replicas{job="kube-state-metrics"} | |
< | |
kube_hpa_spec_max_replicas{job="kube-state-metrics"}) | |
and | |
changes(kube_hpa_status_current_replicas{job="kube-state-metrics"}[15m]) == 0 | |
for: 15m | |
labels: | |
severity: warning | |
namespace: monitoring | |
- alert: KubeHpaMaxedOut | |
annotations: | |
description: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has been running | |
at max replicas for longer than 15 minutes. | |
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubehpamaxedout | |
summary: HPA is running at max replicas | |
expr: | | |
kube_hpa_status_current_replicas{job="kube-state-metrics"} | |
== | |
kube_hpa_spec_max_replicas{job="kube-state-metrics"} | |
for: 15m | |
labels: | |
severity: warning | |
namespace: monitoring | |
- name: kubernetes-resources | |
rules: | |
- alert: KubeCPUOvercommit | |
annotations: | |
description: Cluster has overcommitted CPU resource requests for Pods and | |
cannot tolerate node failure. | |
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubecpuovercommit | |
summary: Cluster has overcommitted CPU resource requests. | |
expr: | | |
sum(namespace_cpu:kube_pod_container_resource_requests:sum{}) | |
/ | |
sum(kube_node_status_allocatable{resource="cpu"}) | |
> | |
((count(kube_node_status_allocatable{resource="cpu"}) > 1) - 1) / count(kube_node_status_allocatable{resource="cpu"}) | |
for: 5m | |
labels: | |
severity: warning | |
namespace: monitoring | |
- alert: KubeMemoryOvercommit | |
annotations: | |
description: Cluster has overcommitted memory resource requests for Pods and | |
cannot tolerate node failure. | |
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubememoryovercommit | |
summary: Cluster has overcommitted memory resource requests. | |
expr: | | |
sum(namespace_memory:kube_pod_container_resource_requests:sum{}) | |
/ | |
sum(kube_node_status_allocatable{resource="memory"}) | |
> | |
((count(kube_node_status_allocatable{resource="memory"}) > 1) - 1) | |
/ | |
count(kube_node_status_allocatable{resource="memory"}) | |
for: 5m | |
labels: | |
severity: warning | |
namespace: monitoring | |
- alert: KubeCPUQuotaOvercommit | |
annotations: | |
description: Cluster has overcommitted CPU resource requests for Namespaces. | |
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubecpuquotaovercommit | |
summary: Cluster has overcommitted CPU resource requests. | |
expr: | | |
sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="cpu"}) | |
/ | |
sum(kube_node_status_allocatable{resource="cpu"}) | |
> 1.5 | |
for: 5m | |
labels: | |
severity: warning | |
namespace: monitoring | |
- alert: KubeMemoryQuotaOvercommit | |
annotations: | |
description: Cluster has overcommitted memory resource requests for Namespaces. | |
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubememoryquotaovercommit | |
summary: Cluster has overcommitted memory resource requests. | |
expr: | | |
sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="memory"}) | |
/ | |
sum(kube_node_status_allocatable{resource="memory",job="kube-state-metrics"}) | |
> 1.5 | |
for: 5m | |
labels: | |
severity: warning | |
namespace: monitoring | |
- alert: KubeQuotaAlmostFull | |
annotations: | |
description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage | |
}} of its {{ $labels.resource }} quota. | |
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubequotaalmostfull | |
summary: Namespace quota is going to be full. | |
expr: | | |
kube_resourcequota{job="kube-state-metrics", type="used"} | |
/ ignoring(instance, job, type) | |
(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) | |
> 0.9 < 1 | |
for: 15m | |
labels: | |
severity: info | |
namespace: monitoring | |
- alert: KubeQuotaFullyUsed | |
annotations: | |
description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage | |
}} of its {{ $labels.resource }} quota. | |
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubequotafullyused | |
summary: Namespace quota is fully used. | |
expr: | | |
kube_resourcequota{job="kube-state-metrics", type="used"} | |
/ ignoring(instance, job, type) | |
(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) | |
== 1 | |
for: 15m | |
labels: | |
severity: info | |
namespace: monitoring | |
- alert: KubeQuotaExceeded | |
annotations: | |
description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage | |
}} of its {{ $labels.resource }} quota. | |
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubequotaexceeded | |
summary: Namespace quota has exceeded the limits. | |
expr: | | |
kube_resourcequota{job="kube-state-metrics", type="used"} | |
/ ignoring(instance, job, type) | |
(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) | |
> 1 | |
for: 15m | |
labels: | |
severity: warning | |
namespace: monitoring | |
- alert: CPUThrottlingHigh | |
annotations: | |
description: '{{ $value | humanizePercentage }} throttling of CPU in namespace | |
{{ $labels.namespace }} for container {{ $labels.container }} in pod {{ | |
$labels.pod }}.' | |
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/cputhrottlinghigh | |
summary: Processes experience elevated CPU throttling. | |
expr: | | |
sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (container, pod, namespace) | |
/ | |
sum(increase(container_cpu_cfs_periods_total{}[5m])) by (container, pod, namespace) | |
> ( 25 / 100 ) | |
for: 15m | |
labels: | |
severity: info | |
namespace: monitoring | |
- name: kubernetes-storage | |
rules: | |
- alert: KubePersistentVolumeFillingUp | |
annotations: | |
description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim | |
}} in Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage | |
}} free. | |
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubepersistentvolumefillingup | |
summary: PersistentVolume is filling up. | |
expr: | | |
kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"} | |
/ | |
kubelet_volume_stats_capacity_bytes{job="kubelet", metrics_path="/metrics"} | |
< 0.03 | |
for: 1m | |
labels: | |
severity: critical | |
namespace: monitoring | |
- alert: KubePersistentVolumeFillingUp | |
annotations: | |
description: Based on recent sampling, the PersistentVolume claimed by {{ | |
$labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is | |
expected to fill up within four days. Currently {{ $value | humanizePercentage | |
}} is available. | |
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubepersistentvolumefillingup | |
summary: PersistentVolume is filling up. | |
expr: | | |
( | |
kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"} | |
/ | |
kubelet_volume_stats_capacity_bytes{job="kubelet", metrics_path="/metrics"} | |
) < 0.15 | |
and | |
predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0 | |
for: 1h | |
labels: | |
severity: warning | |
namespace: monitoring | |
- alert: KubePersistentVolumeErrors | |
annotations: | |
description: The persistent volume {{ $labels.persistentvolume }} has status | |
{{ $labels.phase }}. | |
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubepersistentvolumeerrors | |
summary: PersistentVolume is having issues with provisioning. | |
expr: | | |
kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0 | |
for: 5m | |
labels: | |
severity: critical | |
namespace: monitoring | |
- name: kubernetes-system | |
rules: | |
- alert: KubeVersionMismatch | |
annotations: | |
description: There are {{ $value }} different semantic versions of Kubernetes | |
components running. | |
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeversionmismatch | |
summary: Different semantic versions of Kubernetes components running. | |
expr: | | |
count(count by (git_version) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1 | |
for: 15m | |
labels: | |
severity: warning | |
namespace: monitoring | |
- alert: KubeClientErrors | |
annotations: | |
        description: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance
          }}' is experiencing {{ $value | humanizePercentage }} errors.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeclienterrors | |
summary: Kubernetes API server client is experiencing errors. | |
expr: | | |
(sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job) | |
/ | |
sum(rate(rest_client_requests_total[5m])) by (instance, job)) | |
> 0.01 | |
for: 15m | |
labels: | |
severity: warning | |
namespace: monitoring | |
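  # Multi-window, multi-burn-rate SLO alerts for the API server. Each alert fires only
  # when both its long and its short window exceed the same burn-rate threshold, using
  # the apiserver_request:burnrate* recording rules defined further below.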
- name: kube-apiserver-slos | |
rules: | |
- alert: KubeAPIErrorBudgetBurn | |
annotations: | |
description: The API server is burning too much error budget. | |
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeapierrorbudgetburn | |
summary: The API server is burning too much error budget. | |
expr: | | |
sum(apiserver_request:burnrate1h) > (14.40 * 0.01000) | |
and | |
sum(apiserver_request:burnrate5m) > (14.40 * 0.01000) | |
for: 2m | |
labels: | |
long: 1h | |
severity: critical | |
namespace: monitoring | |
short: 5m | |
- alert: KubeAPIErrorBudgetBurn | |
annotations: | |
description: The API server is burning too much error budget. | |
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeapierrorbudgetburn | |
summary: The API server is burning too much error budget. | |
expr: | | |
sum(apiserver_request:burnrate6h) > (6.00 * 0.01000) | |
and | |
sum(apiserver_request:burnrate30m) > (6.00 * 0.01000) | |
for: 15m | |
labels: | |
long: 6h | |
severity: critical | |
namespace: monitoring | |
short: 30m | |
- alert: KubeAPIErrorBudgetBurn | |
annotations: | |
description: The API server is burning too much error budget. | |
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeapierrorbudgetburn | |
summary: The API server is burning too much error budget. | |
expr: | | |
sum(apiserver_request:burnrate1d) > (3.00 * 0.01000) | |
and | |
sum(apiserver_request:burnrate2h) > (3.00 * 0.01000) | |
for: 1h | |
labels: | |
long: 1d | |
severity: warning | |
namespace: monitoring | |
short: 2h | |
- alert: KubeAPIErrorBudgetBurn | |
annotations: | |
description: The API server is burning too much error budget. | |
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeapierrorbudgetburn | |
summary: The API server is burning too much error budget. | |
expr: | | |
sum(apiserver_request:burnrate3d) > (1.00 * 0.01000) | |
and | |
sum(apiserver_request:burnrate6h) > (1.00 * 0.01000) | |
for: 3h | |
labels: | |
long: 3d | |
severity: warning | |
namespace: monitoring | |
short: 6h | |
- name: kubernetes-system-apiserver | |
rules: | |
- alert: KubeClientCertificateExpiration | |
annotations: | |
description: A client certificate used to authenticate to the apiserver is | |
expiring in less than 7.0 days. | |
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeclientcertificateexpiration | |
summary: Client certificate is about to expire. | |
expr: | | |
apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800 | |
labels: | |
severity: warning | |
namespace: monitoring | |
- alert: KubeClientCertificateExpiration | |
annotations: | |
description: A client certificate used to authenticate to the apiserver is | |
expiring in less than 24.0 hours. | |
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeclientcertificateexpiration | |
summary: Client certificate is about to expire. | |
expr: | | |
apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400 | |
labels: | |
severity: critical | |
namespace: monitoring | |
- alert: AggregatedAPIErrors | |
annotations: | |
description: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} | |
has reported errors. It has appeared unavailable {{ $value | humanize }} | |
times averaged over the past 10m. | |
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/aggregatedapierrors | |
summary: An aggregated API has reported errors. | |
expr: | | |
sum by(name, namespace)(increase(aggregator_unavailable_apiservice_total[10m])) > 4 | |
labels: | |
severity: warning | |
namespace: monitoring | |
- alert: AggregatedAPIDown | |
annotations: | |
description: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} | |
has been only {{ $value | humanize }}% available over the last 10m. | |
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/aggregatedapidown | |
summary: An aggregated API is down. | |
expr: | | |
(1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85 | |
for: 5m | |
labels: | |
severity: warning | |
namespace: monitoring | |
- alert: KubeAPIDown | |
annotations: | |
description: KubeAPI has disappeared from Prometheus target discovery. | |
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeapidown | |
summary: Target disappeared from Prometheus target discovery. | |
expr: | | |
absent(up{job="apiserver"} == 1) | |
for: 15m | |
labels: | |
severity: critical | |
namespace: monitoring | |
- alert: KubeAPITerminatedRequests | |
annotations: | |
description: The apiserver has terminated {{ $value | humanizePercentage }} | |
of its incoming requests. | |
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeapiterminatedrequests | |
summary: The apiserver has terminated {{ $value | humanizePercentage }} of | |
its incoming requests. | |
expr: | | |
sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) / ( sum(rate(apiserver_request_total{job="apiserver"}[10m])) + sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) ) > 0.20 | |
for: 5m | |
labels: | |
severity: warning | |
namespace: monitoring | |
- name: kubernetes-system-kubelet | |
rules: | |
- alert: KubeNodeNotReady | |
annotations: | |
description: '{{ $labels.node }} has been unready for more than 15 minutes.' | |
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubenodenotready | |
summary: Node is not ready. | |
expr: | | |
kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0 | |
for: 15m | |
labels: | |
severity: warning | |
namespace: monitoring | |
- alert: KubeNodeUnreachable | |
annotations: | |
description: '{{ $labels.node }} is unreachable and some workloads may be | |
rescheduled.' | |
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubenodeunreachable | |
summary: Node is unreachable. | |
expr: | | |
(kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{job="kube-state-metrics",key=~"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn"}) == 1 | |
for: 15m | |
labels: | |
severity: warning | |
namespace: monitoring | |
- alert: KubeletTooManyPods | |
annotations: | |
description: Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage | |
}} of its Pod capacity. | |
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubelettoomanypods | |
summary: Kubelet is running at capacity. | |
expr: | | |
count by(node) ( | |
(kube_pod_status_phase{job="kube-state-metrics",phase="Running"} == 1) * on(instance,pod,namespace,cluster) group_left(node) topk by(instance,pod,namespace,cluster) (1, kube_pod_info{job="kube-state-metrics"}) | |
) | |
/ | |
max by(node) ( | |
kube_node_status_capacity{job="kube-state-metrics",resource="pods"} != 1 | |
) > 0.95 | |
for: 15m | |
labels: | |
severity: warning | |
namespace: monitoring | |
- alert: KubeNodeReadinessFlapping | |
annotations: | |
description: The readiness status of node {{ $labels.node }} has changed {{ | |
$value }} times in the last 15 minutes. | |
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubenodereadinessflapping | |
summary: Node readiness status is flapping. | |
expr: | | |
sum(changes(kube_node_status_condition{status="true",condition="Ready"}[15m])) by (node) > 2 | |
for: 15m | |
labels: | |
severity: warning | |
namespace: monitoring | |
- alert: KubeletPlegDurationHigh | |
annotations: | |
description: The Kubelet Pod Lifecycle Event Generator has a 99th percentile | |
duration of {{ $value }} seconds on node {{ $labels.node }}. | |
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletplegdurationhigh | |
summary: Kubelet Pod Lifecycle Event Generator is taking too long to relist. | |
expr: | | |
node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10 | |
for: 5m | |
labels: | |
severity: warning | |
namespace: monitoring | |
- alert: KubeletPodStartUpLatencyHigh | |
annotations: | |
description: Kubelet Pod startup 99th percentile latency is {{ $value }} seconds | |
on node {{ $labels.node }}. | |
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletpodstartuplatencyhigh | |
summary: Kubelet Pod startup latency is too high. | |
expr: | | |
histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (instance, le)) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"} > 60 | |
for: 15m | |
labels: | |
severity: warning | |
namespace: monitoring | |
- alert: KubeletClientCertificateExpiration | |
annotations: | |
description: Client certificate for Kubelet on node {{ $labels.node }} expires | |
in {{ $value | humanizeDuration }}. | |
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletclientcertificateexpiration | |
summary: Kubelet client certificate is about to expire. | |
expr: | | |
kubelet_certificate_manager_client_ttl_seconds < 604800 | |
labels: | |
severity: warning | |
namespace: monitoring | |
- alert: KubeletClientCertificateExpiration | |
annotations: | |
description: Client certificate for Kubelet on node {{ $labels.node }} expires | |
in {{ $value | humanizeDuration }}. | |
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletclientcertificateexpiration | |
summary: Kubelet client certificate is about to expire. | |
expr: | | |
kubelet_certificate_manager_client_ttl_seconds < 86400 | |
labels: | |
severity: critical | |
namespace: monitoring | |
- alert: KubeletServerCertificateExpiration | |
annotations: | |
description: Server certificate for Kubelet on node {{ $labels.node }} expires | |
in {{ $value | humanizeDuration }}. | |
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletservercertificateexpiration | |
summary: Kubelet server certificate is about to expire. | |
expr: | | |
kubelet_certificate_manager_server_ttl_seconds < 604800 | |
labels: | |
severity: warning | |
namespace: monitoring | |
- alert: KubeletServerCertificateExpiration | |
annotations: | |
description: Server certificate for Kubelet on node {{ $labels.node }} expires | |
in {{ $value | humanizeDuration }}. | |
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletservercertificateexpiration | |
summary: Kubelet server certificate is about to expire. | |
expr: | | |
kubelet_certificate_manager_server_ttl_seconds < 86400 | |
labels: | |
severity: critical | |
namespace: monitoring | |
- alert: KubeletClientCertificateRenewalErrors | |
annotations: | |
description: Kubelet on node {{ $labels.node }} has failed to renew its client | |
certificate ({{ $value | humanize }} errors in the last 5 minutes). | |
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletclientcertificaterenewalerrors | |
summary: Kubelet has failed to renew its client certificate. | |
expr: | | |
increase(kubelet_certificate_manager_client_expiration_renew_errors[5m]) > 0 | |
for: 15m | |
labels: | |
severity: warning | |
namespace: monitoring | |
- alert: KubeletServerCertificateRenewalErrors | |
annotations: | |
description: Kubelet on node {{ $labels.node }} has failed to renew its server | |
certificate ({{ $value | humanize }} errors in the last 5 minutes). | |
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletservercertificaterenewalerrors | |
summary: Kubelet has failed to renew its server certificate. | |
expr: | | |
increase(kubelet_server_expiration_renew_errors[5m]) > 0 | |
for: 15m | |
labels: | |
severity: warning | |
namespace: monitoring | |
- alert: KubeletDown | |
annotations: | |
description: Kubelet has disappeared from Prometheus target discovery. | |
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletdown | |
summary: Target disappeared from Prometheus target discovery. | |
expr: | | |
absent(up{job="kubelet", metrics_path="/metrics"} == 1) | |
for: 15m | |
labels: | |
severity: critical | |
namespace: monitoring | |
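  # Recording rules that precompute the read (LIST|GET) and write (POST|PUT|PATCH|DELETE)
  # error-budget burn rates over the 5m/30m/1h/2h/6h/1d/3d windows consumed by the
  # KubeAPIErrorBudgetBurn alerts above. A request counts against the budget if it is
  # too slow for its scope or returns a 5xx code.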
- name: kube-apiserver.rules | |
rules: | |
- expr: | | |
( | |
( | |
# too slow | |
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1d])) | |
- | |
( | |
( | |
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[1d])) | |
or | |
vector(0) | |
) | |
+ | |
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[1d])) | |
+ | |
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[1d])) | |
) | |
) | |
+ | |
# errors | |
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1d])) | |
) | |
/ | |
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1d])) | |
labels: | |
verb: read | |
record: apiserver_request:burnrate1d | |
- expr: | | |
( | |
( | |
# too slow | |
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1h])) | |
- | |
( | |
( | |
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[1h])) | |
or | |
vector(0) | |
) | |
+ | |
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[1h])) | |
+ | |
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[1h])) | |
) | |
) | |
+ | |
# errors | |
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1h])) | |
) | |
/ | |
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1h])) | |
labels: | |
verb: read | |
record: apiserver_request:burnrate1h | |
- expr: | | |
( | |
( | |
# too slow | |
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[2h])) | |
- | |
( | |
( | |
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[2h])) | |
or | |
vector(0) | |
) | |
+ | |
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[2h])) | |
+ | |
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[2h])) | |
) | |
) | |
+ | |
# errors | |
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[2h])) | |
) | |
/ | |
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[2h])) | |
labels: | |
verb: read | |
record: apiserver_request:burnrate2h | |
- expr: | | |
( | |
( | |
# too slow | |
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30m])) | |
- | |
( | |
( | |
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30m])) | |
or | |
vector(0) | |
) | |
+ | |
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[30m])) | |
+ | |
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[30m])) | |
) | |
) | |
+ | |
# errors | |
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[30m])) | |
) | |
/ | |
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[30m])) | |
labels: | |
verb: read | |
record: apiserver_request:burnrate30m | |
- expr: | | |
( | |
( | |
# too slow | |
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[3d])) | |
- | |
( | |
( | |
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[3d])) | |
or | |
vector(0) | |
) | |
+ | |
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[3d])) | |
+ | |
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[3d])) | |
) | |
) | |
+ | |
# errors | |
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[3d])) | |
) | |
/ | |
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[3d])) | |
labels: | |
verb: read | |
record: apiserver_request:burnrate3d | |
- expr: | | |
( | |
( | |
# too slow | |
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[5m])) | |
- | |
( | |
( | |
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[5m])) | |
or | |
vector(0) | |
) | |
+ | |
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[5m])) | |
+ | |
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[5m])) | |
) | |
) | |
+ | |
# errors | |
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[5m])) | |
) | |
/ | |
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m])) | |
labels: | |
verb: read | |
record: apiserver_request:burnrate5m | |
- expr: | | |
( | |
( | |
# too slow | |
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[6h])) | |
- | |
( | |
( | |
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[6h])) | |
or | |
vector(0) | |
) | |
+ | |
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[6h])) | |
+ | |
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[6h])) | |
) | |
) | |
+ | |
# errors | |
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[6h])) | |
) | |
/ | |
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[6h])) | |
labels: | |
verb: read | |
record: apiserver_request:burnrate6h | |
- expr: | | |
( | |
( | |
# too slow | |
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d])) | |
- | |
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1d])) | |
) | |
+ | |
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1d])) | |
) | |
/ | |
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d])) | |
labels: | |
verb: write | |
record: apiserver_request:burnrate1d | |
- expr: | | |
( | |
( | |
# too slow | |
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h])) | |
- | |
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1h])) | |
) | |
+ | |
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1h])) | |
) | |
/ | |
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h])) | |
labels: | |
verb: write | |
record: apiserver_request:burnrate1h | |
- expr: | | |
( | |
( | |
# too slow | |
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h])) | |
- | |
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[2h])) | |
) | |
+ | |
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[2h])) | |
) | |
/ | |
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h])) | |
labels: | |
verb: write | |
record: apiserver_request:burnrate2h | |
- expr: | | |
( | |
( | |
# too slow | |
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m])) | |
- | |
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[30m])) | |
) | |
+ | |
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[30m])) | |
) | |
/ | |
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m])) | |
labels: | |
verb: write | |
record: apiserver_request:burnrate30m | |
- expr: | | |
( | |
( | |
# too slow | |
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d])) | |
- | |
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[3d])) | |
) | |
+ | |
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[3d])) | |
) | |
/ | |
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d])) | |
labels: | |
verb: write | |
record: apiserver_request:burnrate3d | |
- expr: | | |
( | |
( | |
# too slow | |
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) | |
- | |
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[5m])) | |
) | |
+ | |
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[5m])) | |
) | |
/ | |
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) | |
labels: | |
verb: write | |
record: apiserver_request:burnrate5m | |
- expr: | | |
( | |
( | |
# too slow | |
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h])) | |
- | |
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[6h])) | |
) | |
+ | |
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[6h])) | |
) | |
/ | |
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h])) | |
labels: | |
verb: write | |
record: apiserver_request:burnrate6h | |
- expr: | | |
sum by (code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m])) | |
labels: | |
verb: read | |
record: code_resource:apiserver_request_total:rate5m | |
- expr: | | |
sum by (code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) | |
labels: | |
verb: write | |
record: code_resource:apiserver_request_total:rate5m | |
- expr: | | |
histogram_quantile(0.99, sum by (le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET"}[5m]))) > 0 | |
labels: | |
quantile: "0.99" | |
verb: read | |
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile | |
- expr: | | |
histogram_quantile(0.99, sum by (le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))) > 0 | |
labels: | |
quantile: "0.99" | |
verb: write | |
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile | |
- expr: | | |
histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)) | |
labels: | |
quantile: "0.99" | |
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile | |
- expr: | | |
histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)) | |
labels: | |
quantile: "0.9" | |
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile | |
- expr: | | |
histogram_quantile(0.5, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)) | |
labels: | |
quantile: "0.5" | |
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile | |
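  # 30-day API server availability, evaluated every 3 minutes. Availability is derived
  # from the hourly code_verb:apiserver_request_total:increase1h recording rules
  # (aggregated into 30d increases) minus requests that were too slow or returned 5xx.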
- interval: 3m | |
name: kube-apiserver-availability.rules | |
rules: | |
- expr: | | |
1 - ( | |
( | |
# write too slow | |
sum(increase(apiserver_request_duration_seconds_count{verb=~"POST|PUT|PATCH|DELETE"}[30d])) | |
- | |
sum(increase(apiserver_request_duration_seconds_bucket{verb=~"POST|PUT|PATCH|DELETE",le="1"}[30d])) | |
) + | |
( | |
# read too slow | |
sum(increase(apiserver_request_duration_seconds_count{verb=~"LIST|GET"}[30d])) | |
- | |
( | |
( | |
sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30d])) | |
or | |
vector(0) | |
) | |
+ | |
sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="namespace",le="0.5"}[30d])) | |
+ | |
sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="cluster",le="5"}[30d])) | |
) | |
) + | |
# errors | |
sum(code:apiserver_request_total:increase30d{code=~"5.."} or vector(0)) | |
) | |
/ | |
sum(code:apiserver_request_total:increase30d) | |
labels: | |
verb: all | |
record: apiserver_request:availability30d | |
- expr: | | |
1 - ( | |
sum(increase(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30d])) | |
- | |
( | |
# too slow | |
( | |
sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30d])) | |
or | |
vector(0) | |
) | |
+ | |
sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[30d])) | |
+ | |
sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[30d])) | |
) | |
+ | |
# errors | |
sum(code:apiserver_request_total:increase30d{verb="read",code=~"5.."} or vector(0)) | |
) | |
/ | |
sum(code:apiserver_request_total:increase30d{verb="read"}) | |
labels: | |
verb: read | |
record: apiserver_request:availability30d | |
- expr: | | |
1 - ( | |
( | |
# too slow | |
sum(increase(apiserver_request_duration_seconds_count{verb=~"POST|PUT|PATCH|DELETE"}[30d])) | |
- | |
sum(increase(apiserver_request_duration_seconds_bucket{verb=~"POST|PUT|PATCH|DELETE",le="1"}[30d])) | |
) | |
+ | |
# errors | |
sum(code:apiserver_request_total:increase30d{verb="write",code=~"5.."} or vector(0)) | |
) | |
/ | |
sum(code:apiserver_request_total:increase30d{verb="write"}) | |
labels: | |
verb: write | |
record: apiserver_request:availability30d | |
- expr: | | |
avg_over_time(code_verb:apiserver_request_total:increase1h[30d]) * 24 * 30 | |
record: code_verb:apiserver_request_total:increase30d | |
- expr: | | |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"2.."}[1h])) | |
record: code_verb:apiserver_request_total:increase1h | |
- expr: | | |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"2.."}[1h])) | |
record: code_verb:apiserver_request_total:increase1h | |
- expr: | | |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"2.."}[1h])) | |
record: code_verb:apiserver_request_total:increase1h | |
- expr: | | |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"2.."}[1h])) | |
record: code_verb:apiserver_request_total:increase1h | |
- expr: | | |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"2.."}[1h])) | |
record: code_verb:apiserver_request_total:increase1h | |
- expr: | | |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"2.."}[1h])) | |
record: code_verb:apiserver_request_total:increase1h | |
- expr: | | |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"3.."}[1h])) | |
record: code_verb:apiserver_request_total:increase1h | |
- expr: | | |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"3.."}[1h])) | |
record: code_verb:apiserver_request_total:increase1h | |
- expr: | | |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"3.."}[1h])) | |
record: code_verb:apiserver_request_total:increase1h | |
- expr: | | |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"3.."}[1h])) | |
record: code_verb:apiserver_request_total:increase1h | |
- expr: | | |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"3.."}[1h])) | |
record: code_verb:apiserver_request_total:increase1h | |
- expr: | | |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"3.."}[1h])) | |
record: code_verb:apiserver_request_total:increase1h | |
- expr: | | |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"4.."}[1h])) | |
record: code_verb:apiserver_request_total:increase1h | |
- expr: | | |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"4.."}[1h])) | |
record: code_verb:apiserver_request_total:increase1h | |
- expr: | | |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"4.."}[1h])) | |
record: code_verb:apiserver_request_total:increase1h | |
- expr: | | |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"4.."}[1h])) | |
record: code_verb:apiserver_request_total:increase1h | |
- expr: | | |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"4.."}[1h])) | |
record: code_verb:apiserver_request_total:increase1h | |
- expr: | | |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"4.."}[1h])) | |
record: code_verb:apiserver_request_total:increase1h | |
- expr: | | |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"5.."}[1h])) | |
record: code_verb:apiserver_request_total:increase1h | |
- expr: | | |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"5.."}[1h])) | |
record: code_verb:apiserver_request_total:increase1h | |
- expr: | | |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"5.."}[1h])) | |
record: code_verb:apiserver_request_total:increase1h | |
- expr: | | |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"5.."}[1h])) | |
record: code_verb:apiserver_request_total:increase1h | |
- expr: | | |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"5.."}[1h])) | |
record: code_verb:apiserver_request_total:increase1h | |
- expr: | | |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"5.."}[1h])) | |
record: code_verb:apiserver_request_total:increase1h | |
- expr: | | |
sum by (code) (code_verb:apiserver_request_total:increase30d{verb=~"LIST|GET"}) | |
labels: | |
verb: read | |
record: code:apiserver_request_total:increase30d | |
- expr: | | |
sum by (code) (code_verb:apiserver_request_total:increase30d{verb=~"POST|PUT|PATCH|DELETE"}) | |
labels: | |
verb: write | |
record: code:apiserver_request_total:increase30d | |
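  # Per-container CPU and memory usage series keyed by node, the per-namespace
  # CPU/memory request sums (namespace_cpu/memory:kube_pod_container_resource_requests:sum)
  # used by the KubeCPUOvercommit and KubeMemoryOvercommit alerts, and the
  # namespace_workload_pod:kube_pod_owner:relabel mapping of pods to their owning workload.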
- name: k8s.rules | |
rules: | |
- expr: | | |
sum by (cluster, namespace, pod, container) ( | |
rate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}[5m]) | |
) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) ( | |
1, max by(cluster, namespace, pod, node) (kube_pod_info{node!=""}) | |
) | |
record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate | |
- expr: | | |
container_memory_working_set_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} | |
* on (namespace, pod) group_left(node) topk by(namespace, pod) (1, | |
max by(namespace, pod, node) (kube_pod_info{node!=""}) | |
) | |
record: node_namespace_pod_container:container_memory_working_set_bytes | |
- expr: | | |
container_memory_rss{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} | |
* on (namespace, pod) group_left(node) topk by(namespace, pod) (1, | |
max by(namespace, pod, node) (kube_pod_info{node!=""}) | |
) | |
record: node_namespace_pod_container:container_memory_rss | |
- expr: | | |
container_memory_cache{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} | |
* on (namespace, pod) group_left(node) topk by(namespace, pod) (1, | |
max by(namespace, pod, node) (kube_pod_info{node!=""}) | |
) | |
record: node_namespace_pod_container:container_memory_cache | |
- expr: | | |
container_memory_swap{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} | |
* on (namespace, pod) group_left(node) topk by(namespace, pod) (1, | |
max by(namespace, pod, node) (kube_pod_info{node!=""}) | |
) | |
record: node_namespace_pod_container:container_memory_swap | |
- expr: | | |
sum by (namespace, cluster) ( | |
sum by (namespace, pod, cluster) ( | |
max by (namespace, pod, container, cluster) ( | |
kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"} | |
) * on(namespace, pod, cluster) group_left() max by (namespace, pod) ( | |
kube_pod_status_phase{phase=~"Pending|Running"} == 1 | |
) | |
) | |
) | |
record: namespace_memory:kube_pod_container_resource_requests:sum | |
- expr: | | |
sum by (namespace, cluster) ( | |
sum by (namespace, pod, cluster) ( | |
max by (namespace, pod, container, cluster) ( | |
kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"} | |
) * on(namespace, pod, cluster) group_left() max by (namespace, pod) ( | |
kube_pod_status_phase{phase=~"Pending|Running"} == 1 | |
) | |
) | |
) | |
record: namespace_cpu:kube_pod_container_resource_requests:sum | |
- expr: | | |
max by (cluster, namespace, workload, pod) ( | |
label_replace( | |
label_replace( | |
kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"}, | |
"replicaset", "$1", "owner_name", "(.*)" | |
) * on(replicaset, namespace) group_left(owner_name) topk by(replicaset, namespace) ( | |
1, max by (replicaset, namespace, owner_name) ( | |
kube_replicaset_owner{job="kube-state-metrics"} | |
) | |
), | |
"workload", "$1", "owner_name", "(.*)" | |
) | |
) | |
labels: | |
workload_type: deployment | |
record: namespace_workload_pod:kube_pod_owner:relabel | |
- expr: | | |
max by (cluster, namespace, workload, pod) ( | |
label_replace( | |
kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"}, | |
"workload", "$1", "owner_name", "(.*)" | |
) | |
) | |
labels: | |
workload_type: daemonset | |
record: namespace_workload_pod:kube_pod_owner:relabel | |
- expr: | | |
max by (cluster, namespace, workload, pod) ( | |
label_replace( | |
kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"}, | |
"workload", "$1", "owner_name", "(.*)" | |
) | |
) | |
labels: | |
workload_type: statefulset | |
record: namespace_workload_pod:kube_pod_owner:relabel | |
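  # Scheduler latency quantiles (end-to-end scheduling, scheduling algorithm, binding).
  # These recording rules only produce data if a kube-scheduler target is scraped; the
  # KubeScheduler alert rules themselves have been removed from this file.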
- name: kube-scheduler.rules | |
rules: | |
- expr: | | |
histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) | |
labels: | |
quantile: "0.99" | |
record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile | |
- expr: | | |
histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) | |
labels: | |
quantile: "0.99" | |
record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile | |
- expr: | | |
histogram_quantile(0.99, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) | |
labels: | |
quantile: "0.99" | |
record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile | |
- expr: | | |
histogram_quantile(0.9, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) | |
labels: | |
quantile: "0.9" | |
record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile | |
- expr: | | |
histogram_quantile(0.9, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) | |
labels: | |
quantile: "0.9" | |
record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile | |
- expr: | | |
histogram_quantile(0.9, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) | |
labels: | |
quantile: "0.9" | |
record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile | |
- expr: | | |
histogram_quantile(0.5, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) | |
labels: | |
quantile: "0.5" | |
record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile | |
- expr: | | |
histogram_quantile(0.5, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) | |
labels: | |
quantile: "0.5" | |
record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile | |
- expr: | | |
histogram_quantile(0.5, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) | |
labels: | |
quantile: "0.5" | |
record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile | |
- name: node.rules | |
rules: | |
- expr: | | |
topk by(namespace, pod) (1, | |
max by (node, namespace, pod) ( | |
label_replace(kube_pod_info{job="kube-state-metrics",node!=""}, "pod", "$1", "pod", "(.*)") | |
)) | |
record: 'node_namespace_pod:kube_pod_info:' | |
- expr: | | |
count by (cluster, node) (sum by (node, cpu) ( | |
node_cpu_seconds_total{job="node-exporter"} | |
* on (namespace, pod) group_left(node) | |
topk by(namespace, pod) (1, node_namespace_pod:kube_pod_info:) | |
)) | |
record: node:node_num_cpu:sum | |
- expr: | | |
sum( | |
node_memory_MemAvailable_bytes{job="node-exporter"} or | |
( | |
node_memory_Buffers_bytes{job="node-exporter"} + | |
node_memory_Cached_bytes{job="node-exporter"} + | |
node_memory_MemFree_bytes{job="node-exporter"} + | |
node_memory_Slab_bytes{job="node-exporter"} | |
) | |
) by (cluster) | |
record: :node_memory_MemAvailable_bytes:sum | |
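  # Kubelet PLEG relist duration quantiles per node, consumed by the
  # KubeletPlegDurationHigh alert above.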
- name: kubelet.rules | |
rules: | |
- expr: | | |
histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}) | |
labels: | |
quantile: "0.99" | |
record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile | |
- expr: | | |
histogram_quantile(0.9, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}) | |
labels: | |
quantile: "0.9" | |
record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile | |
- expr: | | |
histogram_quantile(0.5, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}) | |
labels: | |
quantile: "0.5" | |
record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile |