Created November 15, 2021 15:23
Full diff between OCP 4.8 and 4.9 monitoring rules
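For reference, the diff below was produced with GNU diff in recursive, unified mode while ignoring whitespace-only changes (-r, -u, -b), comparing the compiled 4.8 catalog against the compiled 4.9 catalog. A minimal sketch to reproduce it, assuming both compiled directories are available locally under the paths shown in the diff header:

    diff -rub \
      compiled-4.8/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml \
      compiled/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml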
diff -rub compiled-4.8/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml compiled/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml | |
--- compiled-4.8/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml 2021-11-15 16:12:31.000000000 +0100 | |
+++ compiled/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml 2021-11-15 16:13:35.000000000 +0100 | |
@@ -156,8 +156,10 @@ | |
syn: 'true' | |
- alert: SYN_SamplesTBRInaccessibleOnBoot | |
annotations: | |
- message: 'Samples operator could not access ''registry.redhat.io'' during | |
- its initial installation and it bootstrapped as removed. | |
+ message: 'One of two situations has occurred. Either | |
+ | |
+ samples operator could not access ''registry.redhat.io'' during its | |
+ initial installation and it bootstrapped as removed. | |
If this is expected, and stems from installing in a restricted network | |
environment, please note that if you | |
@@ -175,7 +177,11 @@ | |
assist the mirroring process. | |
- ' | |
+ Or, the use of allowed registries or blocked registries with global | |
+ imagestream configuration will not allow | |
+ | |
+ samples operator to create imagestreams using the default image registry | |
+ ''registry.redhat.io''.' | |
syn_component: openshift4-monitoring | |
expr: openshift_samples_tbr_inaccessible_info == 1 | |
for: 2d | |
@@ -250,6 +256,7 @@ | |
annotations: | |
description: Configuration has failed to load for {{ $labels.namespace | |
}}/{{ $labels.pod}}. | |
+ runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/AlertmanagerFailedReload.md | |
summary: Reloading an Alertmanager configuration has failed. | |
syn_component: openshift4-monitoring | |
expr: '# Without max_over_time, failed scrapes could create false negatives, | |
@@ -301,18 +308,6 @@ | |
rules: [] | |
- name: syn-cluster-machine-approver.rules | |
rules: | |
- - alert: SYN_ClusterMachineApproverDown | |
- annotations: | |
- message: ClusterMachineApprover has disappeared from Prometheus target | |
- discovery. | |
- syn_component: openshift4-monitoring | |
- expr: 'absent(up{job="machine-approver"} == 1) | |
- | |
- ' | |
- for: 10m | |
- labels: | |
- severity: critical | |
- syn: 'true' | |
- alert: SYN_MachineApproverMaxPendingCSRsReached | |
annotations: | |
message: max pending CSRs threshold reached. | |
@@ -328,7 +323,7 @@ | |
rules: | |
- alert: SYN_ClusterProxyApplySlow | |
annotations: | |
- message: The cluster is taking too long, on average, to apply kubernetes | |
+ summary: The cluster is taking too long, on average, to apply kubernetes | |
service rules to iptables. | |
syn_component: openshift4-monitoring | |
expr: 'histogram_quantile(0.95, sum(rate(kubeproxy_sync_proxy_rules_duration_seconds_bucket[5m])) | |
@@ -340,7 +335,7 @@ | |
syn: 'true' | |
- alert: SYN_NodeProxyApplySlow | |
annotations: | |
- message: SDN pod {{"{{"}} $labels.pod {{"}}"}} on node {{"{{"}} $labels.node | |
+ summary: SDN pod {{"{{"}} $labels.pod {{"}}"}} on node {{"{{"}} $labels.node | |
{{"}}"}} is taking too long, on average, to apply kubernetes service | |
rules to iptables. | |
syn_component: openshift4-monitoring | |
@@ -352,7 +347,7 @@ | |
syn: 'true' | |
- alert: SYN_NodeProxyApplyStale | |
annotations: | |
- message: SDN pod {{"{{"}} $labels.pod {{"}}"}} on node {{"{{"}} $labels.node | |
+ summary: SDN pod {{"{{"}} $labels.pod {{"}}"}} on node {{"{{"}} $labels.node | |
{{"}}"}} has stale kubernetes service rules in iptables. | |
syn_component: openshift4-monitoring | |
expr: '(kubeproxy_sync_proxy_rules_last_queued_timestamp_seconds - kubeproxy_sync_proxy_rules_last_timestamp_seconds) | |
@@ -368,10 +363,8 @@ | |
syn: 'true' | |
- alert: SYN_NodeWithoutSDNPod | |
annotations: | |
- message: 'All nodes should be running an sdn pod, {{"{{"}} $labels.node | |
+ summary: All nodes should be running an sdn pod, {{"{{"}} $labels.node | |
{{"}}"}} is not. | |
- | |
- ' | |
syn_component: openshift4-monitoring | |
expr: '(kube_node_info unless on(node) topk by (node) (1, kube_pod_info{namespace="openshift-sdn", pod=~"sdn.*"})) | |
> 0 | |
@@ -383,7 +376,7 @@ | |
syn: 'true' | |
- alert: SYN_SDNPodNotReady | |
annotations: | |
- message: SDN pod {{"{{"}} $labels.pod {{"}}"}} on node {{"{{"}} $labels.node | |
+ summary: SDN pod {{"{{"}} $labels.pod {{"}}"}} on node {{"{{"}} $labels.node | |
{{"}}"}} is not ready. | |
syn_component: openshift4-monitoring | |
expr: 'kube_pod_status_ready{namespace=''openshift-sdn'', condition=''true''} | |
@@ -398,16 +391,18 @@ | |
rules: | |
- alert: SYN_ClusterNotUpgradeable | |
annotations: | |
- message: One or more cluster operators have been blocking minor version | |
- cluster upgrades for at least an hour for reason {{ with $cluster_operator_conditions | |
- := "cluster_operator_conditions" | query}}{{range $value := .}}{{if | |
- and (eq (label "name" $value) "version") (eq (label "condition" $value) | |
- "Upgradeable") (eq (label "endpoint" $value) "metrics") (eq (value $value) | |
- 0.0) (ne (len (label "reason" $value)) 0) }}{{label "reason" $value}}.{{end}}{{end}}{{end}} | |
- {{ with $console_url := "console_url" | query }}{{ if ne (len (label | |
- "url" (first $console_url ) ) ) 0}} For more information refer to {{ | |
- label "url" (first $console_url ) }}/settings/cluster/.{{ end }}{{ end | |
- }} | |
+ description: In most cases, you will still be able to apply patch releases. | |
+ Reason {{ with $cluster_operator_conditions := "cluster_operator_conditions" | |
+ | query}}{{range $value := .}}{{if and (eq (label "name" $value) "version") | |
+ (eq (label "condition" $value) "Upgradeable") (eq (label "endpoint" | |
+ $value) "metrics") (eq (value $value) 0.0) (ne (len (label "reason" | |
+ $value)) 0) }}{{label "reason" $value}}.{{end}}{{end}}{{end}} For more | |
+ information refer to 'oc adm upgrade'{{ with $console_url := "console_url" | |
+ | query }}{{ if ne (len (label "url" (first $console_url ) ) ) 0}} or | |
+ {{ label "url" (first $console_url ) }}/settings/cluster/{{ end }}{{ | |
+ end }}. | |
+ summary: One or more cluster operators have been blocking minor version | |
+ cluster upgrades for at least an hour. | |
syn_component: openshift4-monitoring | |
expr: 'max by (name, condition, endpoint) (cluster_operator_conditions{name="version", | |
condition="Upgradeable", endpoint="metrics"} == 0) | |
@@ -419,9 +414,14 @@ | |
syn: 'true' | |
- alert: SYN_ClusterOperatorDegraded | |
annotations: | |
- message: Cluster operator {{ $labels.name }} has been degraded for 30 | |
- minutes. Operator is degraded because {{ $labels.reason }} and cluster | |
- upgrades will be unstable. | |
+ description: The {{ $labels.name }} operator is degraded because {{ $labels.reason | |
+ }}, and the components it manages may have reduced quality of service. Cluster | |
+ upgrades may not complete. For more information refer to 'oc get -o | |
+ yaml clusteroperator {{ $labels.name }}'{{ with $console_url := "console_url" | |
+ | query }}{{ if ne (len (label "url" (first $console_url ) ) ) 0}} or | |
+ {{ label "url" (first $console_url ) }}/settings/cluster/{{ end }}{{ | |
+ end }}. | |
+ summary: Cluster operator has been degraded for 30 minutes. | |
syn_component: openshift4-monitoring | |
expr: "(\n cluster_operator_conditions{job=\"cluster-version-operator\"\ | |
, condition=\"Degraded\"}\n or on (name)\n group by (name) (cluster_operator_up{job=\"\ | |
@@ -432,9 +432,14 @@ | |
syn: 'true' | |
- alert: SYN_ClusterOperatorDown | |
annotations: | |
- message: Cluster operator {{ $labels.name }} has not been available for | |
- 10 minutes. Operator may be down or disabled, cluster will not be kept | |
- up to date and upgrades will not be possible. | |
+ description: The {{ $labels.name }} operator may be down or disabled, | |
+ and the components it manages may be unavailable or degraded. Cluster | |
+ upgrades may not complete. For more information refer to 'oc get -o | |
+ yaml clusteroperator {{ $labels.name }}'{{ with $console_url := "console_url" | |
+ | query }}{{ if ne (len (label "url" (first $console_url ) ) ) 0}} or | |
+ {{ label "url" (first $console_url ) }}/settings/cluster/{{ end }}{{ | |
+ end }}. | |
+ summary: Cluster operator has not been available for 10 minutes. | |
syn_component: openshift4-monitoring | |
expr: 'cluster_operator_up{job="cluster-version-operator"} == 0 | |
@@ -445,8 +450,12 @@ | |
syn: 'true' | |
- alert: SYN_ClusterOperatorFlapping | |
annotations: | |
- message: Cluster operator {{ $labels.name }} up status is changing often. | |
- This might cause upgrades to be unstable. | |
+ description: The {{ $labels.name }} operator behavior might cause upgrades | |
+ to be unstable. For more information refer to 'oc get -o yaml clusteroperator | |
+ {{ $labels.name }}'{{ with $console_url := "console_url" | query }}{{ | |
+ if ne (len (label "url" (first $console_url ) ) ) 0}} or {{ label "url" | |
+ (first $console_url ) }}/settings/cluster/{{ end }}{{ end }}. | |
+ summary: Cluster operator up status is changing often. | |
syn_component: openshift4-monitoring | |
expr: 'changes(cluster_operator_up{job="cluster-version-operator"}[2m]) | |
> 2 | |
@@ -460,8 +469,11 @@ | |
rules: | |
- alert: SYN_CannotRetrieveUpdates | |
annotations: | |
- message: Cluster version operator has not retrieved updates in {{ $value | |
- | humanizeDuration }}. Failure reason {{ with $cluster_operator_conditions | |
+ description: Failure to retrieve updates means that cluster administrators | |
+ will need to monitor for available updates on their own or risk falling | |
+ behind on security or other bugfixes. If the failure is expected, you | |
+ can clear spec.channel in the ClusterVersion object to tell the cluster-version | |
+ operator to not retrieve updates. Failure reason {{ with $cluster_operator_conditions | |
:= "cluster_operator_conditions" | query}}{{range $value := .}}{{if | |
and (eq (label "name" $value) "version") (eq (label "condition" $value) | |
"RetrievedUpdates") (eq (label "endpoint" $value) "metrics") (eq (value | |
@@ -469,6 +481,8 @@ | |
$console_url := "console_url" | query }}{{ if ne (len (label "url" (first | |
$console_url ) ) ) 0}} For more information refer to {{ label "url" | |
(first $console_url ) }}/settings/cluster/.{{ end }}{{ end }} | |
+ summary: Cluster version operator has not retrieved updates in {{ $value | |
+ | humanizeDuration }}. | |
syn_component: openshift4-monitoring | |
expr: '(time()-cluster_version_operator_update_retrieval_timestamp_seconds) | |
>= 3600 and ignoring(condition, name, reason) cluster_operator_conditions{name="version", | |
@@ -480,9 +494,15 @@ | |
syn: 'true' | |
- alert: SYN_ClusterVersionOperatorDown | |
annotations: | |
- message: Cluster version operator has disappeared from Prometheus target | |
- discovery. Operator may be down or disabled, cluster will not be kept | |
- up to date and upgrades will not be possible. | |
+ description: The operator may be down or disabled. The cluster will not | |
+ be kept up to date and upgrades will not be possible. Inspect the openshift-cluster-version | |
+ namespace for events or changes to the cluster-version-operator deployment | |
+ or pods to diagnose and repair. {{ with $console_url := "console_url" | |
+ | query }}{{ if ne (len (label "url" (first $console_url ) ) ) 0}} For | |
+ more information refer to {{ label "url" (first $console_url ) }}/k8s/cluster/projects/openshift-cluster-version.{{ | |
+ end }}{{ end }} | |
+ summary: Cluster version operator has disappeared from Prometheus target | |
+ discovery. | |
syn_component: openshift4-monitoring | |
expr: 'absent(up{job="cluster-version-operator"} == 1) | |
@@ -554,13 +574,14 @@ | |
syn: 'true' | |
- alert: SYN_UpdateAvailable | |
annotations: | |
- message: Your upstream update recommendation service recommends you update | |
- your cluster. For more information refer to 'oc adm upgrade'{{ with | |
- $console_url := "console_url" | query }}{{ if ne (len (label "url" (first | |
- $console_url ) ) ) 0}} or {{ label "url" (first $console_url ) }}/settings/cluster/{{ | |
+ description: For more information refer to 'oc adm upgrade'{{ with $console_url | |
+ := "console_url" | query }}{{ if ne (len (label "url" (first $console_url | |
+ ) ) ) 0}} or {{ label "url" (first $console_url ) }}/settings/cluster/{{ | |
end }}{{ end }}. | |
+ summary: Your upstream update recommendation service recommends you update | |
+ your cluster. | |
syn_component: openshift4-monitoring | |
- expr: 'cluster_version_available_updates > 0 | |
+ expr: 'sum by (channel,upstream) (cluster_version_available_updates) > 0 | |
' | |
labels: | |
@@ -582,13 +603,14 @@ | |
more CPU pressure is likely to cause a failover; increase available | |
CPU. | |
syn_component: openshift4-monitoring | |
- expr: '100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) | |
+ expr: '100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[1m])) | |
* 100) > 90 AND on (instance) label_replace( kube_node_role{role="master"}, | |
"instance", "$1", "node", "(.+)" ) | |
' | |
for: 5m | |
labels: | |
+ namespace: openshift-kube-apiserver | |
severity: critical | |
syn: 'true' | |
- alert: SYN_HighOverallControlPlaneCPU | |
@@ -605,11 +627,12 @@ | |
outage may cause a cascading failure; increase available CPU. | |
syn_component: openshift4-monitoring | |
expr: "sum(\n 100 - (avg by (instance) (rate(node_cpu_seconds_total{mode=\"\ | |
- idle\"}[5m])) * 100)\n AND on (instance) label_replace( kube_node_role{role=\"\ | |
+ idle\"}[1m])) * 100)\n AND on (instance) label_replace( kube_node_role{role=\"\ | |
master\"}, \"instance\", \"$1\", \"node\", \"(.+)\" )\n)\n/\ncount(kube_node_role{role=\"\ | |
master\"})\n> 60\n" | |
for: 10m | |
labels: | |
+ namespace: openshift-kube-apiserver | |
severity: warning | |
syn: 'true' | |
- name: syn-etcd | |
@@ -791,11 +814,12 @@ | |
syn_component: openshift4-monitoring | |
expr: vector(1) | |
labels: | |
+ namespace: openshift-monitoring | |
severity: none | |
syn: 'true' | |
- name: syn-k8s.rules | |
rules: [] | |
- - name: syn-kube-apiserver-slos | |
+ - name: syn-kube-apiserver-slos-basic | |
rules: | |
- alert: SYN_KubeAPIErrorBudgetBurn | |
annotations: | |
@@ -816,6 +840,7 @@ | |
for: 2m | |
labels: | |
long: 1h | |
+ namespace: openshift-kube-apiserver | |
severity: critical | |
short: 5m | |
syn: 'true' | |
@@ -838,53 +863,10 @@ | |
for: 15m | |
labels: | |
long: 6h | |
+ namespace: openshift-kube-apiserver | |
severity: critical | |
short: 30m | |
syn: 'true' | |
- - alert: SYN_KubeAPIErrorBudgetBurn | |
- annotations: | |
- description: The API server is burning too much error budget. This alert | |
- fires when too many requests are failing with high latency. Use the | |
- 'API Performance' monitoring dashboards to narrow down the request states | |
- and latency. The 'etcd' monitoring dashboards also provides metrics | |
- to help determine etcd stability and performance. | |
- summary: The API server is burning too much error budget. | |
- syn_component: openshift4-monitoring | |
- expr: 'sum(apiserver_request:burnrate1d) > (3.00 * 0.01000) | |
- | |
- and | |
- | |
- sum(apiserver_request:burnrate2h) > (3.00 * 0.01000) | |
- | |
- ' | |
- for: 1h | |
- labels: | |
- long: 1d | |
- severity: warning | |
- short: 2h | |
- syn: 'true' | |
- - alert: SYN_KubeAPIErrorBudgetBurn | |
- annotations: | |
- description: The API server is burning too much error budget. This alert | |
- fires when too many requests are failing with high latency. Use the | |
- 'API Performance' monitoring dashboards to narrow down the request states | |
- and latency. The 'etcd' monitoring dashboards also provides metrics | |
- to help determine etcd stability and performance. | |
- summary: The API server is burning too much error budget. | |
- syn_component: openshift4-monitoring | |
- expr: 'sum(apiserver_request:burnrate3d) > (1.00 * 0.01000) | |
- | |
- and | |
- | |
- sum(apiserver_request:burnrate6h) > (1.00 * 0.01000) | |
- | |
- ' | |
- for: 3h | |
- labels: | |
- long: 3d | |
- severity: warning | |
- short: 6h | |
- syn: 'true' | |
- name: syn-kube-apiserver.rules | |
rules: [] | |
- name: syn-kube-prometheus-general.rules | |
@@ -933,7 +915,7 @@ | |
$labels.container}} has been in waiting state for longer than 1 hour. | |
summary: Pod container waiting longer than 1 hour | |
syn_component: openshift4-monitoring | |
- expr: 'sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"}) | |
+ expr: 'sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{namespace=~"(openshift-.*|kube-.*|default)",job="kube-state-metrics"}) | |
> 0 | |
' | |
@@ -947,7 +929,7 @@ | |
$labels.daemonset }} are running where they are not supposed to run.' | |
summary: DaemonSet pods are misscheduled. | |
syn_component: openshift4-monitoring | |
- expr: 'kube_daemonset_status_number_misscheduled{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"} | |
+ expr: 'kube_daemonset_status_number_misscheduled{namespace=~"(openshift-.*|kube-.*|default)",job="kube-state-metrics"} | |
> 0 | |
' | |
@@ -961,10 +943,9 @@ | |
$labels.daemonset }} are not scheduled.' | |
summary: DaemonSet pods are not scheduled. | |
syn_component: openshift4-monitoring | |
- expr: "kube_daemonset_status_desired_number_scheduled{namespace=~\"(openshift-.*|kube-.*|default|logging)\"\ | |
+ expr: "kube_daemonset_status_desired_number_scheduled{namespace=~\"(openshift-.*|kube-.*|default)\"\ | |
,job=\"kube-state-metrics\"}\n -\nkube_daemonset_status_current_number_scheduled{namespace=~\"\ | |
- (openshift-.*|kube-.*|default|logging)\",job=\"kube-state-metrics\"} >\ | |
- \ 0\n" | |
+ (openshift-.*|kube-.*|default)\",job=\"kube-state-metrics\"} > 0\n" | |
for: 10m | |
labels: | |
severity: warning | |
@@ -976,20 +957,18 @@ | |
summary: DaemonSet rollout is stuck. | |
syn_component: openshift4-monitoring | |
expr: "(\n (\n kube_daemonset_status_current_number_scheduled{namespace=~\"\ | |
- (openshift-.*|kube-.*|default|logging)\",job=\"kube-state-metrics\"}\n\ | |
- \ !=\n kube_daemonset_status_desired_number_scheduled{namespace=~\"\ | |
- (openshift-.*|kube-.*|default|logging)\",job=\"kube-state-metrics\"}\n\ | |
- \ ) or (\n kube_daemonset_status_number_misscheduled{namespace=~\"\ | |
- (openshift-.*|kube-.*|default|logging)\",job=\"kube-state-metrics\"}\n\ | |
- \ !=\n 0\n ) or (\n kube_daemonset_updated_number_scheduled{namespace=~\"\ | |
- (openshift-.*|kube-.*|default|logging)\",job=\"kube-state-metrics\"}\n\ | |
- \ !=\n kube_daemonset_status_desired_number_scheduled{namespace=~\"\ | |
- (openshift-.*|kube-.*|default|logging)\",job=\"kube-state-metrics\"}\n\ | |
- \ ) or (\n kube_daemonset_status_number_available{namespace=~\"(openshift-.*|kube-.*|default|logging)\"\ | |
- ,job=\"kube-state-metrics\"}\n !=\n kube_daemonset_status_desired_number_scheduled{namespace=~\"\ | |
- (openshift-.*|kube-.*|default|logging)\",job=\"kube-state-metrics\"}\n\ | |
- \ )\n) and (\n changes(kube_daemonset_updated_number_scheduled{namespace=~\"\ | |
- (openshift-.*|kube-.*|default|logging)\",job=\"kube-state-metrics\"}[5m])\n\ | |
+ (openshift-.*|kube-.*|default)\",job=\"kube-state-metrics\"}\n !=\n\ | |
+ \ kube_daemonset_status_desired_number_scheduled{namespace=~\"(openshift-.*|kube-.*|default)\"\ | |
+ ,job=\"kube-state-metrics\"}\n ) or (\n kube_daemonset_status_number_misscheduled{namespace=~\"\ | |
+ (openshift-.*|kube-.*|default)\",job=\"kube-state-metrics\"}\n !=\n\ | |
+ \ 0\n ) or (\n kube_daemonset_updated_number_scheduled{namespace=~\"\ | |
+ (openshift-.*|kube-.*|default)\",job=\"kube-state-metrics\"}\n !=\n\ | |
+ \ kube_daemonset_status_desired_number_scheduled{namespace=~\"(openshift-.*|kube-.*|default)\"\ | |
+ ,job=\"kube-state-metrics\"}\n ) or (\n kube_daemonset_status_number_available{namespace=~\"\ | |
+ (openshift-.*|kube-.*|default)\",job=\"kube-state-metrics\"}\n !=\n\ | |
+ \ kube_daemonset_status_desired_number_scheduled{namespace=~\"(openshift-.*|kube-.*|default)\"\ | |
+ ,job=\"kube-state-metrics\"}\n )\n) and (\n changes(kube_daemonset_updated_number_scheduled{namespace=~\"\ | |
+ (openshift-.*|kube-.*|default)\",job=\"kube-state-metrics\"}[5m])\n \ | |
\ ==\n 0\n)\n" | |
for: 30m | |
labels: | |
@@ -1002,46 +981,44 @@ | |
has not been rolled back. | |
summary: Deployment generation mismatch due to possible roll-back | |
syn_component: openshift4-monitoring | |
- expr: "kube_deployment_status_observed_generation{namespace=~\"(openshift-.*|kube-.*|default|logging)\"\ | |
+ expr: "kube_deployment_status_observed_generation{namespace=~\"(openshift-.*|kube-.*|default)\"\ | |
,job=\"kube-state-metrics\"}\n !=\nkube_deployment_metadata_generation{namespace=~\"\ | |
- (openshift-.*|kube-.*|default|logging)\",job=\"kube-state-metrics\"}\n" | |
+ (openshift-.*|kube-.*|default)\",job=\"kube-state-metrics\"}\n" | |
for: 15m | |
labels: | |
severity: warning | |
syn: 'true' | |
- alert: SYN_KubeHpaMaxedOut | |
annotations: | |
- description: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has been running | |
- at max replicas for longer than 15 minutes. | |
+ description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} | |
+ has been running at max replicas for longer than 15 minutes. | |
summary: HPA is running at max replicas | |
syn_component: openshift4-monitoring | |
expr: "kube_horizontalpodautoscaler_status_current_replicas{namespace=~\"\ | |
- (openshift-.*|kube-.*|default|logging)\",job=\"kube-state-metrics\"}\n\ | |
- \ ==\nkube_horizontalpodautoscaler_spec_max_replicas{namespace=~\"(openshift-.*|kube-.*|default|logging)\"\ | |
- ,job=\"kube-state-metrics\"}\n" | |
+ (openshift-.*|kube-.*|default)\",job=\"kube-state-metrics\"}\n ==\nkube_horizontalpodautoscaler_spec_max_replicas{namespace=~\"\ | |
+ (openshift-.*|kube-.*|default)\",job=\"kube-state-metrics\"}\n" | |
for: 15m | |
labels: | |
severity: warning | |
syn: 'true' | |
- alert: SYN_KubeHpaReplicasMismatch | |
annotations: | |
- description: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has not matched | |
- the desired number of replicas for longer than 15 minutes. | |
+ description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} | |
+ has not matched the desired number of replicas for longer than 15 minutes. | |
summary: HPA has not matched descired number of replicas. | |
syn_component: openshift4-monitoring | |
expr: "(kube_horizontalpodautoscaler_status_desired_replicas{namespace=~\"\ | |
- (openshift-.*|kube-.*|default|logging)\",job=\"kube-state-metrics\"}\n\ | |
- \ !=\nkube_horizontalpodautoscaler_status_current_replicas{namespace=~\"\ | |
- (openshift-.*|kube-.*|default|logging)\",job=\"kube-state-metrics\"})\n\ | |
- \ and\n(kube_horizontalpodautoscaler_status_current_replicas{namespace=~\"\ | |
- (openshift-.*|kube-.*|default|logging)\",job=\"kube-state-metrics\"}\n\ | |
- \ >\nkube_horizontalpodautoscaler_spec_min_replicas{namespace=~\"(openshift-.*|kube-.*|default|logging)\"\ | |
- ,job=\"kube-state-metrics\"})\n and\n(kube_horizontalpodautoscaler_status_current_replicas{namespace=~\"\ | |
- (openshift-.*|kube-.*|default|logging)\",job=\"kube-state-metrics\"}\n\ | |
- \ <\nkube_horizontalpodautoscaler_spec_max_replicas{namespace=~\"(openshift-.*|kube-.*|default|logging)\"\ | |
- ,job=\"kube-state-metrics\"})\n and\nchanges(kube_horizontalpodautoscaler_status_current_replicas{namespace=~\"\ | |
- (openshift-.*|kube-.*|default|logging)\",job=\"kube-state-metrics\"}[15m])\ | |
- \ == 0\n" | |
+ (openshift-.*|kube-.*|default)\",job=\"kube-state-metrics\"}\n !=\nkube_horizontalpodautoscaler_status_current_replicas{namespace=~\"\ | |
+ (openshift-.*|kube-.*|default)\",job=\"kube-state-metrics\"})\n and\n\ | |
+ (kube_horizontalpodautoscaler_status_current_replicas{namespace=~\"(openshift-.*|kube-.*|default)\"\ | |
+ ,job=\"kube-state-metrics\"}\n >\nkube_horizontalpodautoscaler_spec_min_replicas{namespace=~\"\ | |
+ (openshift-.*|kube-.*|default)\",job=\"kube-state-metrics\"})\n and\n\ | |
+ (kube_horizontalpodautoscaler_status_current_replicas{namespace=~\"(openshift-.*|kube-.*|default)\"\ | |
+ ,job=\"kube-state-metrics\"}\n <\nkube_horizontalpodautoscaler_spec_max_replicas{namespace=~\"\ | |
+ (openshift-.*|kube-.*|default)\",job=\"kube-state-metrics\"})\n and\n\ | |
+ changes(kube_horizontalpodautoscaler_status_current_replicas{namespace=~\"\ | |
+ (openshift-.*|kube-.*|default)\",job=\"kube-state-metrics\"}[15m]) ==\ | |
+ \ 0\n" | |
for: 15m | |
labels: | |
severity: warning | |
@@ -1052,8 +1029,8 @@ | |
more than 12 hours to complete. | |
summary: Job did not complete in time | |
syn_component: openshift4-monitoring | |
- expr: 'kube_job_spec_completions{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"} | |
- - kube_job_status_succeeded{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"} > | |
+ expr: 'kube_job_spec_completions{namespace=~"(openshift-.*|kube-.*|default)",job="kube-state-metrics"} | |
+ - kube_job_status_succeeded{namespace=~"(openshift-.*|kube-.*|default)",job="kube-state-metrics"} > | |
0 | |
' | |
@@ -1066,9 +1043,10 @@ | |
description: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed | |
to complete. Removing failed job after investigation should clear this | |
alert. | |
+ runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/KubeJobFailed.md | |
summary: Job failed to complete. | |
syn_component: openshift4-monitoring | |
- expr: 'kube_job_failed{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"} > | |
+ expr: 'kube_job_failed{namespace=~"(openshift-.*|kube-.*|default)",job="kube-state-metrics"} > | |
0 | |
' | |
@@ -1078,12 +1056,13 @@ | |
syn: 'true' | |
- alert: SYN_KubePodCrashLooping | |
annotations: | |
- description: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container | |
- }}) is restarting {{ printf "%.2f" $value }} times / 10 minutes. | |
+ description: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container | |
+ }}) is in waiting state (reason: "CrashLoopBackOff").' | |
summary: Pod is crash looping. | |
syn_component: openshift4-monitoring | |
- expr: 'rate(kube_pod_container_status_restarts_total{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"}[10m]) | |
- * 60 * 5 > 0 | |
+ expr: 'max_over_time(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff", | |
+ namespace=~"(openshift-.*|kube-.*|default)",job="kube-state-metrics"}[5m]) | |
+ >= 1 | |
' | |
for: 15m | |
@@ -1094,10 +1073,11 @@ | |
annotations: | |
description: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in | |
a non-ready state for longer than 15 minutes. | |
+ runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/KubePodNotReady.md | |
summary: Pod has been in a non-ready state for more than 15 minutes. | |
syn_component: openshift4-monitoring | |
expr: "sum by (namespace, pod) (\n max by(namespace, pod) (\n kube_pod_status_phase{namespace=~\"\ | |
- (openshift-.*|kube-.*|default|logging)\",job=\"kube-state-metrics\", phase=~\"\ | |
+ (openshift-.*|kube-.*|default)\",job=\"kube-state-metrics\", phase=~\"\ | |
Pending|Unknown\"}\n ) * on(namespace, pod) group_left(owner_kind) topk\ | |
\ by(namespace, pod) (\n 1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!=\"\ | |
Job\"})\n )\n) > 0\n" | |
@@ -1112,9 +1092,9 @@ | |
has not been rolled back. | |
summary: StatefulSet generation mismatch due to possible roll-back | |
syn_component: openshift4-monitoring | |
- expr: "kube_statefulset_status_observed_generation{namespace=~\"(openshift-.*|kube-.*|default|logging)\"\ | |
+ expr: "kube_statefulset_status_observed_generation{namespace=~\"(openshift-.*|kube-.*|default)\"\ | |
,job=\"kube-state-metrics\"}\n !=\nkube_statefulset_metadata_generation{namespace=~\"\ | |
- (openshift-.*|kube-.*|default|logging)\",job=\"kube-state-metrics\"}\n" | |
+ (openshift-.*|kube-.*|default)\",job=\"kube-state-metrics\"}\n" | |
for: 15m | |
labels: | |
severity: warning | |
@@ -1126,12 +1106,11 @@ | |
minutes. | |
summary: Deployment has not matched the expected number of replicas. | |
syn_component: openshift4-monitoring | |
- expr: "(\n kube_statefulset_status_replicas_ready{namespace=~\"(openshift-.*|kube-.*|default|logging)\"\ | |
+ expr: "(\n kube_statefulset_status_replicas_ready{namespace=~\"(openshift-.*|kube-.*|default)\"\ | |
,job=\"kube-state-metrics\"}\n !=\n kube_statefulset_status_replicas{namespace=~\"\ | |
- (openshift-.*|kube-.*|default|logging)\",job=\"kube-state-metrics\"}\n\ | |
- ) and (\n changes(kube_statefulset_status_replicas_updated{namespace=~\"\ | |
- (openshift-.*|kube-.*|default|logging)\",job=\"kube-state-metrics\"}[10m])\n\ | |
- \ ==\n 0\n)\n" | |
+ (openshift-.*|kube-.*|default)\",job=\"kube-state-metrics\"}\n) and (\n\ | |
+ \ changes(kube_statefulset_status_replicas_updated{namespace=~\"(openshift-.*|kube-.*|default)\"\ | |
+ ,job=\"kube-state-metrics\"}[10m])\n ==\n 0\n)\n" | |
for: 15m | |
labels: | |
severity: warning | |
@@ -1143,14 +1122,13 @@ | |
summary: StatefulSet update has not been rolled out. | |
syn_component: openshift4-monitoring | |
expr: "(\n max without (revision) (\n kube_statefulset_status_current_revision{namespace=~\"\ | |
- (openshift-.*|kube-.*|default|logging)\",job=\"kube-state-metrics\"}\n\ | |
- \ unless\n kube_statefulset_status_update_revision{namespace=~\"\ | |
- (openshift-.*|kube-.*|default|logging)\",job=\"kube-state-metrics\"}\n\ | |
- \ )\n *\n (\n kube_statefulset_replicas{namespace=~\"(openshift-.*|kube-.*|default|logging)\"\ | |
- ,job=\"kube-state-metrics\"}\n !=\n kube_statefulset_status_replicas_updated{namespace=~\"\ | |
- (openshift-.*|kube-.*|default|logging)\",job=\"kube-state-metrics\"}\n\ | |
- \ )\n) and (\n changes(kube_statefulset_status_replicas_updated{namespace=~\"\ | |
- (openshift-.*|kube-.*|default|logging)\",job=\"kube-state-metrics\"}[5m])\n\ | |
+ (openshift-.*|kube-.*|default)\",job=\"kube-state-metrics\"}\n unless\n\ | |
+ \ kube_statefulset_status_update_revision{namespace=~\"(openshift-.*|kube-.*|default)\"\ | |
+ ,job=\"kube-state-metrics\"}\n )\n *\n (\n kube_statefulset_replicas{namespace=~\"\ | |
+ (openshift-.*|kube-.*|default)\",job=\"kube-state-metrics\"}\n !=\n\ | |
+ \ kube_statefulset_status_replicas_updated{namespace=~\"(openshift-.*|kube-.*|default)\"\ | |
+ ,job=\"kube-state-metrics\"}\n )\n) and (\n changes(kube_statefulset_status_replicas_updated{namespace=~\"\ | |
+ (openshift-.*|kube-.*|default)\",job=\"kube-state-metrics\"}[5m])\n \ | |
\ ==\n 0\n)\n" | |
for: 15m | |
labels: | |
@@ -1163,15 +1141,21 @@ | |
- alert: SYN_KubeCPUOvercommit | |
annotations: | |
description: Cluster has overcommitted CPU resource requests for Pods | |
- and cannot tolerate node failure. | |
+ by {{ $value }} CPU shares and cannot tolerate node failure. | |
summary: Cluster has overcommitted CPU resource requests. | |
syn_component: openshift4-monitoring | |
- expr: "sum(namespace_cpu:kube_pod_container_resource_requests:sum{})\n \ | |
- \ /\nsum(kube_node_status_allocatable{resource=\"cpu\"})\n >\n((count(kube_node_status_allocatable{resource=\"\ | |
- cpu\"}) > 1) - 1) / count(kube_node_status_allocatable{resource=\"cpu\"\ | |
- })\n" | |
- for: 5m | |
+ expr: 'sum(namespace_cpu:kube_pod_container_resource_requests:sum{}) - (sum(kube_node_status_allocatable{resource="cpu"}) | |
+ - max(kube_node_status_allocatable{resource="cpu"})) > 0 | |
+ | |
+ and | |
+ | |
+ (sum(kube_node_status_allocatable{resource="cpu"}) - max(kube_node_status_allocatable{resource="cpu"})) | |
+ > 0 | |
+ | |
+ ' | |
+ for: 10m | |
labels: | |
+ namespace: kube-system | |
severity: warning | |
syn: 'true' | |
- alert: SYN_KubeCPUQuotaOvercommit | |
@@ -1179,7 +1163,7 @@ | |
description: Cluster has overcommitted CPU resource requests for Namespaces. | |
summary: Cluster has overcommitted CPU resource requests. | |
syn_component: openshift4-monitoring | |
- expr: "sum(kube_resourcequota{namespace=~\"(openshift-.*|kube-.*|default|logging)\"\ | |
+ expr: "sum(kube_resourcequota{namespace=~\"(openshift-.*|kube-.*|default)\"\ | |
,job=\"kube-state-metrics\", type=\"hard\", resource=\"cpu\"})\n /\n\ | |
sum(kube_node_status_allocatable{resource=\"cpu\"})\n > 1.5\n" | |
for: 5m | |
@@ -1189,15 +1173,22 @@ | |
- alert: SYN_KubeMemoryOvercommit | |
annotations: | |
description: Cluster has overcommitted memory resource requests for Pods | |
- and cannot tolerate node failure. | |
+ by {{ $value }} bytes and cannot tolerate node failure. | |
summary: Cluster has overcommitted memory resource requests. | |
syn_component: openshift4-monitoring | |
- expr: "sum(namespace_memory:kube_pod_container_resource_requests:sum{})\n\ | |
- \ /\nsum(kube_node_status_allocatable{resource=\"memory\"})\n >\n((count(kube_node_status_allocatable{resource=\"\ | |
- memory\"}) > 1) - 1)\n /\ncount(kube_node_status_allocatable{resource=\"\ | |
- memory\"})\n" | |
- for: 5m | |
+ expr: 'sum(namespace_memory:kube_pod_container_resource_requests:sum{}) | |
+ - (sum(kube_node_status_allocatable{resource="memory"}) - max(kube_node_status_allocatable{resource="memory"})) | |
+ > 0 | |
+ | |
+ and | |
+ | |
+ (sum(kube_node_status_allocatable{resource="memory"}) - max(kube_node_status_allocatable{resource="memory"})) | |
+ > 0 | |
+ | |
+ ' | |
+ for: 10m | |
labels: | |
+ namespace: kube-system | |
severity: warning | |
syn: 'true' | |
- alert: SYN_KubeMemoryQuotaOvercommit | |
@@ -1205,7 +1196,7 @@ | |
description: Cluster has overcommitted memory resource requests for Namespaces. | |
summary: Cluster has overcommitted memory resource requests. | |
syn_component: openshift4-monitoring | |
- expr: "sum(kube_resourcequota{namespace=~\"(openshift-.*|kube-.*|default|logging)\"\ | |
+ expr: "sum(kube_resourcequota{namespace=~\"(openshift-.*|kube-.*|default)\"\ | |
,job=\"kube-state-metrics\", type=\"hard\", resource=\"memory\"})\n /\n\ | |
sum(kube_node_status_allocatable{resource=\"memory\",job=\"kube-state-metrics\"\ | |
})\n > 1.5\n" | |
@@ -1219,9 +1210,9 @@ | |
}} of its {{ $labels.resource }} quota. | |
summary: Namespace quota is going to be full. | |
syn_component: openshift4-monitoring | |
- expr: "kube_resourcequota{namespace=~\"(openshift-.*|kube-.*|default|logging)\"\ | |
+ expr: "kube_resourcequota{namespace=~\"(openshift-.*|kube-.*|default)\"\ | |
,job=\"kube-state-metrics\", type=\"used\"}\n / ignoring(instance, job,\ | |
- \ type)\n(kube_resourcequota{namespace=~\"(openshift-.*|kube-.*|default|logging)\"\ | |
+ \ type)\n(kube_resourcequota{namespace=~\"(openshift-.*|kube-.*|default)\"\ | |
,job=\"kube-state-metrics\", type=\"hard\"} > 0)\n > 0.9 < 1\n" | |
for: 15m | |
labels: | |
@@ -1233,9 +1224,9 @@ | |
}} of its {{ $labels.resource }} quota. | |
summary: Namespace quota has exceeded the limits. | |
syn_component: openshift4-monitoring | |
- expr: "kube_resourcequota{namespace=~\"(openshift-.*|kube-.*|default|logging)\"\ | |
+ expr: "kube_resourcequota{namespace=~\"(openshift-.*|kube-.*|default)\"\ | |
,job=\"kube-state-metrics\", type=\"used\"}\n / ignoring(instance, job,\ | |
- \ type)\n(kube_resourcequota{namespace=~\"(openshift-.*|kube-.*|default|logging)\"\ | |
+ \ type)\n(kube_resourcequota{namespace=~\"(openshift-.*|kube-.*|default)\"\ | |
,job=\"kube-state-metrics\", type=\"hard\"} > 0)\n > 1\n" | |
for: 15m | |
labels: | |
@@ -1247,9 +1238,9 @@ | |
}} of its {{ $labels.resource }} quota. | |
summary: Namespace quota is fully used. | |
syn_component: openshift4-monitoring | |
- expr: "kube_resourcequota{namespace=~\"(openshift-.*|kube-.*|default|logging)\"\ | |
+ expr: "kube_resourcequota{namespace=~\"(openshift-.*|kube-.*|default)\"\ | |
,job=\"kube-state-metrics\", type=\"used\"}\n / ignoring(instance, job,\ | |
- \ type)\n(kube_resourcequota{namespace=~\"(openshift-.*|kube-.*|default|logging)\"\ | |
+ \ type)\n(kube_resourcequota{namespace=~\"(openshift-.*|kube-.*|default)\"\ | |
,job=\"kube-state-metrics\", type=\"hard\"} > 0)\n == 1\n" | |
for: 15m | |
labels: | |
@@ -1263,7 +1254,7 @@ | |
status {{ $labels.phase }}. | |
summary: PersistentVolume is having issues with provisioning. | |
syn_component: openshift4-monitoring | |
- expr: 'kube_persistentvolume_status_phase{phase=~"Failed|Pending",namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"} | |
+ expr: 'kube_persistentvolume_status_phase{phase=~"Failed|Pending",namespace=~"(openshift-.*|kube-.*|default)",job="kube-state-metrics"} | |
> 0 | |
' | |
@@ -1276,14 +1267,14 @@ | |
description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim | |
}} in Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage | |
}} free. | |
+ runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/KubePersistentVolumeFillingUp.md | |
summary: PersistentVolume is filling up. | |
syn_component: openshift4-monitoring | |
- expr: "(\n kubelet_volume_stats_available_bytes{namespace=~\"(openshift-.*|kube-.*|default|logging)\"\ | |
+ expr: "(\n kubelet_volume_stats_available_bytes{namespace=~\"(openshift-.*|kube-.*|default)\"\ | |
,job=\"kubelet\", metrics_path=\"/metrics\"}\n /\n kubelet_volume_stats_capacity_bytes{namespace=~\"\ | |
- (openshift-.*|kube-.*|default|logging)\",job=\"kubelet\", metrics_path=\"\ | |
- /metrics\"}\n) < 0.03\nand\nkubelet_volume_stats_used_bytes{namespace=~\"\ | |
- (openshift-.*|kube-.*|default|logging)\",job=\"kubelet\", metrics_path=\"\ | |
- /metrics\"} > 0\n" | |
+ (openshift-.*|kube-.*|default)\",job=\"kubelet\", metrics_path=\"/metrics\"\ | |
+ }\n) < 0.03\nand\nkubelet_volume_stats_used_bytes{namespace=~\"(openshift-.*|kube-.*|default)\"\ | |
+ ,job=\"kubelet\", metrics_path=\"/metrics\"} > 0\n" | |
for: 1m | |
labels: | |
severity: critical | |
@@ -1294,16 +1285,16 @@ | |
{{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace | |
}} is expected to fill up within four days. Currently {{ $value | humanizePercentage | |
}} is available. | |
+ runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/KubePersistentVolumeFillingUp.md | |
summary: PersistentVolume is filling up. | |
syn_component: openshift4-monitoring | |
- expr: "(\n kubelet_volume_stats_available_bytes{namespace=~\"(openshift-.*|kube-.*|default|logging)\"\ | |
+ expr: "(\n kubelet_volume_stats_available_bytes{namespace=~\"(openshift-.*|kube-.*|default)\"\ | |
,job=\"kubelet\", metrics_path=\"/metrics\"}\n /\n kubelet_volume_stats_capacity_bytes{namespace=~\"\ | |
- (openshift-.*|kube-.*|default|logging)\",job=\"kubelet\", metrics_path=\"\ | |
- /metrics\"}\n) < 0.15\nand\nkubelet_volume_stats_used_bytes{namespace=~\"\ | |
- (openshift-.*|kube-.*|default|logging)\",job=\"kubelet\", metrics_path=\"\ | |
- /metrics\"} > 0\nand\npredict_linear(kubelet_volume_stats_available_bytes{namespace=~\"\ | |
- (openshift-.*|kube-.*|default|logging)\",job=\"kubelet\", metrics_path=\"\ | |
- /metrics\"}[6h], 4 * 24 * 3600) < 0\n" | |
+ (openshift-.*|kube-.*|default)\",job=\"kubelet\", metrics_path=\"/metrics\"\ | |
+ }\n) < 0.15\nand\nkubelet_volume_stats_used_bytes{namespace=~\"(openshift-.*|kube-.*|default)\"\ | |
+ ,job=\"kubelet\", metrics_path=\"/metrics\"} > 0\nand\npredict_linear(kubelet_volume_stats_available_bytes{namespace=~\"\ | |
+ (openshift-.*|kube-.*|default)\",job=\"kubelet\", metrics_path=\"/metrics\"\ | |
+ }[6h], 4 * 24 * 3600) < 0\n" | |
for: 1h | |
labels: | |
severity: warning | |
@@ -1317,8 +1308,8 @@ | |
summary: Kubernetes API server client is experiencing errors. | |
syn_component: openshift4-monitoring | |
expr: "(sum(rate(rest_client_requests_total{code=~\"5..\"}[5m])) by (instance,\ | |
- \ job)\n /\nsum(rate(rest_client_requests_total[5m])) by (instance, job))\n\ | |
- > 0.01\n" | |
+ \ job, namespace)\n /\nsum(rate(rest_client_requests_total[5m])) by (instance,\ | |
+ \ job, namespace))\n> 0.01\n" | |
for: 15m | |
labels: | |
severity: warning | |
@@ -1332,7 +1323,7 @@ | |
summary: An aggregated API is down. | |
syn_component: openshift4-monitoring | |
expr: '(1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[10m]))) | |
- * 100 < 70 | |
+ * 100 < 85 | |
' | |
for: 5m | |
@@ -1356,6 +1347,7 @@ | |
- alert: SYN_KubeAPIDown | |
annotations: | |
description: KubeAPI has disappeared from Prometheus target discovery. | |
+ runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/KubeAPIDown.md | |
summary: Target disappeared from Prometheus target discovery. | |
syn_component: openshift4-monitoring | |
expr: 'absent(up{job="apiserver"} == 1) | |
@@ -1386,6 +1378,7 @@ | |
- alert: SYN_KubeNodeNotReady | |
annotations: | |
description: '{{ $labels.node }} has been unready for more than 15 minutes.' | |
+ runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/KubeNodeNotReady.md | |
summary: Node is not ready. | |
syn_component: openshift4-monitoring | |
expr: 'kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} | |
@@ -1442,6 +1435,7 @@ | |
- alert: SYN_KubeletDown | |
annotations: | |
description: Kubelet has disappeared from Prometheus target discovery. | |
+ runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/KubeletDown.md | |
summary: Target disappeared from Prometheus target discovery. | |
syn_component: openshift4-monitoring | |
expr: 'absent(up{job="kubelet", metrics_path="/metrics"} == 1) | |
@@ -1449,6 +1443,7 @@ | |
' | |
for: 15m | |
labels: | |
+ namespace: kube-system | |
severity: critical | |
syn: 'true' | |
- alert: SYN_KubeletPlegDurationHigh | |
@@ -1650,6 +1645,21 @@ | |
labels: | |
severity: critical | |
syn: 'true' | |
+ - name: syn-machine-health-check-unterminated-short-circuit | |
+ rules: | |
+ - alert: SYN_MachineHealthCheckUnterminatedShortCircuit | |
+ annotation: | |
+ message: machine health check {{ $labels.name }} has been disabled by | |
+ short circuit for more than 30 minutes | |
+ annotations: | |
+ syn_component: openshift4-monitoring | |
+ expr: 'mapi_machinehealthcheck_short_circuit == 1 | |
+ | |
+ ' | |
+ for: 30m | |
+ labels: | |
+ severity: warning | |
+ syn: 'true' | |
- name: syn-machine-not-yet-deleted | |
rules: | |
- alert: SYN_MachineNotYetDeleted | |
@@ -1692,6 +1702,27 @@ | |
labels: | |
severity: warning | |
syn: 'true' | |
+ - name: syn-master-nodes-high-memory-usage | |
+ rules: | |
+ - alert: SYN_MasterNodesHighMemoryUsage | |
+ annotations: | |
+ message: Memory usage of {{ $value | humanize }} on {{ $labels.node }} | |
+ exceeds 90%. Master nodes starved of memory could result in degraded | |
+ performance of the control plane. | |
+ syn_component: openshift4-monitoring | |
+ expr: '((sum(node_memory_MemTotal_bytes AND on (instance) label_replace( | |
+ kube_node_role{role="master"}, "instance", "$1", "node", "(.+)" )) - sum(node_memory_MemFree_bytes | |
+ + node_memory_Buffers_bytes + node_memory_Cached_bytes AND on (instance) | |
+ label_replace( kube_node_role{role="master"}, "instance", "$1", "node", | |
+ "(.+)" ))) / sum(node_memory_MemTotal_bytes AND on (instance) label_replace( | |
+ kube_node_role{role="master"}, "instance", "$1", "node", "(.+)" )) * 100) | |
+ > 90 | |
+ | |
+ ' | |
+ for: 15m | |
+ labels: | |
+ severity: warning | |
+ syn: 'true' | |
- name: syn-mcd-drain-error | |
rules: | |
- alert: SYN_MCDDrainError | |
@@ -1774,10 +1805,37 @@ | |
labels: | |
severity: warning | |
syn: 'true' | |
+ - alert: SYN_NodeFileDescriptorLimit | |
+ annotations: | |
+ description: File descriptors limit at {{ $labels.instance }} is currently | |
+ at {{ printf "%.2f" $value }}%. | |
+ runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/NodeFileDescriptorLimit.md | |
+ summary: Kernel is predicted to exhaust file descriptors limit soon. | |
+ syn_component: openshift4-monitoring | |
+ expr: "(\n node_filefd_allocated{job=\"node-exporter\"} * 100 / node_filefd_maximum{job=\"\ | |
+ node-exporter\"} > 70\n)\n" | |
+ for: 15m | |
+ labels: | |
+ severity: warning | |
+ syn: 'true' | |
+ - alert: SYN_NodeFileDescriptorLimit | |
+ annotations: | |
+ description: File descriptors limit at {{ $labels.instance }} is currently | |
+ at {{ printf "%.2f" $value }}%. | |
+ runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/NodeFileDescriptorLimit.md | |
+ summary: Kernel is predicted to exhaust file descriptors limit soon. | |
+ syn_component: openshift4-monitoring | |
+ expr: "(\n node_filefd_allocated{job=\"node-exporter\"} * 100 / node_filefd_maximum{job=\"\ | |
+ node-exporter\"} > 90\n)\n" | |
+ for: 15m | |
+ labels: | |
+ severity: critical | |
+ syn: 'true' | |
- alert: SYN_NodeFilesystemAlmostOutOfFiles | |
annotations: | |
description: Filesystem on {{ $labels.device }} at {{ $labels.instance | |
}} has only {{ printf "%.2f" $value }}% available inodes left. | |
+ runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/NodeFilesystemAlmostOutOfFiles.md | |
summary: Filesystem has less than 5% inodes left. | |
syn_component: openshift4-monitoring | |
expr: "(\n node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\"\ | |
@@ -1792,6 +1850,7 @@ | |
annotations: | |
description: Filesystem on {{ $labels.device }} at {{ $labels.instance | |
}} has only {{ printf "%.2f" $value }}% available inodes left. | |
+ runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/NodeFilesystemAlmostOutOfFiles.md | |
summary: Filesystem has less than 3% inodes left. | |
syn_component: openshift4-monitoring | |
expr: "(\n node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\"\ | |
@@ -1806,13 +1865,14 @@ | |
annotations: | |
description: Filesystem on {{ $labels.device }} at {{ $labels.instance | |
}} has only {{ printf "%.2f" $value }}% available space left. | |
+ runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/NodeFilesystemAlmostOutOfSpace.md | |
summary: Filesystem has less than 5% space left. | |
syn_component: openshift4-monitoring | |
expr: "(\n node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\ | |
\"} / node_filesystem_size_bytes{job=\"node-exporter\",fstype!=\"\"} *\ | |
\ 100 < 5\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\ | |
\"} == 0\n)\n" | |
- for: 1h | |
+ for: 30m | |
labels: | |
severity: warning | |
syn: 'true' | |
@@ -1820,13 +1880,14 @@ | |
annotations: | |
description: Filesystem on {{ $labels.device }} at {{ $labels.instance | |
}} has only {{ printf "%.2f" $value }}% available space left. | |
+ runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/NodeFilesystemAlmostOutOfSpace.md | |
summary: Filesystem has less than 3% space left. | |
syn_component: openshift4-monitoring | |
expr: "(\n node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\ | |
\"} / node_filesystem_size_bytes{job=\"node-exporter\",fstype!=\"\"} *\ | |
\ 100 < 3\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\ | |
\"} == 0\n)\n" | |
- for: 1h | |
+ for: 30m | |
labels: | |
severity: critical | |
syn: 'true' | |
@@ -1835,6 +1896,7 @@ | |
description: Filesystem on {{ $labels.device }} at {{ $labels.instance | |
}} has only {{ printf "%.2f" $value }}% available inodes left and is | |
filling up. | |
+ runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/NodeFilesystemFilesFillingUp.md | |
summary: Filesystem is predicted to run out of inodes within the next | |
24 hours. | |
syn_component: openshift4-monitoring | |
@@ -1852,6 +1914,7 @@ | |
description: Filesystem on {{ $labels.device }} at {{ $labels.instance | |
}} has only {{ printf "%.2f" $value }}% available inodes left and is | |
filling up fast. | |
+ runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/NodeFilesystemFilesFillingUp.md | |
summary: Filesystem is predicted to run out of inodes within the next | |
4 hours. | |
syn_component: openshift4-monitoring | |
@@ -1869,6 +1932,7 @@ | |
description: Filesystem on {{ $labels.device }} at {{ $labels.instance | |
}} has only {{ printf "%.2f" $value }}% available space left and is | |
filling up. | |
+ runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/NodeFilesystemSpaceFillingUp.md | |
summary: Filesystem is predicted to run out of space within the next 24 | |
hours. | |
syn_component: openshift4-monitoring | |
@@ -1886,6 +1950,7 @@ | |
description: Filesystem on {{ $labels.device }} at {{ $labels.instance | |
}} has only {{ printf "%.2f" $value }}% available space left and is | |
filling up fast. | |
+ runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/NodeFilesystemSpaceFillingUp.md | |
summary: Filesystem is predicted to run out of space within the next 4 | |
hours. | |
syn_component: openshift4-monitoring | |
@@ -1945,6 +2010,7 @@ | |
description: RAID array '{{ $labels.device }}' on {{ $labels.instance | |
}} is in degraded state due to one or more disks failures. Number of | |
spare drives is insufficient to fix issue automatically. | |
+ runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/NodeRAIDDegraded.md | |
summary: RAID Array is degraded | |
syn_component: openshift4-monitoring | |
expr: 'node_md_disks_required - ignoring (state) (node_md_disks{state="active"}) | |
@@ -1985,8 +2051,10 @@ | |
rules: | |
- alert: SYN_NodeNetworkInterfaceFlapping | |
annotations: | |
- message: Network interface "{{ $labels.device }}" changing it's up status | |
- often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }} | |
+ description: Network interface "{{ $labels.device }}" changing its up | |
+ status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod | |
+ }} | |
+ summary: Network interface is often changing its status | |
syn_component: openshift4-monitoring | |
expr: 'changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) | |
> 2 | |
@@ -2091,12 +2159,12 @@ | |
syn_component: openshift4-monitoring | |
expr: "count without (node)\n(\n group by (node, workload, namespace)\n\ | |
\ (\n kube_pod_info{node!=\"\"}\n * on(namespace,pod) group_left(workload)\n\ | |
- \ (\n kube_pod_spec_volumes_persistentvolumeclaims_info\n \ | |
+ \ (\n max by(namespace, pod, workload) (kube_pod_spec_volumes_persistentvolumeclaims_info)\n\ | |
\ * on(namespace,pod) group_left(workload)\n (\n namespace_workload_pod:kube_pod_owner:relabel\n\ | |
\ * on(namespace,workload,workload_type) group_left()\n \ | |
\ (\n count without(pod) (namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"\ | |
- (openshift-.*|kube-.*|default|logging)\"}) > 1\n )\n )\n \ | |
- \ )\n )\n) == 1\n" | |
+ (openshift-.*|kube-.*|default)\"}) > 1\n )\n )\n )\n )\n\ | |
+ ) == 1\n" | |
for: 1h | |
labels: | |
severity: warning | |
@@ -2174,21 +2242,24 @@ | |
rules: | |
- alert: SYN_AlertmanagerReceiversNotConfigured | |
annotations: | |
- message: Alerts are not configured to be sent to a notification system, | |
+ description: Alerts are not configured to be sent to a notification system, | |
meaning that you may not be notified in a timely fashion when important | |
failures occur. Check the OpenShift documentation to learn how to configure | |
notifications with Alertmanager. | |
+ summary: Receivers (notification integrations) are not configured on Alertmanager | |
syn_component: openshift4-monitoring | |
- expr: cluster:alertmanager_routing_enabled:max == 0 | |
+ expr: cluster:alertmanager_integrations:max == 0 | |
for: 10m | |
labels: | |
+ namespace: openshift-monitoring | |
severity: warning | |
syn: 'true' | |
- alert: SYN_ClusterMonitoringOperatorReconciliationErrors | |
annotations: | |
- message: Cluster Monitoring Operator is experiencing unexpected reconciliation | |
- errors. Inspect the cluster-monitoring-operator log for potential root | |
- causes. | |
+ description: Errors are occurring during reconciliation cycles. Inspect | |
+ the cluster-monitoring-operator log for potential root causes. | |
+ summary: Cluster Monitoring Operator is experiencing unexpected reconciliation | |
+ errors. | |
syn_component: openshift4-monitoring | |
expr: max_over_time(cluster_monitoring_operator_last_reconciliation_successful[5m]) | |
== 0 | |
@@ -2207,28 +2278,32 @@ | |
this may indicate a new version of a cluster component cannot start | |
due to a bug or configuration error. Assess the pods for this deployment | |
to verify they are running on healthy nodes and then contact support. | |
+ runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/KubeDeploymentReplicasMismatch.md | |
summary: Deployment has not matched the expected number of replicas | |
syn_component: openshift4-monitoring | |
- expr: "(\n kube_deployment_spec_replicas{namespace=~\"(openshift-.*|kube-.*|default|logging)\"\ | |
- ,job=\"kube-state-metrics\"}\n !=\n kube_deployment_status_replicas_available{namespace=~\"\ | |
- (openshift-.*|kube-.*|default|logging)\",job=\"kube-state-metrics\"}\n\ | |
- ) and (\n changes(kube_deployment_status_replicas_updated{namespace=~\"\ | |
- (openshift-.*|kube-.*|default|logging)\",job=\"kube-state-metrics\"}[5m])\n\ | |
- \ ==\n 0\n) and cluster:control_plane:all_nodes_ready\n" | |
+ expr: "(((\n kube_deployment_spec_replicas{namespace=~\"(openshift-.*|kube-.*|default)\"\ | |
+ ,job=\"kube-state-metrics\"}\n >\n kube_deployment_status_replicas_available{namespace=~\"\ | |
+ (openshift-.*|kube-.*|default)\",job=\"kube-state-metrics\"}\n) and (\n\ | |
+ \ changes(kube_deployment_status_replicas_updated{namespace=~\"(openshift-.*|kube-.*|default)\"\ | |
+ ,job=\"kube-state-metrics\"}[5m])\n ==\n 0\n)) * on() group_left cluster:control_plane:all_nodes_ready)\ | |
+ \ > 0\n" | |
for: 15m | |
labels: | |
severity: warning | |
syn: 'true' | |
- alert: SYN_MultipleContainersOOMKilled | |
annotations: | |
- message: Multiple containers were out of memory killed within the past | |
- 15 minutes. | |
+ description: Multiple containers were out of memory killed within the | |
+ past 15 minutes. There are many potential causes of OOM errors, however | |
+ issues on a specific node or containers breaching their limits is common. | |
+ summary: Containers are being killed due to OOM | |
syn_component: openshift4-monitoring | |
expr: sum(max by(namespace, container, pod) (increase(kube_pod_container_status_restarts_total[12m])) | |
and max by(namespace, container, pod) (kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}) | |
== 1) > 5 | |
for: 15m | |
labels: | |
+ namespace: kube-system | |
severity: info | |
syn: 'true' | |
- name: syn-openshift-monitoring.rules | |
@@ -2253,6 +2328,7 @@ | |
' | |
for: 1h | |
labels: | |
+ namespace: openshift-kube-apiserver | |
severity: info | |
syn: 'true' | |
- alert: SYN_APIRemovedInNextReleaseInUse | |
@@ -2264,13 +2340,14 @@ | |
{{ $labels.resource }}.{{ $labels.version }}.{{ $labels.group }} -o | |
yaml` to identify the workload. | |
syn_component: openshift4-monitoring | |
- expr: 'group(apiserver_requested_deprecated_apis{removed_release="1.22"}) | |
+ expr: 'group(apiserver_requested_deprecated_apis{removed_release="1.23"}) | |
by (group,version,resource) and (sum by(group,version,resource) (rate(apiserver_request_total{system_client!="kube-controller-manager",system_client!="cluster-policy-controller"}[4h]))) | |
> 0 | |
' | |
for: 1h | |
labels: | |
+ namespace: openshift-kube-apiserver | |
severity: info | |
syn: 'true' | |
- name: syn-prometheus | |
@@ -2325,6 +2402,22 @@ | |
labels: | |
severity: warning | |
syn: 'true' | |
+ - alert: SYN_PrometheusLabelLimitHit | |
+ annotations: | |
+ description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has dropped | |
+ {{ printf "%.0f" $value }} targets because some samples exceeded the | |
+ configured label_limit, label_name_length_limit or label_value_length_limit. | |
+ summary: Prometheus has dropped targets because some scrape configs have | |
+ exceeded the labels limit. | |
+ syn_component: openshift4-monitoring | |
+ expr: 'increase(prometheus_target_scrape_pool_exceeded_label_limits_total{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) | |
+ > 0 | |
+ | |
+ ' | |
+ for: 15m | |
+ labels: | |
+ severity: warning | |
+ syn: 'true' | |
- alert: SYN_PrometheusMissingRuleEvaluations | |
annotations: | |
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has missed | |
@@ -2518,6 +2611,21 @@ | |
labels: | |
severity: warning | |
syn: 'true' | |
+ - alert: SYN_PrometheusTargetSyncFailure | |
+ annotations: | |
+ description: '{{ printf "%.0f" $value }} targets in Prometheus {{$labels.namespace}}/{{$labels.pod}} | |
+ have failed to sync because invalid configuration was supplied.' | |
+ runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/PrometheusTargetSyncFailure.md | |
+ summary: Prometheus has failed to sync targets. | |
+ syn_component: openshift4-monitoring | |
+ expr: 'increase(prometheus_target_sync_failed_total{job=~"prometheus-k8s|prometheus-user-workload"}[30m]) | |
+ > 0 | |
+ | |
+ ' | |
+ for: 5m | |
+ labels: | |
+ severity: critical | |
+ syn: 'true' | |
- name: syn-prometheus-operator | |
rules: | |
- alert: SYN_PrometheusOperatorListErrors | |
@@ -2623,6 +2731,20 @@ | |
labels: | |
severity: warning | |
syn: 'true' | |
+ - name: syn-scheduler-legacy-policy-deprecated | |
+ rules: | |
+ - alert: SYN_SchedulerLegacyPolicySet | |
+ annotations: | |
+ message: The scheduler is currently configured to use a legacy scheduler | |
+ policy API. Use of the policy API is deprecated and removed in 4.10. | |
+ syn_component: openshift4-monitoring | |
+ expr: 'cluster_legacy_scheduler_policy > 0 | |
+ | |
+ ' | |
+ for: 60m | |
+ labels: | |
+ severity: warning | |
+ syn: 'true' | |
- name: syn-system-memory-exceeds-reservation | |
rules: | |
- alert: SYN_SystemMemoryExceedsReservation | |
@@ -2637,11 +2759,7 @@ | |
change or at steady state). | |
syn_component: openshift4-monitoring | |
expr: 'sum by (node) (container_memory_rss{id="/system.slice"}) > ((sum | |
- by (node) (kube_node_status_capacity{resource="memory"}) - sum by (node) | |
- (kube_node_status_capacity{resource="hugepages_1Gi"}) - sum by (node) | |
- (kube_node_status_capacity{resource="hugepages_2Mi"}) - sum by (node) | |
- (kube_node_status_allocatable{resource="memory"}) - sum by (node) (kube_node_status_allocatable{resource="hugepages_1Gi"}) | |
- - sum by (node) (kube_node_status_allocatable{resource="hugepages_2Mi"})) | |
+ by (node) (kube_node_status_capacity{resource="memory"} - kube_node_status_allocatable{resource="memory"})) | |
* 0.95) | |
' | |
@@ -2653,12 +2771,12 @@ | |
rules: | |
- alert: SYN_ThanosQueryGrpcClientErrorRate | |
annotations: | |
- description: Thanos Query {{$labels.job}} is failing to send {{ $value | |
- | humanize }}% of requests. | |
+ description: Thanos Query {{$labels.job}} is failing to send {{$value | |
+ | humanize}}% of requests. | |
summary: Thanos Query is failing to send requests. | |
syn_component: openshift4-monitoring | |
- expr: "(\n sum by (job) (rate(grpc_client_handled_total{grpc_code!=\"OK\"\ | |
- , job=\"thanos-querier\"}[5m]))\n/\n sum by (job) (rate(grpc_client_started_total{job=\"\ | |
+ expr: "(\n sum by (job, namespace) (rate(grpc_client_handled_total{grpc_code!=\"\ | |
+ OK\", job=\"thanos-querier\"}[5m]))\n/\n sum by (job, namespace) (rate(grpc_client_started_total{job=\"\ | |
thanos-querier\"}[5m]))\n) * 100 > 5\n" | |
for: 1h | |
labels: | |
@@ -2666,12 +2784,13 @@ | |
syn: 'true' | |
- alert: SYN_ThanosQueryGrpcServerErrorRate | |
annotations: | |
- description: Thanos Query {{$labels.job}} is failing to handle {{ $value | |
- | humanize }}% of requests. | |
+ description: Thanos Query {{$labels.job}} is failing to handle {{$value | |
+ | humanize}}% of requests. | |
summary: Thanos Query is failing to handle requests. | |
syn_component: openshift4-monitoring | |
- expr: "(\n sum by (job) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded\"\ | |
- , job=\"thanos-querier\"}[5m]))\n/\n sum by (job) (rate(grpc_server_started_total{job=\"\ | |
+ expr: "(\n sum by (job, namespace) (rate(grpc_server_handled_total{grpc_code=~\"\ | |
+ Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded\"\ | |
+ , job=\"thanos-querier\"}[5m]))\n/\n sum by (job, namespace) (rate(grpc_server_started_total{job=\"\ | |
thanos-querier\"}[5m]))\n* 100 > 5\n)\n" | |
for: 1h | |
labels: | |
@@ -2679,12 +2798,12 @@ | |
syn: 'true' | |
- alert: SYN_ThanosQueryHighDNSFailures | |
annotations: | |
- description: Thanos Query {{$labels.job}} have {{ $value | humanize }}% | |
+ description: Thanos Query {{$labels.job}} have {{$value | humanize}}% | |
of failing DNS queries for store endpoints. | |
summary: Thanos Query is having high number of DNS failures. | |
syn_component: openshift4-monitoring | |
- expr: "(\n sum by (job) (rate(thanos_query_store_apis_dns_failures_total{job=\"\ | |
- thanos-querier\"}[5m]))\n/\n sum by (job) (rate(thanos_query_store_apis_dns_lookups_total{job=\"\ | |
+ expr: "(\n sum by (job, namespace) (rate(thanos_query_store_apis_dns_failures_total{job=\"\ | |
+ thanos-querier\"}[5m]))\n/\n sum by (job, namespace) (rate(thanos_query_store_apis_dns_lookups_total{job=\"\ | |
thanos-querier\"}[5m]))\n) * 100 > 1\n" | |
for: 1h | |
labels: | |
@@ -2692,26 +2811,28 @@ | |
syn: 'true' | |
- alert: SYN_ThanosQueryHttpRequestQueryErrorRateHigh | |
annotations: | |
- description: Thanos Query {{$labels.job}} is failing to handle {{ $value | |
- | humanize }}% of "query" requests. | |
+ description: Thanos Query {{$labels.job}} is failing to handle {{$value | |
+ | humanize}}% of "query" requests. | |
summary: Thanos Query is failing to handle requests. | |
syn_component: openshift4-monitoring | |
- expr: "(\n sum(rate(http_requests_total{code=~\"5..\", job=\"thanos-querier\"\ | |
- , handler=\"query\"}[5m]))\n/\n sum(rate(http_requests_total{job=\"thanos-querier\"\ | |
- , handler=\"query\"}[5m]))\n) * 100 > 5\n" | |
+ expr: "(\n sum by (job, namespace) (rate(http_requests_total{code=~\"5..\"\ | |
+ , job=\"thanos-querier\", handler=\"query\"}[5m]))\n/\n sum by (job,\ | |
+ \ namespace) (rate(http_requests_total{job=\"thanos-querier\", handler=\"\ | |
+ query\"}[5m]))\n) * 100 > 5\n" | |
for: 1h | |
labels: | |
severity: warning | |
syn: 'true' | |
- alert: SYN_ThanosQueryHttpRequestQueryRangeErrorRateHigh | |
annotations: | |
- description: Thanos Query {{$labels.job}} is failing to handle {{ $value | |
- | humanize }}% of "query_range" requests. | |
+ description: Thanos Query {{$labels.job}} is failing to handle {{$value | |
+ | humanize}}% of "query_range" requests. | |
summary: Thanos Query is failing to handle requests. | |
syn_component: openshift4-monitoring | |
- expr: "(\n sum(rate(http_requests_total{code=~\"5..\", job=\"thanos-querier\"\ | |
- , handler=\"query_range\"}[5m]))\n/\n sum(rate(http_requests_total{job=\"\ | |
- thanos-querier\", handler=\"query_range\"}[5m]))\n) * 100 > 5\n" | |
+ expr: "(\n sum by (job, namespace) (rate(http_requests_total{code=~\"5..\"\ | |
+ , job=\"thanos-querier\", handler=\"query_range\"}[5m]))\n/\n sum by\ | |
+ \ (job, namespace) (rate(http_requests_total{job=\"thanos-querier\", handler=\"\ | |
+ query_range\"}[5m]))\n) * 100 > 5\n" | |
for: 1h | |
labels: | |
severity: warning | |
@@ -2720,24 +2841,25 @@ | |
rules: | |
- alert: SYN_ThanosNoRuleEvaluations | |
annotations: | |
- description: Thanos Rule {{$labels.job}} did not perform any rule evaluations | |
- in the past 2 minutes. | |
+ description: Thanos Rule {{$labels.instance}} did not perform any rule | |
+ evaluations in the past 10 minutes. | |
summary: Thanos Rule did not perform any rule evaluations. | |
syn_component: openshift4-monitoring | |
- expr: "sum(rate(prometheus_rule_evaluations_total{job=\"thanos-ruler\"}[2m]))\ | |
- \ <= 0\n and\nsum(thanos_rule_loaded_rules{job=\"thanos-ruler\"}) > 0\n" | |
- for: 3m | |
+ expr: "sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=\"\ | |
+ thanos-ruler\"}[5m])) <= 0\n and\nsum by (job, instance) (thanos_rule_loaded_rules{job=\"\ | |
+ thanos-ruler\"}) > 0\n" | |
+ for: 5m | |
labels: | |
severity: warning | |
syn: 'true' | |
- alert: SYN_ThanosRuleAlertmanagerHighDNSFailures | |
annotations: | |
- description: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% | |
+ description: Thanos Rule {{$labels.instance}} has {{$value | humanize}}% | |
of failing DNS queries for Alertmanager endpoints. | |
summary: Thanos Rule is having high number of DNS failures. | |
syn_component: openshift4-monitoring | |
- expr: "(\n sum by (job) (rate(thanos_rule_alertmanagers_dns_failures_total{job=\"\ | |
- thanos-ruler\"}[5m]))\n/\n sum by (job) (rate(thanos_rule_alertmanagers_dns_lookups_total{job=\"\ | |
+ expr: "(\n sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_failures_total{job=\"\ | |
+ thanos-ruler\"}[5m]))\n/\n sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_lookups_total{job=\"\ | |
thanos-ruler\"}[5m]))\n* 100 > 1\n)\n" | |
for: 15m | |
labels: | |
@@ -2749,20 +2871,21 @@ | |
configuration. | |
summary: Thanos Rule has not been able to reload configuration. | |
syn_component: openshift4-monitoring | |
- expr: avg(thanos_rule_config_last_reload_successful{job="thanos-ruler"}) | |
- by (job) != 1 | |
+ expr: avg by (job, instance) (thanos_rule_config_last_reload_successful{job="thanos-ruler"}) | |
+ != 1 | |
for: 5m | |
labels: | |
severity: info | |
syn: 'true' | |
- alert: SYN_ThanosRuleGrpcErrorRate | |
annotations: | |
- description: Thanos Rule {{$labels.job}} is failing to handle {{ $value | |
- | humanize }}% of requests. | |
+ description: Thanos Rule {{$labels.job}} is failing to handle {{$value | |
+ | humanize}}% of requests. | |
summary: Thanos Rule is failing to handle grpc requests. | |
syn_component: openshift4-monitoring | |
- expr: "(\n sum by (job) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded\"\ | |
- , job=\"thanos-ruler\"}[5m]))\n/\n sum by (job) (rate(grpc_server_started_total{job=\"\ | |
+ expr: "(\n sum by (job, instance) (rate(grpc_server_handled_total{grpc_code=~\"\ | |
+ Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded\"\ | |
+ , job=\"thanos-ruler\"}[5m]))\n/\n sum by (job, instance) (rate(grpc_server_started_total{job=\"\ | |
thanos-ruler\"}[5m]))\n* 100 > 5\n)\n" | |
for: 5m | |
labels: | |
@@ -2770,12 +2893,11 @@ | |
syn: 'true' | |
- alert: SYN_ThanosRuleHighRuleEvaluationFailures | |
annotations: | |
- description: Thanos Rule {{$labels.job}} {{$labels.pod}} is failing to | |
- evaluate rules. | |
+ description: Thanos Rule {{$labels.instance}} is failing to evaluate rules. | |
summary: Thanos Rule is failing to evaluate rules. | |
syn_component: openshift4-monitoring | |
- expr: "(\n sum by (job) (rate(prometheus_rule_evaluation_failures_total{job=\"\ | |
- thanos-ruler\"}[5m]))\n/\n sum by (job) (rate(prometheus_rule_evaluations_total{job=\"\ | |
+ expr: "(\n sum by (job, instance) (rate(prometheus_rule_evaluation_failures_total{job=\"\ | |
+ thanos-ruler\"}[5m]))\n/\n sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=\"\ | |
thanos-ruler\"}[5m]))\n* 100 > 5\n)\n" | |
for: 5m | |
labels: | |
@@ -2783,11 +2905,11 @@ | |
syn: 'true' | |
- alert: SYN_ThanosRuleHighRuleEvaluationWarnings | |
annotations: | |
- description: Thanos Rule {{$labels.job}} {{$labels.pod}} has high number | |
- of evaluation warnings. | |
+ description: Thanos Rule {{$labels.instance}} has high number of evaluation | |
+ warnings. | |
summary: Thanos Rule has high number of evaluation warnings. | |
syn_component: openshift4-monitoring | |
- expr: 'sum by (job) (rate(thanos_rule_evaluation_with_warnings_total{job="thanos-ruler"}[5m])) | |
+ expr: 'sum by (job, instance) (rate(thanos_rule_evaluation_with_warnings_total{job="thanos-ruler"}[5m])) | |
> 0 | |
' | |
@@ -2797,16 +2919,15 @@ | |
syn: 'true' | |
- alert: SYN_ThanosRuleNoEvaluationFor10Intervals | |
annotations: | |
- description: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% | |
- rule groups that did not evaluate for at least 10x of their expected | |
- interval. | |
+ description: Thanos Rule {{$labels.job}} has {{$value | humanize}}% rule | |
+ groups that did not evaluate for at least 10x of their expected interval. | |
summary: Thanos Rule has rule groups that did not evaluate for 10 intervals. | |
syn_component: openshift4-monitoring | |
- expr: 'time() - max by (job, group) (prometheus_rule_group_last_evaluation_timestamp_seconds{job="thanos-ruler"}) | |
+ expr: 'time() - max by (job, instance, group) (prometheus_rule_group_last_evaluation_timestamp_seconds{job="thanos-ruler"}) | |
> | |
- 10 * max by (job, group) (prometheus_rule_group_interval_seconds{job="thanos-ruler"}) | |
+ 10 * max by (job, instance, group) (prometheus_rule_group_interval_seconds{job="thanos-ruler"}) | |
' | |
for: 5m | |
@@ -2815,12 +2936,12 @@ | |
syn: 'true' | |
- alert: SYN_ThanosRuleQueryHighDNSFailures | |
annotations: | |
- description: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% | |
- of failing DNS queries for query endpoints. | |
+ description: Thanos Rule {{$labels.job}} has {{$value | humanize}}% of | |
+ failing DNS queries for query endpoints. | |
summary: Thanos Rule is having high number of DNS failures. | |
syn_component: openshift4-monitoring | |
- expr: "(\n sum by (job) (rate(thanos_rule_query_apis_dns_failures_total{job=\"\ | |
- thanos-ruler\"}[5m]))\n/\n sum by (job) (rate(thanos_rule_query_apis_dns_lookups_total{job=\"\ | |
+ expr: "(\n sum by (job, instance) (rate(thanos_rule_query_apis_dns_failures_total{job=\"\ | |
+ thanos-ruler\"}[5m]))\n/\n sum by (job, instance) (rate(thanos_rule_query_apis_dns_lookups_total{job=\"\ | |
thanos-ruler\"}[5m]))\n* 100 > 1\n)\n" | |
for: 15m | |
labels: | |
@@ -2828,11 +2949,11 @@ | |
syn: 'true' | |
- alert: SYN_ThanosRuleQueueIsDroppingAlerts | |
annotations: | |
- description: Thanos Rule {{$labels.job}} {{$labels.pod}} is failing to | |
- queue alerts. | |
+ description: Thanos Rule {{$labels.instance}} is failing to queue alerts. | |
+ runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/ThanosRuleQueueIsDroppingAlerts.md | |
summary: Thanos Rule is failing to queue alerts. | |
syn_component: openshift4-monitoring | |
- expr: 'sum by (job) (rate(thanos_alert_queue_alerts_dropped_total{job="thanos-ruler"}[5m])) | |
+ expr: 'sum by (job, instance) (rate(thanos_alert_queue_alerts_dropped_total{job="thanos-ruler"}[5m])) | |
> 0 | |
' | |
@@ -2842,12 +2963,12 @@ | |
syn: 'true' | |
- alert: SYN_ThanosRuleRuleEvaluationLatencyHigh | |
annotations: | |
- description: Thanos Rule {{$labels.job}}/{{$labels.pod}} has higher evaluation | |
- latency than interval for {{$labels.rule_group}}. | |
+ description: Thanos Rule {{$labels.instance}} has higher evaluation latency | |
+ than interval for {{$labels.rule_group}}. | |
summary: Thanos Rule has high rule evaluation latency. | |
syn_component: openshift4-monitoring | |
- expr: "(\n sum by (job, pod, rule_group) (prometheus_rule_group_last_duration_seconds{job=\"\ | |
- thanos-ruler\"})\n>\n sum by (job, pod, rule_group) (prometheus_rule_group_interval_seconds{job=\"\ | |
+ expr: "(\n sum by (job, instance, rule_group) (prometheus_rule_group_last_duration_seconds{job=\"\ | |
+ thanos-ruler\"})\n>\n sum by (job, instance, rule_group) (prometheus_rule_group_interval_seconds{job=\"\ | |
thanos-ruler\"})\n)\n" | |
for: 5m | |
labels: | |
@@ -2855,11 +2976,11 @@ | |
syn: 'true' | |
- alert: SYN_ThanosRuleSenderIsFailingAlerts | |
annotations: | |
- description: Thanos Rule {{$labels.job}} {{$labels.pod}} is failing to | |
- send alerts to alertmanager. | |
+ description: Thanos Rule {{$labels.instance}} is failing to send alerts | |
+ to alertmanager. | |
summary: Thanos Rule is failing to send alerts to alertmanager. | |
syn_component: openshift4-monitoring | |
- expr: 'sum by (job) (rate(thanos_alert_sender_alerts_dropped_total{job="thanos-ruler"}[5m])) | |
+ expr: 'sum by (job, instance) (rate(thanos_alert_sender_alerts_dropped_total{job="thanos-ruler"}[5m])) | |
> 0 | |
' | |
@@ -2867,47 +2988,3 @@ | |
labels: | |
severity: warning | |
syn: 'true' | |
- - name: syn-thanos-sidecar | |
- rules: | |
- - alert: SYN_ThanosSidecarBucketOperationsFailed | |
- annotations: | |
- description: Thanos Sidecar {{$labels.job}} {{$labels.pod}} bucket operations | |
- are failing | |
- summary: Thanos Sidecar bucket operations are failing | |
- syn_component: openshift4-monitoring | |
- expr: 'rate(thanos_objstore_bucket_operation_failures_total{job=~"prometheus-(k8s|user-workload)-thanos-sidecar"}[5m]) | |
- > 0 | |
- | |
- ' | |
- for: 1h | |
- labels: | |
- severity: warning | |
- syn: 'true' | |
- - alert: SYN_ThanosSidecarPrometheusDown | |
- annotations: | |
- description: Thanos Sidecar {{$labels.job}} {{$labels.pod}} cannot connect | |
- to Prometheus. | |
- summary: Thanos Sidecar cannot connect to Prometheus | |
- syn_component: openshift4-monitoring | |
- expr: 'sum by (job, pod) (thanos_sidecar_prometheus_up{job=~"prometheus-(k8s|user-workload)-thanos-sidecar"} | |
- == 0) | |
- | |
- ' | |
- for: 1h | |
- labels: | |
- severity: warning | |
- syn: 'true' | |
- - alert: SYN_ThanosSidecarUnhealthy | |
- annotations: | |
- description: Thanos Sidecar {{$labels.job}} {{$labels.pod}} is unhealthy | |
- for {{ $value }} seconds. | |
- summary: Thanos Sidecar is unhealthy. | |
- syn_component: openshift4-monitoring | |
- expr: 'time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job=~"prometheus-(k8s|user-workload)-thanos-sidecar"}) | |
- by (job,pod) >= 240 | |
- | |
- ' | |
- for: 1h | |
- labels: | |
- severity: warning | |
- syn: 'true' |
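
Note: the alerts newly added in 4.9 above (for example SYN_PrometheusTargetSyncFailure) can be checked ad hoc against a running cluster by sending their expression to the Thanos querier's Prometheus-compatible HTTP API. The sketch below is a minimal, hypothetical Python example; the route hostname and the token environment variable are assumptions for illustration and are not part of the diff above.

import os
import requests

# Expression copied from the new SYN_PrometheusTargetSyncFailure alert above.
QUERY = ('increase(prometheus_target_sync_failed_total{'
         'job=~"prometheus-k8s|prometheus-user-workload"}[30m]) > 0')

# Hypothetical route and token source; adjust for the actual cluster.
THANOS_URL = "https://thanos-querier-openshift-monitoring.apps.example.com/api/v1/query"
TOKEN = os.environ["OCP_TOKEN"]

resp = requests.get(
    THANOS_URL,
    params={"query": QUERY},
    headers={"Authorization": f"Bearer {TOKEN}"},
    timeout=10,
)
resp.raise_for_status()
# An empty result vector means no scrape pools failed to sync in the last 30 minutes.
print(resp.json()["data"]["result"])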