Created November 15, 2021 15:23
Full diff between OCP 4.8 and 4.9 monitoring rules
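For reference, the diff below was produced with GNU diff in recursive, unified mode while ignoring whitespace-only changes (-r, -u, -b), comparing the compiled 4.8 catalog against the compiled 4.9 catalog. A minimal sketch to reproduce it, assuming both compiled directories are available locally under the paths shown in the diff header:

    diff -rub \
      compiled-4.8/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml \
      compiled/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml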
diff -rub compiled-4.8/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml compiled/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml | |
--- compiled-4.8/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml 2021-11-15 16:12:31.000000000 +0100 | |
+++ compiled/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml 2021-11-15 16:13:35.000000000 +0100 | |
@@ -156,8 +156,10 @@ | |
syn: 'true' | |
- alert: SYN_SamplesTBRInaccessibleOnBoot | |
annotations: | |
- message: 'Samples operator could not access ''registry.redhat.io'' during | |
- its initial installation and it bootstrapped as removed. | |
+ message: 'One of two situations has occurred. Either | |
+ | |
+ samples operator could not access ''registry.redhat.io'' during its | |
+ initial installation and it bootstrapped as removed. | |
If this is expected, and stems from installing in a restricted network | |
environment, please note that if you | |
@@ -175,7 +177,11 @@ | |
assist the mirroring process. | |
- ' | |
+ Or, the use of allowed registries or blocked registries with global | |
+ imagestream configuration will not allow | |
+ | |
+ samples operator to create imagestreams using the default image registry | |
+ ''registry.redhat.io''.' | |
syn_component: openshift4-monitoring | |
expr: openshift_samples_tbr_inaccessible_info == 1 | |
for: 2d | |
@@ -250,6 +256,7 @@ | |
annotations: | |
description: Configuration has failed to load for {{ $labels.namespace | |
}}/{{ $labels.pod}}. | |
+ runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/AlertmanagerFailedReload.md | |
summary: Reloading an Alertmanager configuration has failed. | |
syn_component: openshift4-monitoring | |
expr: '# Without max_over_time, failed scrapes could create false negatives, | |
@@ -301,18 +308,6 @@ | |
rules: [] | |
- name: syn-cluster-machine-approver.rules | |
rules: | |
- - alert: SYN_ClusterMachineApproverDown | |
- annotations: | |
- message: ClusterMachineApprover has disappeared from Prometheus target | |
- discovery. | |
- syn_component: openshift4-monitoring | |
- expr: 'absent(up{job="machine-approver"} == 1) | |
- | |
- ' | |
- for: 10m | |
- labels: | |
- severity: critical | |
- syn: 'true' | |
- alert: SYN_MachineApproverMaxPendingCSRsReached | |
annotations: | |
message: max pending CSRs threshold reached. | |
@@ -328,7 +323,7 @@ | |
rules: | |
- alert: SYN_ClusterProxyApplySlow | |
annotations: | |
- message: The cluster is taking too long, on average, to apply kubernetes | |
+ summary: The cluster is taking too long, on average, to apply kubernetes | |
service rules to iptables. | |
syn_component: openshift4-monitoring | |
expr: 'histogram_quantile(0.95, sum(rate(kubeproxy_sync_proxy_rules_duration_seconds_bucket[5m])) | |
@@ -340,7 +335,7 @@ | |
syn: 'true' | |
- alert: SYN_NodeProxyApplySlow | |
annotations: | |
- message: SDN pod {{"{{"}} $labels.pod {{"}}"}} on node {{"{{"}} $labels.node | |
+ summary: SDN pod {{"{{"}} $labels.pod {{"}}"}} on node {{"{{"}} $labels.node | |
{{"}}"}} is taking too long, on average, to apply kubernetes service | |
rules to iptables. | |
syn_component: openshift4-monitoring | |
@@ -352,7 +347,7 @@ | |
syn: 'true' | |
- alert: SYN_NodeProxyApplyStale | |
annotations: | |
- message: SDN pod {{"{{"}} $labels.pod {{"}}"}} on node {{"{{"}} $labels.node | |
+ summary: SDN pod {{"{{"}} $labels.pod {{"}}"}} on node {{"{{"}} $labels.node | |
{{"}}"}} has stale kubernetes service rules in iptables. | |
syn_component: openshift4-monitoring | |
expr: '(kubeproxy_sync_proxy_rules_last_queued_timestamp_seconds - kubeproxy_sync_proxy_rules_last_timestamp_seconds) | |
@@ -368,10 +363,8 @@ | |
syn: 'true' | |
- alert: SYN_NodeWithoutSDNPod | |
annotations: | |
- message: 'All nodes should be running an sdn pod, {{"{{"}} $labels.node | |
+ summary: All nodes should be running an sdn pod, {{"{{"}} $labels.node | |
{{"}}"}} is not. | |
- | |
- ' | |
syn_component: openshift4-monitoring | |
expr: '(kube_node_info unless on(node) topk by (node) (1, kube_pod_info{namespace="openshift-sdn", pod=~"sdn.*"})) | |
> 0 | |
@@ -383,7 +376,7 @@ | |
syn: 'true' | |
- alert: SYN_SDNPodNotReady | |
annotations: | |
- message: SDN pod {{"{{"}} $labels.pod {{"}}"}} on node {{"{{"}} $labels.node | |
+ summary: SDN pod {{"{{"}} $labels.pod {{"}}"}} on node {{"{{"}} $labels.node | |
{{"}}"}} is not ready. | |
syn_component: openshift4-monitoring | |
expr: 'kube_pod_status_ready{namespace=''openshift-sdn'', condition=''true''} | |
@@ -398,16 +391,18 @@ | |
rules: | |
- alert: SYN_ClusterNotUpgradeable | |
annotations: | |
- message: One or more cluster operators have been blocking minor version | |
- cluster upgrades for at least an hour for reason {{ with $cluster_operator_conditions | |
- := "cluster_operator_conditions" | query}}{{range $value := .}}{{if | |
- and (eq (label "name" $value) "version") (eq (label "condition" $value) | |
- "Upgradeable") (eq (label "endpoint" $value) "metrics") (eq (value $value) | |
- 0.0) (ne (len (label "reason" $value)) 0) }}{{label "reason" $value}}.{{end}}{{end}}{{end}} | |
- {{ with $console_url := "console_url" | query }}{{ if ne (len (label | |
- "url" (first $console_url ) ) ) 0}} For more information refer to {{ | |
- label "url" (first $console_url ) }}/settings/cluster/.{{ end }}{{ end | |
- }} | |
+ description: In most cases, you will still be able to apply patch releases. | |
+ Reason {{ with $cluster_operator_conditions := "cluster_operator_conditions" | |
+ | query}}{{range $value := .}}{{if and (eq (label "name" $value) "version") | |
+ (eq (label "condition" $value) "Upgradeable") (eq (label "endpoint" | |
+ $value) "metrics") (eq (value $value) 0.0) (ne (len (label "reason" | |
+ $value)) 0) }}{{label "reason" $value}}.{{end}}{{end}}{{end}} For more | |
+ information refer to 'oc adm upgrade'{{ with $console_url := "console_url" | |
+ | query }}{{ if ne (len (label "url" (first $console_url ) ) ) 0}} or | |
+ {{ label "url" (first $console_url ) }}/settings/cluster/{{ end }}{{ | |
+ end }}. | |
+ summary: One or more cluster operators have been blocking minor version | |
+ cluster upgrades for at least an hour. | |
syn_component: openshift4-monitoring | |
expr: 'max by (name, condition, endpoint) (cluster_operator_conditions{name="version", | |
condition="Upgradeable", endpoint="metrics"} == 0) | |
@@ -419,9 +414,14 @@ | |
syn: 'true' | |
- alert: SYN_ClusterOperatorDegraded | |
annotations: | |
- message: Cluster operator {{ $labels.name }} has been degraded for 30 | |
- minutes. Operator is degraded because {{ $labels.reason }} and cluster | |
- upgrades will be unstable. | |
+ description: The {{ $labels.name }} operator is degraded because {{ $labels.reason | |
+ }}, and the components it manages may have reduced quality of service. Cluster | |
+ upgrades may not complete. For more information refer to 'oc get -o | |
+ yaml clusteroperator {{ $labels.name }}'{{ with $console_url := "console_url" | |
+ | query }}{{ if ne (len (label "url" (first $console_url ) ) ) 0}} or | |
+ {{ label "url" (first $console_url ) }}/settings/cluster/{{ end }}{{ | |
+ end }}. | |
+ summary: Cluster operator has been degraded for 30 minutes. | |
syn_component: openshift4-monitoring | |
expr: "(\n cluster_operator_conditions{job=\"cluster-version-operator\"\ | |
, condition=\"Degraded\"}\n or on (name)\n group by (name) (cluster_operator_up{job=\"\ | |
@@ -432,9 +432,14 @@ | |
syn: 'true' | |
- alert: SYN_ClusterOperatorDown | |
annotations: | |
- message: Cluster operator {{ $labels.name }} has not been available for | |
- 10 minutes. Operator may be down or disabled, cluster will not be kept | |
- up to date and upgrades will not be possible. | |
+ description: The {{ $labels.name }} operator may be down or disabled, | |
+ and the components it manages may be unavailable or degraded. Cluster | |
+ upgrades may not complete. For more information refer to 'oc get -o | |
+ yaml clusteroperator {{ $labels.name }}'{{ with $console_url := "console_url" | |
+ | query }}{{ if ne (len (label "url" (first $console_url ) ) ) 0}} or | |
+ {{ label "url" (first $console_url ) }}/settings/cluster/{{ end }}{{ | |
+ end }}. | |
+ summary: Cluster operator has not been available for 10 minutes. | |
syn_component: openshift4-monitoring | |
expr: 'cluster_operator_up{job="cluster-version-operator"} == 0 | |
@@ -445,8 +450,12 @@ | |
syn: 'true' | |
- alert: SYN_ClusterOperatorFlapping | |
annotations: | |
- message: Cluster operator {{ $labels.name }} up status is changing often. | |
- This might cause upgrades to be unstable. | |
+ description: The {{ $labels.name }} operator behavior might cause upgrades | |
+ to be unstable. For more information refer to 'oc get -o yaml clusteroperator | |
+ {{ $labels.name }}'{{ with $console_url := "console_url" | query }}{{ | |
+ if ne (len (label "url" (first $console_url ) ) ) 0}} or {{ label "url" | |
+ (first $console_url ) }}/settings/cluster/{{ end }}{{ end }}. | |
+ summary: Cluster operator up status is changing often. | |
syn_component: openshift4-monitoring | |
expr: 'changes(cluster_operator_up{job="cluster-version-operator"}[2m]) | |
> 2 | |
@@ -460,8 +469,11 @@ | |
rules: | |
- alert: SYN_CannotRetrieveUpdates | |
annotations: | |
- message: Cluster version operator has not retrieved updates in {{ $value | |
- | humanizeDuration }}. Failure reason {{ with $cluster_operator_conditions | |
+ description: Failure to retrieve updates means that cluster administrators | |
+ will need to monitor for available updates on their own or risk falling | |
+ behind on security or other bugfixes. If the failure is expected, you | |
+ can clear spec.channel in the ClusterVersion object to tell the cluster-version | |
+ operator to not retrieve updates. Failure reason {{ with $cluster_operator_conditions | |
:= "cluster_operator_conditions" | query}}{{range $value := .}}{{if | |
and (eq (label "name" $value) "version") (eq (label "condition" $value) | |
"RetrievedUpdates") (eq (label "endpoint" $value) "metrics") (eq (value | |
@@ -469,6 +481,8 @@ | |
$console_url := "console_url" | query }}{{ if ne (len (label "url" (first | |
$console_url ) ) ) 0}} For more information refer to {{ label "url" | |
(first $console_url ) }}/settings/cluster/.{{ end }}{{ end }} | |
+ summary: Cluster version operator has not retrieved updates in {{ $value | |
+ | humanizeDuration }}. | |
syn_component: openshift4-monitoring | |
expr: '(time()-cluster_version_operator_update_retrieval_timestamp_seconds) | |
>= 3600 and ignoring(condition, name, reason) cluster_operator_conditions{name="version", | |
@@ -480,9 +494,15 @@ | |
syn: 'true' | |
- alert: SYN_ClusterVersionOperatorDown | |
annotations: | |
- message: Cluster version operator has disappeared from Prometheus target | |
- discovery. Operator may be down or disabled, cluster will not be kept | |
- up to date and upgrades will not be possible. | |
+ description: The operator may be down or disabled. The cluster will not | |
+ be kept up to date and upgrades will not be possible. Inspect the openshift-cluster-version | |
+ namespace for events or changes to the cluster-version-operator deployment | |
+ or pods to diagnose and repair. {{ with $console_url := "console_url" | |
+ | query }}{{ if ne (len (label "url" (first $console_url ) ) ) 0}} For | |
+ more information refer to {{ label "url" (first $console_url ) }}/k8s/cluster/projects/openshift-cluster-version.{{ | |
+ end }}{{ end }} | |
+ summary: Cluster version operator has disappeared from Prometheus target | |
+ discovery. | |
syn_component: openshift4-monitoring | |
expr: 'absent(up{job="cluster-version-operator"} == 1) | |
@@ -554,13 +574,14 @@ | |
syn: 'true' | |
- alert: SYN_UpdateAvailable | |
annotations: | |
- message: Your upstream update recommendation service recommends you update | |
- your cluster. For more information refer to 'oc adm upgrade'{{ with | |
- $console_url := "console_url" | query }}{{ if ne (len (label "url" (first | |
- $console_url ) ) ) 0}} or {{ label "url" (first $console_url ) }}/settings/cluster/{{ | |
+ description: For more information refer to 'oc adm upgrade'{{ with $console_url | |
+ := "console_url" | query }}{{ if ne (len (label "url" (first $console_url | |
+ ) ) ) 0}} or {{ label "url" (first $console_url ) }}/settings/cluster/{{ | |
end }}{{ end }}. | |
+ summary: Your upstream update recommendation service recommends you update | |
+ your cluster. | |
syn_component: openshift4-monitoring | |
- expr: 'cluster_version_available_updates > 0 | |
+ expr: 'sum by (channel,upstream) (cluster_version_available_updates) > 0 | |
' | |
labels: | |
@@ -582,13 +603,14 @@ | |
more CPU pressure is likely to cause a failover; increase available | |
CPU. | |
syn_component: openshift4-monitoring | |
- expr: '100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) | |
+ expr: '100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[1m])) | |
* 100) > 90 AND on (instance) label_replace( kube_node_role{role="master"}, | |
"instance", "$1", "node", "(.+)" ) | |
' | |
for: 5m | |
labels: | |
+ namespace: openshift-kube-apiserver | |
severity: critical | |
syn: 'true' | |
- alert: SYN_HighOverallControlPlaneCPU | |
@@ -605,11 +627,12 @@ | |
outage may cause a cascading failure; increase available CPU. | |
syn_component: openshift4-monitoring | |
expr: "sum(\n 100 - (avg by (instance) (rate(node_cpu_seconds_total{mode=\"\ | |
- idle\"}[5m])) * 100)\n AND on (instance) label_replace( kube_node_role{role=\"\ | |
+ idle\"}[1m])) * 100)\n AND on (instance) label_replace( kube_node_role{role=\"\ | |
master\"}, \"instance\", \"$1\", \"node\", \"(.+)\" )\n)\n/\ncount(kube_node_role{role=\"\ | |
master\"})\n> 60\n" | |
for: 10m | |
labels: | |
+ namespace: openshift-kube-apiserver | |
severity: warning | |
syn: 'true' | |
- name: syn-etcd | |
@@ -791,11 +814,12 @@ | |
syn_component: openshift4-monitoring | |
expr: vector(1) | |
labels: | |
+ namespace: openshift-monitoring | |
severity: none | |
syn: 'true' | |
- name: syn-k8s.rules | |
rules: [] | |
- - name: syn-kube-apiserver-slos | |
+ - name: syn-kube-apiserver-slos-basic | |
rules: | |
- alert: SYN_KubeAPIErrorBudgetBurn | |
annotations: | |
@@ -816,6 +840,7 @@ | |
for: 2m | |
labels: | |
long: 1h | |
+ namespace: openshift-kube-apiserver | |
severity: critical | |
short: 5m | |
syn: 'true' | |
@@ -838,53 +863,10 @@ | |
for: 15m | |
labels: | |
long: 6h | |
+ namespace: openshift-kube-apiserver | |
severity: critical | |
short: 30m | |
syn: 'true' | |
- - alert: SYN_KubeAPIErrorBudgetBurn | |
- annotations: | |
- description: The API server is burning too much error budget. This alert | |
- fires when too many requests are failing with high latency. Use the | |
- 'API Performance' monitoring dashboards to narrow down the request states | |
- and latency. The 'etcd' monitoring dashboards also provides metrics | |
- to help determine etcd stability and performance. | |
- summary: The API server is burning too much error budget. | |
- syn_component: openshift4-monitoring | |
- expr: 'sum(apiserver_request:burnrate1d) > (3.00 * 0.01000) | |
- | |
- and | |
- | |
- sum(apiserver_request:burnrate2h) > (3.00 * 0.01000) | |
- | |
- ' | |
- for: 1h | |
- labels: | |
- long: 1d | |
- severity: warning | |
- short: 2h | |
- syn: 'true' | |
- - alert: SYN_KubeAPIErrorBudgetBurn | |
- annotations: | |
- description: The API server is burning too much error budget. This alert | |
- fires when too many requests are failing with high latency. Use the | |
- 'API Performance' monitoring dashboards to narrow down the request states | |
- and latency. The 'etcd' monitoring dashboards also provides metrics | |
- to help determine etcd stability and performance. | |
- summary: The API server is burning too much error budget. | |
- syn_component: openshift4-monitoring | |
- expr: 'sum(apiserver_request:burnrate3d) > (1.00 * 0.01000) | |
- | |
- and | |
- | |
- sum(apiserver_request:burnrate6h) > (1.00 * 0.01000) | |
- | |
- ' | |
- for: 3h | |
- labels: | |
- long: 3d | |
- severity: warning | |
- short: 6h | |
- syn: 'true' | |
- name: syn-kube-apiserver.rules | |
rules: [] | |
- name: syn-kube-prometheus-general.rules | |
@@ -933,7 +915,7 @@ | |
$labels.container}} has been in waiting state for longer than 1 hour. | |
summary: Pod container waiting longer than 1 hour | |
syn_component: openshift4-monitoring | |
- expr: 'sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"}) | |
+ expr: 'sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{namespace=~"(openshift-.*|kube-.*|default)",job="kube-state-metrics"}) | |
> 0 | |
' | |
@@ -947,7 +929,7 @@ | |
$labels.daemonset }} are running where they are not supposed to run.' | |
summary: DaemonSet pods are misscheduled. | |
syn_component: openshift4-monitoring | |
- expr: 'kube_daemonset_status_number_misscheduled{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"} | |
+ expr: 'kube_daemonset_status_number_misscheduled{namespace=~"(openshift-.*|kube-.*|default)",job="kube-state-metrics"} | |
> 0 | |
' | |
@@ -961,10 +943,9 @@ | |
$labels.daemonset }} are not scheduled.' | |
summary: DaemonSet pods are not scheduled. | |
syn_component: openshift4-monitoring | |
- expr: "kube_daemonset_status_desired_number_scheduled{namespace=~\"(openshift-.*|kube-.*|default|logging)\"\ | |
+ expr: "kube_daemonset_status_desired_number_scheduled{namespace=~\"(openshift-.*|kube-.*|default)\"\ | |
,job=\"kube-state-metrics\"}\n -\nkube_daemonset_status_current_number_scheduled{namespace=~\"\ | |
- (openshift-.*|kube-.*|default|logging)\",job=\"kube-state-metrics\"} >\ | |
- \ 0\n" | |
+ (openshift-.*|kube-.*|default)\",job=\"kube-state-metrics\"} > 0\n" | |
for: 10m | |
labels: | |
severity: warning | |
@@ -976,20 +957,18 @@ | |
summary: DaemonSet rollout is stuck. | |
syn_component: openshift4-monitoring | |
expr: "(\n (\n kube_daemonset_status_current_number_scheduled{namespace=~\"\ | |
- (openshift-.*|kube-.*|default|logging)\",job=\"kube-state-metrics\"}\n\ | |
- \ !=\n kube_daemonset_status_desired_number_scheduled{namespace=~\"\ | |
- (openshift-.*|kube-.*|default|logging)\",job=\"kube-state-metrics\"}\n\ | |
- \ ) or (\n kube_daemonset_status_number_misscheduled{namespace=~\"\ | |
- (openshift-.*|kube-.*|default|logging)\",job=\"kube-state-metrics\"}\n\ | |
- \ !=\n 0\n ) or (\n kube_daemonset_updated_number_scheduled{namespace=~\"\ | |
- (openshift-.*|kube-.*|default|logging)\",job=\"kube-state-metrics\"}\n\ | |
- \ !=\n kube_daemonset_status_desired_number_scheduled{namespace=~\"\ | |
- (openshift-.*|kube-.*|default|logging)\",job=\"kube-state-metrics\"}\n\ | |
- \ ) or (\n kube_daemonset_status_number_available{namespace=~\"(openshift-.*|kube-.*|default|logging)\"\ | |
- ,job=\"kube-state-metrics\"}\n !=\n kube_daemonset_status_desired_number_scheduled{namespace=~\"\ | |
- (openshift-.*|kube-.*|default|logging)\",job=\"kube-state-metrics\"}\n\ | |
- \ )\n) and (\n changes(kube_daemonset_updated_number_scheduled{namespace=~\"\ | |
- (openshift-.*|kube-.*|default|logging)\",job=\"kube-state-metrics\"}[5m])\n\ | |
+ (openshift-.*|kube-.*|default)\",job=\"kube-state-metrics\"}\n !=\n\ | |
+ \ kube_daemonset_status_desired_number_scheduled{namespace=~\"(openshift-.*|kube-.*|default)\"\ | |
+ ,job=\"kube-state-metrics\"}\n ) or (\n kube_daemonset_status_number_misscheduled{namespace=~\"\ | |
+ (openshift-.*|kube-.*|default)\",job=\"kube-state-metrics\"}\n !=\n\ | |
+ \ 0\n ) or (\n kube_daemonset_updated_number_scheduled{namespace=~\"\ | |
+ (openshift-.*|kube-.*|default)\",job=\"kube-state-metrics\"}\n !=\n\ | |
+ \ kube_daemonset_status_desired_number_scheduled{namespace=~\"(openshift-.*|kube-.*|default)\"\ | |
+ ,job=\"kube-state-metrics\"}\n ) or (\n kube_daemonset_status_number_available{namespace=~\"\ | |
+ (openshift-.*|kube-.*|default)\",job=\"kube-state-metrics\"}\n !=\n\ | |
+ \ kube_daemonset_status_desired_number_scheduled{namespace=~\"(openshift-.*|kube-.*|default)\"\ | |
+ ,job=\"kube-state-metrics\"}\n )\n) and (\n changes(kube_daemonset_updated_number_scheduled{namespace=~\"\ | |
+ (openshift-.*|kube-.*|default)\",job=\"kube-state-metrics\"}[5m])\n \ | |
\ ==\n 0\n)\n" | |
for: 30m | |
labels: | |
@@ -1002,46 +981,44 @@ | |
has not been rolled back. | |
summary: Deployment generation mismatch due to possible roll-back | |
syn_component: openshift4-monitoring | |
- expr: "kube_deployment_status_observed_generation{namespace=~\"(openshift-.*|kube-.*|default|logging)\"\ | |
+ expr: "kube_deployment_status_observed_generation{namespace=~\"(openshift-.*|kube-.*|default)\"\ | |
,job=\"kube-state-metrics\"}\n !=\nkube_deployment_metadata_generation{namespace=~\"\ | |
- (openshift-.*|kube-.*|default|logging)\",job=\"kube-state-metrics\"}\n" | |
+ (openshift-.*|kube-.*|default)\",job=\"kube-state-metrics\"}\n" | |
for: 15m | |
labels: | |
severity: warning | |
syn: 'true' | |
- alert: SYN_KubeHpaMaxedOut | |
annotations: | |
- description: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has been running | |
- at max replicas for longer than 15 minutes. | |
+ description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} | |
+ has been running at max replicas for longer than 15 minutes. | |
summary: HPA is running at max replicas | |
syn_component: openshift4-monitoring | |
expr: "kube_horizontalpodautoscaler_status_current_replicas{namespace=~\"\ | |
- (openshift-.*|kube-.*|default|logging)\",job=\"kube-state-metrics\"}\n\ | |
- \ ==\nkube_horizontalpodautoscaler_spec_max_replicas{namespace=~\"(openshift-.*|kube-.*|default|logging)\"\ | |
- ,job=\"kube-state-metrics\"}\n" | |
+ (openshift-.*|kube-.*|default)\",job=\"kube-state-metrics\"}\n ==\nkube_horizontalpodautoscaler_spec_max_replicas{namespace=~\"\ | |
+ (openshift-.*|kube-.*|default)\",job=\"kube-state-metrics\"}\n" | |
for: 15m | |
labels: | |
severity: warning | |
syn: 'true' | |
- alert: SYN_KubeHpaReplicasMismatch | |
annotations: | |
- description: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has not matched | |
- the desired number of replicas for longer than 15 minutes. | |
+ description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} | |
+ has not matched the desired number of replicas for longer than 15 minutes. | |
summary: HPA has not matched descired number of replicas. | |
syn_component: openshift4-monitoring | |
expr: "(kube_horizontalpodautoscaler_status_desired_replicas{namespace=~\"\ | |
- (openshift-.*|kube-.*|default|logging)\",job=\"kube-state-metrics\"}\n\ | |
- \ !=\nkube_horizontalpodautoscaler_status_current_replicas{namespace=~\"\ | |
- (openshift-.*|kube-.*|default|logging)\",job=\"kube-state-metrics\"})\n\ | |
- \ and\n(kube_horizontalpodautoscaler_status_current_replicas{namespace=~\"\ | |
- (openshift-.*|kube-.*|default|logging)\",job=\"kube-state-metrics\"}\n\ | |
- \ >\nkube_horizontalpodautoscaler_spec_min_replicas{namespace=~\"(openshift-.*|kube-.*|default|logging)\"\ | |
- ,job=\"kube-state-metrics\"})\n and\n(kube_horizontalpodautoscaler_status_current_replicas{namespace=~\"\ | |
- (openshift-.*|kube-.*|default|logging)\",job=\"kube-state-metrics\"}\n\ | |
- \ <\nkube_horizontalpodautoscaler_spec_max_replicas{namespace=~\"(openshift-.*|kube-.*|default|logging)\"\ | |
- ,job=\"kube-state-metrics\"})\n and\nchanges(kube_horizontalpodautoscaler_status_current_replicas{namespace=~\"\ | |
- (openshift-.*|kube-.*|default|logging)\",job=\"kube-state-metrics\"}[15m])\ | |
- \ == 0\n" | |
+ (openshift-.*|kube-.*|default)\",job=\"kube-state-metrics\"}\n !=\nkube_horizontalpodautoscaler_status_current_replicas{namespace=~\"\ | |
+ (openshift-.*|kube-.*|default)\",job=\"kube-state-metrics\"})\n and\n\ | |
+ (kube_horizontalpodautoscaler_status_current_replicas{namespace=~\"(openshift-.*|kube-.*|default)\"\ | |
+ ,job=\"kube-state-metrics\"}\n >\nkube_horizontalpodautoscaler_spec_min_replicas{namespace=~\"\ | |
+ (openshift-.*|kube-.*|default)\",job=\"kube-state-metrics\"})\n and\n\ | |
+ (kube_horizontalpodautoscaler_status_current_replicas{namespace=~\"(openshift-.*|kube-.*|default)\"\ | |
+ ,job=\"kube-state-metrics\"}\n <\nkube_horizontalpodautoscaler_spec_max_replicas{namespace=~\"\ | |
+ (openshift-.*|kube-.*|default)\",job=\"kube-state-metrics\"})\n and\n\ | |
+ changes(kube_horizontalpodautoscaler_status_current_replicas{namespace=~\"\ | |
+ (openshift-.*|kube-.*|default)\",job=\"kube-state-metrics\"}[15m]) ==\ | |
+ \ 0\n" | |
for: 15m | |
labels: | |
severity: warning | |
@@ -1052,8 +1029,8 @@ | |
more than 12 hours to complete. | |
summary: Job did not complete in time | |
syn_component: openshift4-monitoring | |
- expr: 'kube_job_spec_completions{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"} | |
- - kube_job_status_succeeded{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"} > | |
+ expr: 'kube_job_spec_completions{namespace=~"(openshift-.*|kube-.*|default)",job="kube-state-metrics"} | |
+ - kube_job_status_succeeded{namespace=~"(openshift-.*|kube-.*|default)",job="kube-state-metrics"} > | |
0 | |
' | |
@@ -1066,9 +1043,10 @@ | |
description: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed | |
to complete. Removing failed job after investigation should clear this | |
alert. | |
+ runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/KubeJobFailed.md | |
summary: Job failed to complete. | |
syn_component: openshift4-monitoring | |
- expr: 'kube_job_failed{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"} > | |
+ expr: 'kube_job_failed{namespace=~"(openshift-.*|kube-.*|default)",job="kube-state-metrics"} > | |
0 | |
' | |
@@ -1078,12 +1056,13 @@ | |
syn: 'true' | |
- alert: SYN_KubePodCrashLooping | |
annotations: | |
- description: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container | |
- }}) is restarting {{ printf "%.2f" $value }} times / 10 minutes. | |
+ description: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container | |
+ }}) is in waiting state (reason: "CrashLoopBackOff").' | |
summary: Pod is crash looping. | |
syn_component: openshift4-monitoring | |
- expr: 'rate(kube_pod_container_status_restarts_total{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"}[10m]) | |
- * 60 * 5 > 0 | |
+ expr: 'max_over_time(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff", | |
+ namespace=~"(openshift-.*|kube-.*|default)",job="kube-state-metrics"}[5m]) | |
+ >= 1 | |
' | |
for: 15m | |
@@ -1094,10 +1073,11 @@ | |
annotations: | |
description: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in | |
a non-ready state for longer than 15 minutes. | |
+ runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/KubePodNotReady.md | |
summary: Pod has been in a non-ready state for more than 15 minutes. | |
syn_component: openshift4-monitoring | |
expr: "sum by (namespace, pod) (\n max by(namespace, pod) (\n kube_pod_status_phase{namespace=~\"\ | |
- (openshift-.*|kube-.*|default|logging)\",job=\"kube-state-metrics\", phase=~\"\ | |
+ (openshift-.*|kube-.*|default)\",job=\"kube-state-metrics\", phase=~\"\ | |
Pending|Unknown\"}\n ) * on(namespace, pod) group_left(owner_kind) topk\ | |
\ by(namespace, pod) (\n 1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!=\"\ | |
Job\"})\n )\n) > 0\n" | |
@@ -1112,9 +1092,9 @@ | |
has not been rolled back. | |
summary: StatefulSet generation mismatch due to possible roll-back | |
syn_component: openshift4-monitoring | |
- expr: "kube_statefulset_status_observed_generation{namespace=~\"(openshift-.*|kube-.*|default|logging)\"\ | |
+ expr: "kube_statefulset_status_observed_generation{namespace=~\"(openshift-.*|kube-.*|default)\"\ | |
,job=\"kube-state-metrics\"}\n !=\nkube_statefulset_metadata_generation{namespace=~\"\ | |
- (openshift-.*|kube-.*|default|logging)\",job=\"kube-state-metrics\"}\n" | |
+ (openshift-.*|kube-.*|default)\",job=\"kube-state-metrics\"}\n" | |
for: 15m | |
labels: | |
severity: warning | |
@@ -1126,12 +1106,11 @@ | |
minutes. | |
summary: Deployment has not matched the expected number of replicas. | |
syn_component: openshift4-monitoring | |
- expr: "(\n kube_statefulset_status_replicas_ready{namespace=~\"(openshift-.*|kube-.*|default|logging)\"\ | |
+ expr: "(\n kube_statefulset_status_replicas_ready{namespace=~\"(openshift-.*|kube-.*|default)\"\ | |
,job=\"kube-state-metrics\"}\n !=\n kube_statefulset_status_replicas{namespace=~\"\ | |
- (openshift-.*|kube-.*|default|logging)\",job=\"kube-state-metrics\"}\n\ | |
- ) and (\n changes(kube_statefulset_status_replicas_updated{namespace=~\"\ | |
- (openshift-.*|kube-.*|default|logging)\",job=\"kube-state-metrics\"}[10m])\n\ | |
- \ ==\n 0\n)\n" | |
+ (openshift-.*|kube-.*|default)\",job=\"kube-state-metrics\"}\n) and (\n\ | |
+ \ changes(kube_statefulset_status_replicas_updated{namespace=~\"(openshift-.*|kube-.*|default)\"\ | |
+ ,job=\"kube-state-metrics\"}[10m])\n ==\n 0\n)\n" | |
for: 15m | |
labels: | |
severity: warning | |
@@ -1143,14 +1122,13 @@ | |
summary: StatefulSet update has not been rolled out. | |
syn_component: openshift4-monitoring | |
expr: "(\n max without (revision) (\n kube_statefulset_status_current_revision{namespace=~\"\ | |
- (openshift-.*|kube-.*|default|logging)\",job=\"kube-state-metrics\"}\n\ | |
- \ unless\n kube_statefulset_status_update_revision{namespace=~\"\ | |
- (openshift-.*|kube-.*|default|logging)\",job=\"kube-state-metrics\"}\n\ | |
- \ )\n *\n (\n kube_statefulset_replicas{namespace=~\"(openshift-.*|kube-.*|default|logging)\"\ | |
- ,job=\"kube-state-metrics\"}\n !=\n kube_statefulset_status_replicas_updated{namespace=~\"\ | |
- (openshift-.*|kube-.*|default|logging)\",job=\"kube-state-metrics\"}\n\ | |
- \ )\n) and (\n changes(kube_statefulset_status_replicas_updated{namespace=~\"\ | |
- (openshift-.*|kube-.*|default|logging)\",job=\"kube-state-metrics\"}[5m])\n\ | |
+ (openshift-.*|kube-.*|default)\",job=\"kube-state-metrics\"}\n unless\n\ | |
+ \ kube_statefulset_status_update_revision{namespace=~\"(openshift-.*|kube-.*|default)\"\ | |
+ ,job=\"kube-state-metrics\"}\n )\n *\n (\n kube_statefulset_replicas{namespace=~\"\ | |
+ (openshift-.*|kube-.*|default)\",job=\"kube-state-metrics\"}\n !=\n\ | |
+ \ kube_statefulset_status_replicas_updated{namespace=~\"(openshift-.*|kube-.*|default)\"\ | |
+ ,job=\"kube-state-metrics\"}\n )\n) and (\n changes(kube_statefulset_status_replicas_updated{namespace=~\"\ | |
+ (openshift-.*|kube-.*|default)\",job=\"kube-state-metrics\"}[5m])\n \ | |
\ ==\n 0\n)\n" | |
for: 15m | |
labels: | |
@@ -1163,15 +1141,21 @@ | |
- alert: SYN_KubeCPUOvercommit | |
annotations: | |
description: Cluster has overcommitted CPU resource requests for Pods | |
- and cannot tolerate node failure. | |
+ by {{ $value }} CPU shares and cannot tolerate node failure. | |
summary: Cluster has overcommitted CPU resource requests. | |
syn_component: openshift4-monitoring | |
- expr: "sum(namespace_cpu:kube_pod_container_resource_requests:sum{})\n \ | |
- \ /\nsum(kube_node_status_allocatable{resource=\"cpu\"})\n >\n((count(kube_node_status_allocatable{resource=\"\ | |
- cpu\"}) > 1) - 1) / count(kube_node_status_allocatable{resource=\"cpu\"\ | |
- })\n" | |
- for: 5m | |
+ expr: 'sum(namespace_cpu:kube_pod_container_resource_requests:sum{}) - (sum(kube_node_status_allocatable{resource="cpu"}) | |
+ - max(kube_node_status_allocatable{resource="cpu"})) > 0 | |
+ | |
+ and | |
+ | |
+ (sum(kube_node_status_allocatable{resource="cpu"}) - max(kube_node_status_allocatable{resource="cpu"})) | |
+ > 0 | |
+ | |
+ ' | |
+ for: 10m | |
labels: | |
+ namespace: kube-system | |
severity: warning | |
syn: 'true' | |
- alert: SYN_KubeCPUQuotaOvercommit | |
@@ -1179,7 +1163,7 @@ | |
description: Cluster has overcommitted CPU resource requests for Namespaces. | |
summary: Cluster has overcommitted CPU resource requests. | |
syn_component: openshift4-monitoring | |
- expr: "sum(kube_resourcequota{namespace=~\"(openshift-.*|kube-.*|default|logging)\"\ | |
+ expr: "sum(kube_resourcequota{namespace=~\"(openshift-.*|kube-.*|default)\"\ | |
,job=\"kube-state-metrics\", type=\"hard\", resource=\"cpu\"})\n /\n\ | |
sum(kube_node_status_allocatable{resource=\"cpu\"})\n > 1.5\n" | |
for: 5m | |
@@ -1189,15 +1173,22 @@ | |
- alert: SYN_KubeMemoryOvercommit | |
annotations: | |
description: Cluster has overcommitted memory resource requests for Pods | |
- and cannot tolerate node failure. | |
+ by {{ $value }} bytes and cannot tolerate node failure. | |
summary: Cluster has overcommitted memory resource requests. | |
syn_component: openshift4-monitoring | |
- expr: "sum(namespace_memory:kube_pod_container_resource_requests:sum{})\n\ | |
- \ /\nsum(kube_node_status_allocatable{resource=\"memory\"})\n >\n((count(kube_node_status_allocatable{resource=\"\ | |
- memory\"}) > 1) - 1)\n /\ncount(kube_node_status_allocatable{resource=\"\ | |
- memory\"})\n" | |
- for: 5m | |
+ expr: 'sum(namespace_memory:kube_pod_container_resource_requests:sum{}) | |
+ - (sum(kube_node_status_allocatable{resource="memory"}) - max(kube_node_status_allocatable{resource="memory"})) | |
+ > 0 | |
+ | |
+ and | |
+ | |
+ (sum(kube_node_status_allocatable{resource="memory"}) - max(kube_node_status_allocatable{resource="memory"})) | |
+ > 0 | |
+ | |
+ ' | |
+ for: 10m | |
labels: | |
+ namespace: kube-system | |
severity: warning | |
syn: 'true' | |
- alert: SYN_KubeMemoryQuotaOvercommit | |
@@ -1205,7 +1196,7 @@ | |
description: Cluster has overcommitted memory resource requests for Namespaces. | |
summary: Cluster has overcommitted memory resource requests. | |
syn_component: openshift4-monitoring | |
- expr: "sum(kube_resourcequota{namespace=~\"(openshift-.*|kube-.*|default|logging)\"\ | |
+ expr: "sum(kube_resourcequota{namespace=~\"(openshift-.*|kube-.*|default)\"\ | |
,job=\"kube-state-metrics\", type=\"hard\", resource=\"memory\"})\n /\n\ | |
sum(kube_node_status_allocatable{resource=\"memory\",job=\"kube-state-metrics\"\ | |
})\n > 1.5\n" | |
@@ -1219,9 +1210,9 @@ | |
}} of its {{ $labels.resource }} quota. | |
summary: Namespace quota is going to be full. | |
syn_component: openshift4-monitoring | |
- expr: "kube_resourcequota{namespace=~\"(openshift-.*|kube-.*|default|logging)\"\ | |
+ expr: "kube_resourcequota{namespace=~\"(openshift-.*|kube-.*|default)\"\ | |
,job=\"kube-state-metrics\", type=\"used\"}\n / ignoring(instance, job,\ | |
- \ type)\n(kube_resourcequota{namespace=~\"(openshift-.*|kube-.*|default|logging)\"\ | |
+ \ type)\n(kube_resourcequota{namespace=~\"(openshift-.*|kube-.*|default)\"\ | |
,job=\"kube-state-metrics\", type=\"hard\"} > 0)\n > 0.9 < 1\n" | |
for: 15m | |
labels: | |
@@ -1233,9 +1224,9 @@ | |
}} of its {{ $labels.resource }} quota. | |
summary: Namespace quota has exceeded the limits. | |
syn_component: openshift4-monitoring | |
- expr: "kube_resourcequota{namespace=~\"(openshift-.*|kube-.*|default|logging)\"\ | |
+ expr: "kube_resourcequota{namespace=~\"(openshift-.*|kube-.*|default)\"\ | |
,job=\"kube-state-metrics\", type=\"used\"}\n / ignoring(instance, job,\ | |
- \ type)\n(kube_resourcequota{namespace=~\"(openshift-.*|kube-.*|default|logging)\"\ | |
+ \ type)\n(kube_resourcequota{namespace=~\"(openshift-.*|kube-.*|default)\"\ | |
,job=\"kube-state-metrics\", type=\"hard\"} > 0)\n > 1\n" | |
for: 15m | |
labels: | |
@@ -1247,9 +1238,9 @@ | |
}} of its {{ $labels.resource }} quota. | |
summary: Namespace quota is fully used. | |
syn_component: openshift4-monitoring | |
- expr: "kube_resourcequota{namespace=~\"(openshift-.*|kube-.*|default|logging)\"\ | |
+ expr: "kube_resourcequota{namespace=~\"(openshift-.*|kube-.*|default)\"\ | |
,job=\"kube-state-metrics\", type=\"used\"}\n / ignoring(instance, job,\ | |
- \ type)\n(kube_resourcequota{namespace=~\"(openshift-.*|kube-.*|default|logging)\"\ | |
+ \ type)\n(kube_resourcequota{namespace=~\"(openshift-.*|kube-.*|default)\"\ | |
,job=\"kube-state-metrics\", type=\"hard\"} > 0)\n == 1\n" | |
for: 15m | |
labels: | |
@@ -1263,7 +1254,7 @@ | |
status {{ $labels.phase }}. | |
summary: PersistentVolume is having issues with provisioning. | |
syn_component: openshift4-monitoring | |
- expr: 'kube_persistentvolume_status_phase{phase=~"Failed|Pending",namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"} | |
+ expr: 'kube_persistentvolume_status_phase{phase=~"Failed|Pending",namespace=~"(openshift-.*|kube-.*|default)",job="kube-state-metrics"} | |
> 0 | |
' | |
@@ -1276,14 +1267,14 @@ | |
description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim | |
}} in Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage | |
}} free. | |
+ runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/KubePersistentVolumeFillingUp.md | |
summary: PersistentVolume is filling up. | |
syn_component: openshift4-monitoring | |
- expr: "(\n kubelet_volume_stats_available_bytes{namespace=~\"(openshift-.*|kube-.*|default|logging)\"\ | |
+ expr: "(\n kubelet_volume_stats_available_bytes{namespace=~\"(openshift-.*|kube-.*|default)\"\ | |
,job=\"kubelet\", metrics_path=\"/metrics\"}\n /\n kubelet_volume_stats_capacity_bytes{namespace=~\"\ | |
- (openshift-.*|kube-.*|default|logging)\",job=\"kubelet\", metrics_path=\"\ | |
- /metrics\"}\n) < 0.03\nand\nkubelet_volume_stats_used_bytes{namespace=~\"\ | |
- (openshift-.*|kube-.*|default|logging)\",job=\"kubelet\", metrics_path=\"\ | |
- /metrics\"} > 0\n" | |
+ (openshift-.*|kube-.*|default)\",job=\"kubelet\", metrics_path=\"/metrics\"\ | |
+ }\n) < 0.03\nand\nkubelet_volume_stats_used_bytes{namespace=~\"(openshift-.*|kube-.*|default)\"\ | |
+ ,job=\"kubelet\", metrics_path=\"/metrics\"} > 0\n" | |
for: 1m | |
labels: | |
severity: critical | |
@@ -1294,16 +1285,16 @@ | |
{{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace | |
}} is expected to fill up within four days. Currently {{ $value | humanizePercentage | |
}} is available. | |
+ runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/KubePersistentVolumeFillingUp.md | |
summary: PersistentVolume is filling up. | |
syn_component: openshift4-monitoring | |
- expr: "(\n kubelet_volume_stats_available_bytes{namespace=~\"(openshift-.*|kube-.*|default|logging)\"\ | |
+ expr: "(\n kubelet_volume_stats_available_bytes{namespace=~\"(openshift-.*|kube-.*|default)\"\ | |
,job=\"kubelet\", metrics_path=\"/metrics\"}\n /\n kubelet_volume_stats_capacity_bytes{namespace=~\"\ | |
- (openshift-.*|kube-.*|default|logging)\",job=\"kubelet\", metrics_path=\"\ | |
- /metrics\"}\n) < 0.15\nand\nkubelet_volume_stats_used_bytes{namespace=~\"\ | |
- (openshift-.*|kube-.*|default|logging)\",job=\"kubelet\", metrics_path=\"\ | |
- /metrics\"} > 0\nand\npredict_linear(kubelet_volume_stats_available_bytes{namespace=~\"\ | |
- (openshift-.*|kube-.*|default|logging)\",job=\"kubelet\", metrics_path=\"\ | |
- /metrics\"}[6h], 4 * 24 * 3600) < 0\n" | |
+ (openshift-.*|kube-.*|default)\",job=\"kubelet\", metrics_path=\"/metrics\"\ | |
+ }\n) < 0.15\nand\nkubelet_volume_stats_used_bytes{namespace=~\"(openshift-.*|kube-.*|default)\"\ | |
+ ,job=\"kubelet\", metrics_path=\"/metrics\"} > 0\nand\npredict_linear(kubelet_volume_stats_available_bytes{namespace=~\"\ | |
+ (openshift-.*|kube-.*|default)\",job=\"kubelet\", metrics_path=\"/metrics\"\ | |
+ }[6h], 4 * 24 * 3600) < 0\n" | |
for: 1h | |
labels: | |
severity: warning | |
@@ -1317,8 +1308,8 @@ | |
summary: Kubernetes API server client is experiencing errors. | |
syn_component: openshift4-monitoring | |
expr: "(sum(rate(rest_client_requests_total{code=~\"5..\"}[5m])) by (instance,\ | |
- \ job)\n /\nsum(rate(rest_client_requests_total[5m])) by (instance, job))\n\ | |
- > 0.01\n" | |
+ \ job, namespace)\n /\nsum(rate(rest_client_requests_total[5m])) by (instance,\ | |
+ \ job, namespace))\n> 0.01\n" | |
for: 15m | |
labels: | |
severity: warning | |
@@ -1332,7 +1323,7 @@ | |
summary: An aggregated API is down. | |
syn_component: openshift4-monitoring | |
expr: '(1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[10m]))) | |
- * 100 < 70 | |
+ * 100 < 85 | |
' | |
for: 5m | |
@@ -1356,6 +1347,7 @@ | |
- alert: SYN_KubeAPIDown | |
annotations: | |
description: KubeAPI has disappeared from Prometheus target discovery. | |
+ runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/KubeAPIDown.md | |
summary: Target disappeared from Prometheus target discovery. | |
syn_component: openshift4-monitoring | |
expr: 'absent(up{job="apiserver"} == 1) | |
@@ -1386,6 +1378,7 @@ | |
- alert: SYN_KubeNodeNotReady | |
annotations: | |
description: '{{ $labels.node }} has been unready for more than 15 minutes.' | |
+ runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/KubeNodeNotReady.md | |
summary: Node is not ready. | |
syn_component: openshift4-monitoring | |
expr: 'kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} | |
@@ -1442,6 +1435,7 @@ | |
- alert: SYN_KubeletDown | |
annotations: | |
description: Kubelet has disappeared from Prometheus target discovery. | |
+ runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/KubeletDown.md | |
summary: Target disappeared from Prometheus target discovery. | |
syn_component: openshift4-monitoring | |
expr: 'absent(up{job="kubelet", metrics_path="/metrics"} == 1) | |
@@ -1449,6 +1443,7 @@ | |
' | |
for: 15m | |
labels: | |
+ namespace: kube-system | |
severity: critical | |
syn: 'true' | |
- alert: SYN_KubeletPlegDurationHigh | |
@@ -1650,6 +1645,21 @@ | |
labels: | |
severity: critical | |
syn: 'true' | |
+ - name: syn-machine-health-check-unterminated-short-circuit | |
+ rules: | |
+ - alert: SYN_MachineHealthCheckUnterminatedShortCircuit | |
+ annotation: | |
+ message: machine health check {{ $labels.name }} has been disabled by | |
+ short circuit for more than 30 minutes | |
+ annotations: | |
+ syn_component: openshift4-monitoring | |
+ expr: 'mapi_machinehealthcheck_short_circuit == 1 | |
+ | |
+ ' | |
+ for: 30m | |
+ labels: | |
+ severity: warning | |
+ syn: 'true' | |
- name: syn-machine-not-yet-deleted | |
rules: | |
- alert: SYN_MachineNotYetDeleted | |
@@ -1692,6 +1702,27 @@ | |
labels: | |
severity: warning | |
syn: 'true' | |
+ - name: syn-master-nodes-high-memory-usage | |
+ rules: | |
+ - alert: SYN_MasterNodesHighMemoryUsage | |
+ annotations: | |
+ message: Memory usage of {{ $value | humanize }} on {{ $labels.node }} | |
+ exceeds 90%. Master nodes starved of memory could result in degraded | |
+ performance of the control plane. | |
+ syn_component: openshift4-monitoring | |
+ expr: '((sum(node_memory_MemTotal_bytes AND on (instance) label_replace( | |
+ kube_node_role{role="master"}, "instance", "$1", "node", "(.+)" )) - sum(node_memory_MemFree_bytes | |
+ + node_memory_Buffers_bytes + node_memory_Cached_bytes AND on (instance) | |
+ label_replace( kube_node_role{role="master"}, "instance", "$1", "node", | |
+ "(.+)" ))) / sum(node_memory_MemTotal_bytes AND on (instance) label_replace( | |
+ kube_node_role{role="master"}, "instance", "$1", "node", "(.+)" )) * 100) | |
+ > 90 | |
+ | |
+ ' | |
+ for: 15m | |
+ labels: | |
+ severity: warning | |
+ syn: 'true' | |
- name: syn-mcd-drain-error | |
rules: | |
- alert: SYN_MCDDrainError | |
@@ -1774,10 +1805,37 @@ | |
labels: | |
severity: warning | |
syn: 'true' | |
+ - alert: SYN_NodeFileDescriptorLimit | |
+ annotations: | |
+ description: File descriptors limit at {{ $labels.instance }} is currently | |
+ at {{ printf "%.2f" $value }}%. | |
+ runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/NodeFileDescriptorLimit.md | |
+ summary: Kernel is predicted to exhaust file descriptors limit soon. | |
+ syn_component: openshift4-monitoring | |
+ expr: "(\n node_filefd_allocated{job=\"node-exporter\"} * 100 / node_filefd_maximum{job=\"\ | |
+ node-exporter\"} > 70\n)\n" | |
+ for: 15m | |
+ labels: | |
+ severity: warning | |
+ syn: 'true' | |
+ - alert: SYN_NodeFileDescriptorLimit | |
+ annotations: | |
+ description: File descriptors limit at {{ $labels.instance }} is currently | |
+ at {{ printf "%.2f" $value }}%. | |
+ runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/NodeFileDescriptorLimit.md | |
+ summary: Kernel is predicted to exhaust file descriptors limit soon. | |
+ syn_component: openshift4-monitoring | |
+ expr: "(\n node_filefd_allocated{job=\"node-exporter\"} * 100 / node_filefd_maximum{job=\"\ | |
+ node-exporter\"} > 90\n)\n" | |
+ for: 15m | |
+ labels: | |
+ severity: critical | |
+ syn: 'true' | |
- alert: SYN_NodeFilesystemAlmostOutOfFiles | |
annotations: | |
description: Filesystem on {{ $labels.device }} at {{ $labels.instance | |
}} has only {{ printf "%.2f" $value }}% available inodes left. | |
+ runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/NodeFilesystemAlmostOutOfFiles.md | |
summary: Filesystem has less than 5% inodes left. | |
syn_component: openshift4-monitoring | |
expr: "(\n node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\"\ | |
@@ -1792,6 +1850,7 @@ | |
annotations: | |
description: Filesystem on {{ $labels.device }} at {{ $labels.instance | |
}} has only {{ printf "%.2f" $value }}% available inodes left. | |
+ runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/NodeFilesystemAlmostOutOfFiles.md | |
summary: Filesystem has less than 3% inodes left. | |
syn_component: openshift4-monitoring | |
expr: "(\n node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\"\ | |
@@ -1806,13 +1865,14 @@ | |
annotations: | |
description: Filesystem on {{ $labels.device }} at {{ $labels.instance | |
}} has only {{ printf "%.2f" $value }}% available space left. | |
+ runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/NodeFilesystemAlmostOutOfSpace.md | |
summary: Filesystem has less than 5% space left. | |
syn_component: openshift4-monitoring | |
expr: "(\n node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\ | |
\"} / node_filesystem_size_bytes{job=\"node-exporter\",fstype!=\"\"} *\ | |
\ 100 < 5\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\ | |
\"} == 0\n)\n" | |
- for: 1h | |
+ for: 30m | |
labels: | |
severity: warning | |
syn: 'true' | |
@@ -1820,13 +1880,14 @@ | |
annotations: | |
description: Filesystem on {{ $labels.device }} at {{ $labels.instance | |
}} has only {{ printf "%.2f" $value }}% available space left. | |
+ runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/NodeFilesystemAlmostOutOfSpace.md | |
summary: Filesystem has less than 3% space left. | |
syn_component: openshift4-monitoring | |
expr: "(\n node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\ | |
\"} / node_filesystem_size_bytes{job=\"node-exporter\",fstype!=\"\"} *\ | |
\ 100 < 3\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\ | |
\"} == 0\n)\n" | |
- for: 1h | |
+ for: 30m | |
labels: | |
severity: critical | |
syn: 'true' | |
@@ -1835,6 +1896,7 @@ | |
description: Filesystem on {{ $labels.device }} at {{ $labels.instance | |
}} has only {{ printf "%.2f" $value }}% available inodes left and is | |
filling up. | |
+ runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/NodeFilesystemFilesFillingUp.md | |
summary: Filesystem is predicted to run out of inodes within the next | |
24 hours. | |
syn_component: openshift4-monitoring | |
@@ -1852,6 +1914,7 @@ | |
description: Filesystem on {{ $labels.device }} at {{ $labels.instance | |
}} has only {{ printf "%.2f" $value }}% available inodes left and is | |
filling up fast. | |
+ runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/NodeFilesystemFilesFillingUp.md | |
summary: Filesystem is predicted to run out of inodes within the next | |
4 hours. | |
syn_component: openshift4-monitoring | |
@@ -1869,6 +1932,7 @@ | |
description: Filesystem on {{ $labels.device }} at {{ $labels.instance | |
}} has only {{ printf "%.2f" $value }}% available space left and is | |
filling up. | |
+ runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/NodeFilesystemSpaceFillingUp.md | |
summary: Filesystem is predicted to run out of space within the next 24 | |
hours. | |
syn_component: openshift4-monitoring | |
@@ -1886,6 +1950,7 @@ | |
description: Filesystem on {{ $labels.device }} at {{ $labels.instance | |
}} has only {{ printf "%.2f" $value }}% available space left and is | |
filling up fast. | |
+ runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/NodeFilesystemSpaceFillingUp.md | |
summary: Filesystem is predicted to run out of space within the next 4 | |
hours. | |
syn_component: openshift4-monitoring | |
@@ -1945,6 +2010,7 @@ | |
description: RAID array '{{ $labels.device }}' on {{ $labels.instance | |
}} is in degraded state due to one or more disks failures. Number of | |
spare drives is insufficient to fix issue automatically. | |
+ runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/NodeRAIDDegraded.md | |
summary: RAID Array is degraded | |
syn_component: openshift4-monitoring | |
expr: 'node_md_disks_required - ignoring (state) (node_md_disks{state="active"}) | |
@@ -1985,8 +2051,10 @@ | |
rules: | |
- alert: SYN_NodeNetworkInterfaceFlapping | |
annotations: | |
- message: Network interface "{{ $labels.device }}" changing it's up status | |
- often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }} | |
+ description: Network interface "{{ $labels.device }}" changing its up | |
+ status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod | |
+ }} | |
+ summary: Network interface is often changing its status | |
syn_component: openshift4-monitoring | |
expr: 'changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) | |
> 2 | |
@@ -2091,12 +2159,12 @@ | |
syn_component: openshift4-monitoring | |
expr: "count without (node)\n(\n group by (node, workload, namespace)\n\ | |
\ (\n kube_pod_info{node!=\"\"}\n * on(namespace,pod) group_left(workload)\n\ | |
- \ (\n kube_pod_spec_volumes_persistentvolumeclaims_info\n \ | |
+ \ (\n max by(namespace, pod, workload) (kube_pod_spec_volumes_persistentvolumeclaims_info)\n\ | |
\ * on(namespace,pod) group_left(workload)\n (\n namespace_workload_pod:kube_pod_owner:relabel\n\ | |
\ * on(namespace,workload,workload_type) group_left()\n \ | |
\ (\n count without(pod) (namespace_workload_pod:kube_pod_owner:relabel{namespace=~\"\ | |
- (openshift-.*|kube-.*|default|logging)\"}) > 1\n )\n )\n \ | |
- \ )\n )\n) == 1\n" | |
+ (openshift-.*|kube-.*|default)\"}) > 1\n )\n )\n )\n )\n\ | |
+ ) == 1\n" | |
for: 1h | |
labels: | |
severity: warning | |
@@ -2174,21 +2242,24 @@ | |
rules: | |
- alert: SYN_AlertmanagerReceiversNotConfigured | |
annotations: | |
- message: Alerts are not configured to be sent to a notification system, | |
+ description: Alerts are not configured to be sent to a notification system, | |
meaning that you may not be notified in a timely fashion when important | |
failures occur. Check the OpenShift documentation to learn how to configure | |
notifications with Alertmanager. | |
+ summary: Receivers (notification integrations) are not configured on Alertmanager | |
syn_component: openshift4-monitoring | |
- expr: cluster:alertmanager_routing_enabled:max == 0 | |
+ expr: cluster:alertmanager_integrations:max == 0 | |
for: 10m | |
labels: | |
+ namespace: openshift-monitoring | |
severity: warning | |
syn: 'true' | |
- alert: SYN_ClusterMonitoringOperatorReconciliationErrors | |
annotations: | |
- message: Cluster Monitoring Operator is experiencing unexpected reconciliation | |
- errors. Inspect the cluster-monitoring-operator log for potential root | |
- causes. | |
+ description: Errors are occurring during reconciliation cycles. Inspect | |
+ the cluster-monitoring-operator log for potential root causes. | |
+ summary: Cluster Monitoring Operator is experiencing unexpected reconciliation | |
+ errors. | |
syn_component: openshift4-monitoring | |
expr: max_over_time(cluster_monitoring_operator_last_reconciliation_successful[5m]) | |
== 0 | |
@@ -2207,28 +2278,32 @@ | |
this may indicate a new version of a cluster component cannot start | |
due to a bug or configuration error. Assess the pods for this deployment | |
to verify they are running on healthy nodes and then contact support. | |
+ runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/KubeDeploymentReplicasMismatch.md | |
summary: Deployment has not matched the expected number of replicas | |
syn_component: openshift4-monitoring | |
- expr: "(\n kube_deployment_spec_replicas{namespace=~\"(openshift-.*|kube-.*|default|logging)\"\ | |
- ,job=\"kube-state-metrics\"}\n !=\n kube_deployment_status_replicas_available{namespace=~\"\ | |
- (openshift-.*|kube-.*|default|logging)\",job=\"kube-state-metrics\"}\n\ | |
- ) and (\n changes(kube_deployment_status_replicas_updated{namespace=~\"\ | |
- (openshift-.*|kube-.*|default|logging)\",job=\"kube-state-metrics\"}[5m])\n\ | |
- \ ==\n 0\n) and cluster:control_plane:all_nodes_ready\n" | |
+ expr: "(((\n kube_deployment_spec_replicas{namespace=~\"(openshift-.*|kube-.*|default)\"\ | |
+ ,job=\"kube-state-metrics\"}\n >\n kube_deployment_status_replicas_available{namespace=~\"\ | |
+ (openshift-.*|kube-.*|default)\",job=\"kube-state-metrics\"}\n) and (\n\ | |
+ \ changes(kube_deployment_status_replicas_updated{namespace=~\"(openshift-.*|kube-.*|default)\"\ | |
+ ,job=\"kube-state-metrics\"}[5m])\n ==\n 0\n)) * on() group_left cluster:control_plane:all_nodes_ready)\ | |
+ \ > 0\n" | |
for: 15m | |
labels: | |
severity: warning | |
syn: 'true' | |
- alert: SYN_MultipleContainersOOMKilled | |
annotations: | |
- message: Multiple containers were out of memory killed within the past | |
- 15 minutes. | |
+ description: Multiple containers were out of memory killed within the | |
+ past 15 minutes. There are many potential causes of OOM errors, however | |
+ issues on a specific node or containers breaching their limits is common. | |
+ summary: Containers are being killed due to OOM | |
syn_component: openshift4-monitoring | |
expr: sum(max by(namespace, container, pod) (increase(kube_pod_container_status_restarts_total[12m])) | |
and max by(namespace, container, pod) (kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}) | |
== 1) > 5 | |
for: 15m | |
labels: | |
+ namespace: kube-system | |
severity: info | |
syn: 'true' | |
- name: syn-openshift-monitoring.rules | |
@@ -2253,6 +2328,7 @@ | |
' | |
for: 1h | |
labels: | |
+ namespace: openshift-kube-apiserver | |
severity: info | |
syn: 'true' | |
- alert: SYN_APIRemovedInNextReleaseInUse | |
@@ -2264,13 +2340,14 @@ | |
{{ $labels.resource }}.{{ $labels.version }}.{{ $labels.group }} -o | |
yaml` to identify the workload. | |
syn_component: openshift4-monitoring | |
- expr: 'group(apiserver_requested_deprecated_apis{removed_release="1.22"}) | |
+ expr: 'group(apiserver_requested_deprecated_apis{removed_release="1.23"}) | |
by (group,version,resource) and (sum by(group,version,resource) (rate(apiserver_request_total{system_client!="kube-controller-manager",system_client!="cluster-policy-controller"}[4h]))) | |
> 0 | |
' | |
for: 1h | |
labels: | |
+ namespace: openshift-kube-apiserver | |
severity: info | |
syn: 'true' | |
- name: syn-prometheus | |
@@ -2325,6 +2402,22 @@ | |
labels: | |
severity: warning | |
syn: 'true' | |
+ - alert: SYN_PrometheusLabelLimitHit | |
+ annotations: | |
+ description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has dropped | |
+ {{ printf "%.0f" $value }} targets because some samples exceeded the | |
+ configured label_limit, label_name_length_limit or label_value_length_limit. | |
+ summary: Prometheus has dropped targets because some scrape configs have | |
+ exceeded the labels limit. | |
+ syn_component: openshift4-monitoring | |
+ expr: 'increase(prometheus_target_scrape_pool_exceeded_label_limits_total{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) | |
+ > 0 | |
+ | |
+ ' | |
+ for: 15m | |
+ labels: | |
+ severity: warning | |
+ syn: 'true' | |
- alert: SYN_PrometheusMissingRuleEvaluations | |
annotations: | |
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has missed | |
@@ -2518,6 +2611,21 @@ | |
labels: | |
severity: warning | |
syn: 'true' | |
+ - alert: SYN_PrometheusTargetSyncFailure | |
+ annotations: | |
+ description: '{{ printf "%.0f" $value }} targets in Prometheus {{$labels.namespace}}/{{$labels.pod}} | |
+ have failed to sync because invalid configuration was supplied.' | |
+ runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/PrometheusTargetSyncFailure.md | |
+ summary: Prometheus has failed to sync targets. | |
+ syn_component: openshift4-monitoring | |
+ expr: 'increase(prometheus_target_sync_failed_total{job=~"prometheus-k8s|prometheus-user-workload"}[30m]) | |
+ > 0 | |
+ | |
+ ' | |
+ for: 5m | |
+ labels: | |
+ severity: critical | |
+ syn: 'true' | |
- name: syn-prometheus-operator | |
rules: | |
- alert: SYN_PrometheusOperatorListErrors | |
@@ -2623,6 +2731,20 @@ | |
labels: | |
severity: warning | |
syn: 'true' | |
+ - name: syn-scheduler-legacy-policy-deprecated | |
+ rules: | |
+ - alert: SYN_SchedulerLegacyPolicySet | |
+ annotations: | |
+ message: The scheduler is currently configured to use a legacy scheduler | |
+ policy API. Use of the policy API is deprecated and removed in 4.10. | |
+ syn_component: openshift4-monitoring | |
+ expr: 'cluster_legacy_scheduler_policy > 0 | |
+ | |
+ ' | |
+ for: 60m | |
+ labels: | |
+ severity: warning | |
+ syn: 'true' | |
- name: syn-system-memory-exceeds-reservation | |
rules: | |
- alert: SYN_SystemMemoryExceedsReservation | |
@@ -2637,11 +2759,7 @@ | |
change or at steady state). | |
syn_component: openshift4-monitoring | |
expr: 'sum by (node) (container_memory_rss{id="/system.slice"}) > ((sum | |
- by (node) (kube_node_status_capacity{resource="memory"}) - sum by (node) | |
- (kube_node_status_capacity{resource="hugepages_1Gi"}) - sum by (node) | |
- (kube_node_status_capacity{resource="hugepages_2Mi"}) - sum by (node) | |
- (kube_node_status_allocatable{resource="memory"}) - sum by (node) (kube_node_status_allocatable{resource="hugepages_1Gi"}) | |
- - sum by (node) (kube_node_status_allocatable{resource="hugepages_2Mi"})) | |
+ by (node) (kube_node_status_capacity{resource="memory"} - kube_node_status_allocatable{resource="memory"})) | |
* 0.95) | |
' | |
@@ -2653,12 +2771,12 @@ | |
rules: | |
- alert: SYN_ThanosQueryGrpcClientErrorRate | |
annotations: | |
- description: Thanos Query {{$labels.job}} is failing to send {{ $value | |
- | humanize }}% of requests. | |
+ description: Thanos Query {{$labels.job}} is failing to send {{$value | |
+ | humanize}}% of requests. | |
summary: Thanos Query is failing to send requests. | |
syn_component: openshift4-monitoring | |
- expr: "(\n sum by (job) (rate(grpc_client_handled_total{grpc_code!=\"OK\"\ | |
- , job=\"thanos-querier\"}[5m]))\n/\n sum by (job) (rate(grpc_client_started_total{job=\"\ | |
+ expr: "(\n sum by (job, namespace) (rate(grpc_client_handled_total{grpc_code!=\"\ | |
+ OK\", job=\"thanos-querier\"}[5m]))\n/\n sum by (job, namespace) (rate(grpc_client_started_total{job=\"\ | |
thanos-querier\"}[5m]))\n) * 100 > 5\n" | |
for: 1h | |
labels: | |
@@ -2666,12 +2784,13 @@ | |
syn: 'true' | |
- alert: SYN_ThanosQueryGrpcServerErrorRate | |
annotations: | |
- description: Thanos Query {{$labels.job}} is failing to handle {{ $value | |
- | humanize }}% of requests. | |
+ description: Thanos Query {{$labels.job}} is failing to handle {{$value | |
+ | humanize}}% of requests. | |
summary: Thanos Query is failing to handle requests. | |
syn_component: openshift4-monitoring | |
- expr: "(\n sum by (job) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded\"\ | |
- , job=\"thanos-querier\"}[5m]))\n/\n sum by (job) (rate(grpc_server_started_total{job=\"\ | |
+ expr: "(\n sum by (job, namespace) (rate(grpc_server_handled_total{grpc_code=~\"\ | |
+ Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded\"\ | |
+ , job=\"thanos-querier\"}[5m]))\n/\n sum by (job, namespace) (rate(grpc_server_started_total{job=\"\ | |
thanos-querier\"}[5m]))\n* 100 > 5\n)\n" | |
for: 1h | |
labels: | |
@@ -2679,12 +2798,12 @@ | |
syn: 'true' | |
- alert: SYN_ThanosQueryHighDNSFailures | |
annotations: | |
- description: Thanos Query {{$labels.job}} have {{ $value | humanize }}% | |
+ description: Thanos Query {{$labels.job}} have {{$value | humanize}}% | |
of failing DNS queries for store endpoints. | |
summary: Thanos Query is having high number of DNS failures. | |
syn_component: openshift4-monitoring | |
- expr: "(\n sum by (job) (rate(thanos_query_store_apis_dns_failures_total{job=\"\ | |
- thanos-querier\"}[5m]))\n/\n sum by (job) (rate(thanos_query_store_apis_dns_lookups_total{job=\"\ | |
+ expr: "(\n sum by (job, namespace) (rate(thanos_query_store_apis_dns_failures_total{job=\"\ | |
+ thanos-querier\"}[5m]))\n/\n sum by (job, namespace) (rate(thanos_query_store_apis_dns_lookups_total{job=\"\ | |
thanos-querier\"}[5m]))\n) * 100 > 1\n" | |
for: 1h | |
labels: | |
@@ -2692,26 +2811,28 @@ | |
syn: 'true' | |
- alert: SYN_ThanosQueryHttpRequestQueryErrorRateHigh | |
annotations: | |
- description: Thanos Query {{$labels.job}} is failing to handle {{ $value | |
- | humanize }}% of "query" requests. | |
+ description: Thanos Query {{$labels.job}} is failing to handle {{$value | |
+ | humanize}}% of "query" requests. | |
summary: Thanos Query is failing to handle requests. | |
syn_component: openshift4-monitoring | |
- expr: "(\n sum(rate(http_requests_total{code=~\"5..\", job=\"thanos-querier\"\ | |
- , handler=\"query\"}[5m]))\n/\n sum(rate(http_requests_total{job=\"thanos-querier\"\ | |
- , handler=\"query\"}[5m]))\n) * 100 > 5\n" | |
+ expr: "(\n sum by (job, namespace) (rate(http_requests_total{code=~\"5..\"\ | |
+ , job=\"thanos-querier\", handler=\"query\"}[5m]))\n/\n sum by (job,\ | |
+ \ namespace) (rate(http_requests_total{job=\"thanos-querier\", handler=\"\ | |
+ query\"}[5m]))\n) * 100 > 5\n" | |
for: 1h | |
labels: | |
severity: warning | |
syn: 'true' | |
- alert: SYN_ThanosQueryHttpRequestQueryRangeErrorRateHigh | |
annotations: | |
- description: Thanos Query {{$labels.job}} is failing to handle {{ $value | |
- | humanize }}% of "query_range" requests. | |
+ description: Thanos Query {{$labels.job}} is failing to handle {{$value | |
+ | humanize}}% of "query_range" requests. | |
summary: Thanos Query is failing to handle requests. | |
syn_component: openshift4-monitoring | |
- expr: "(\n sum(rate(http_requests_total{code=~\"5..\", job=\"thanos-querier\"\ | |
- , handler=\"query_range\"}[5m]))\n/\n sum(rate(http_requests_total{job=\"\ | |
- thanos-querier\", handler=\"query_range\"}[5m]))\n) * 100 > 5\n" | |
+ expr: "(\n sum by (job, namespace) (rate(http_requests_total{code=~\"5..\"\ | |
+ , job=\"thanos-querier\", handler=\"query_range\"}[5m]))\n/\n sum by\ | |
+ \ (job, namespace) (rate(http_requests_total{job=\"thanos-querier\", handler=\"\ | |
+ query_range\"}[5m]))\n) * 100 > 5\n" | |
for: 1h | |
labels: | |
severity: warning | |
@@ -2720,24 +2841,25 @@ | |
rules: | |
- alert: SYN_ThanosNoRuleEvaluations | |
annotations: | |
- description: Thanos Rule {{$labels.job}} did not perform any rule evaluations | |
- in the past 2 minutes. | |
+ description: Thanos Rule {{$labels.instance}} did not perform any rule | |
+ evaluations in the past 10 minutes. | |
summary: Thanos Rule did not perform any rule evaluations. | |
syn_component: openshift4-monitoring | |
- expr: "sum(rate(prometheus_rule_evaluations_total{job=\"thanos-ruler\"}[2m]))\ | |
- \ <= 0\n and\nsum(thanos_rule_loaded_rules{job=\"thanos-ruler\"}) > 0\n" | |
- for: 3m | |
+ expr: "sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=\"\ | |
+ thanos-ruler\"}[5m])) <= 0\n and\nsum by (job, instance) (thanos_rule_loaded_rules{job=\"\ | |
+ thanos-ruler\"}) > 0\n" | |
+ for: 5m | |
labels: | |
severity: warning | |
syn: 'true' | |
- alert: SYN_ThanosRuleAlertmanagerHighDNSFailures | |
annotations: | |
- description: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% | |
+ description: Thanos Rule {{$labels.instance}} has {{$value | humanize}}% | |
of failing DNS queries for Alertmanager endpoints. | |
summary: Thanos Rule is having high number of DNS failures. | |
syn_component: openshift4-monitoring | |
- expr: "(\n sum by (job) (rate(thanos_rule_alertmanagers_dns_failures_total{job=\"\ | |
- thanos-ruler\"}[5m]))\n/\n sum by (job) (rate(thanos_rule_alertmanagers_dns_lookups_total{job=\"\ | |
+ expr: "(\n sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_failures_total{job=\"\ | |
+ thanos-ruler\"}[5m]))\n/\n sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_lookups_total{job=\"\ | |
thanos-ruler\"}[5m]))\n* 100 > 1\n)\n" | |
for: 15m | |
labels: | |
@@ -2749,20 +2871,21 @@ | |
configuration. | |
summary: Thanos Rule has not been able to reload configuration. | |
syn_component: openshift4-monitoring | |
- expr: avg(thanos_rule_config_last_reload_successful{job="thanos-ruler"}) | |
- by (job) != 1 | |
+ expr: avg by (job, instance) (thanos_rule_config_last_reload_successful{job="thanos-ruler"}) | |
+ != 1 | |
for: 5m | |
labels: | |
severity: info | |
syn: 'true' | |
- alert: SYN_ThanosRuleGrpcErrorRate | |
annotations: | |
- description: Thanos Rule {{$labels.job}} is failing to handle {{ $value | |
- | humanize }}% of requests. | |
+ description: Thanos Rule {{$labels.job}} is failing to handle {{$value | |
+ | humanize}}% of requests. | |
summary: Thanos Rule is failing to handle grpc requests. | |
syn_component: openshift4-monitoring | |
- expr: "(\n sum by (job) (rate(grpc_server_handled_total{grpc_code=~\"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded\"\ | |
- , job=\"thanos-ruler\"}[5m]))\n/\n sum by (job) (rate(grpc_server_started_total{job=\"\ | |
+ expr: "(\n sum by (job, instance) (rate(grpc_server_handled_total{grpc_code=~\"\ | |
+ Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded\"\ | |
+ , job=\"thanos-ruler\"}[5m]))\n/\n sum by (job, instance) (rate(grpc_server_started_total{job=\"\ | |
thanos-ruler\"}[5m]))\n* 100 > 5\n)\n" | |
for: 5m | |
labels: | |
@@ -2770,12 +2893,11 @@ | |
syn: 'true' | |
- alert: SYN_ThanosRuleHighRuleEvaluationFailures | |
annotations: | |
- description: Thanos Rule {{$labels.job}} {{$labels.pod}} is failing to | |
- evaluate rules. | |
+ description: Thanos Rule {{$labels.instance}} is failing to evaluate rules. | |
summary: Thanos Rule is failing to evaluate rules. | |
syn_component: openshift4-monitoring | |
- expr: "(\n sum by (job) (rate(prometheus_rule_evaluation_failures_total{job=\"\ | |
- thanos-ruler\"}[5m]))\n/\n sum by (job) (rate(prometheus_rule_evaluations_total{job=\"\ | |
+ expr: "(\n sum by (job, instance) (rate(prometheus_rule_evaluation_failures_total{job=\"\ | |
+ thanos-ruler\"}[5m]))\n/\n sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=\"\ | |
thanos-ruler\"}[5m]))\n* 100 > 5\n)\n" | |
for: 5m | |
labels: | |
@@ -2783,11 +2905,11 @@ | |
syn: 'true' | |
- alert: SYN_ThanosRuleHighRuleEvaluationWarnings | |
annotations: | |
- description: Thanos Rule {{$labels.job}} {{$labels.pod}} has high number | |
- of evaluation warnings. | |
+ description: Thanos Rule {{$labels.instance}} has high number of evaluation | |
+ warnings. | |
summary: Thanos Rule has high number of evaluation warnings. | |
syn_component: openshift4-monitoring | |
- expr: 'sum by (job) (rate(thanos_rule_evaluation_with_warnings_total{job="thanos-ruler"}[5m])) | |
+ expr: 'sum by (job, instance) (rate(thanos_rule_evaluation_with_warnings_total{job="thanos-ruler"}[5m])) | |
> 0 | |
' | |
@@ -2797,16 +2919,15 @@ | |
syn: 'true' | |
- alert: SYN_ThanosRuleNoEvaluationFor10Intervals | |
annotations: | |
- description: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% | |
- rule groups that did not evaluate for at least 10x of their expected | |
- interval. | |
+ description: Thanos Rule {{$labels.job}} has {{$value | humanize}}% rule | |
+ groups that did not evaluate for at least 10x of their expected interval. | |
summary: Thanos Rule has rule groups that did not evaluate for 10 intervals. | |
syn_component: openshift4-monitoring | |
- expr: 'time() - max by (job, group) (prometheus_rule_group_last_evaluation_timestamp_seconds{job="thanos-ruler"}) | |
+ expr: 'time() - max by (job, instance, group) (prometheus_rule_group_last_evaluation_timestamp_seconds{job="thanos-ruler"}) | |
> | |
- 10 * max by (job, group) (prometheus_rule_group_interval_seconds{job="thanos-ruler"}) | |
+ 10 * max by (job, instance, group) (prometheus_rule_group_interval_seconds{job="thanos-ruler"}) | |
' | |
for: 5m | |
@@ -2815,12 +2936,12 @@ | |
syn: 'true' | |
- alert: SYN_ThanosRuleQueryHighDNSFailures | |
annotations: | |
- description: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% | |
- of failing DNS queries for query endpoints. | |
+ description: Thanos Rule {{$labels.job}} has {{$value | humanize}}% of | |
+ failing DNS queries for query endpoints. | |
summary: Thanos Rule is having high number of DNS failures. | |
syn_component: openshift4-monitoring | |
- expr: "(\n sum by (job) (rate(thanos_rule_query_apis_dns_failures_total{job=\"\ | |
- thanos-ruler\"}[5m]))\n/\n sum by (job) (rate(thanos_rule_query_apis_dns_lookups_total{job=\"\ | |
+ expr: "(\n sum by (job, instance) (rate(thanos_rule_query_apis_dns_failures_total{job=\"\ | |
+ thanos-ruler\"}[5m]))\n/\n sum by (job, instance) (rate(thanos_rule_query_apis_dns_lookups_total{job=\"\ | |
thanos-ruler\"}[5m]))\n* 100 > 1\n)\n" | |
for: 15m | |
labels: | |
@@ -2828,11 +2949,11 @@ | |
syn: 'true' | |
- alert: SYN_ThanosRuleQueueIsDroppingAlerts | |
annotations: | |
- description: Thanos Rule {{$labels.job}} {{$labels.pod}} is failing to | |
- queue alerts. | |
+ description: Thanos Rule {{$labels.instance}} is failing to queue alerts. | |
+ runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/ThanosRuleQueueIsDroppingAlerts.md | |
summary: Thanos Rule is failing to queue alerts. | |
syn_component: openshift4-monitoring | |
- expr: 'sum by (job) (rate(thanos_alert_queue_alerts_dropped_total{job="thanos-ruler"}[5m])) | |
+ expr: 'sum by (job, instance) (rate(thanos_alert_queue_alerts_dropped_total{job="thanos-ruler"}[5m])) | |
> 0 | |
' | |
@@ -2842,12 +2963,12 @@ | |
syn: 'true' | |
- alert: SYN_ThanosRuleRuleEvaluationLatencyHigh | |
annotations: | |
- description: Thanos Rule {{$labels.job}}/{{$labels.pod}} has higher evaluation | |
- latency than interval for {{$labels.rule_group}}. | |
+ description: Thanos Rule {{$labels.instance}} has higher evaluation latency | |
+ than interval for {{$labels.rule_group}}. | |
summary: Thanos Rule has high rule evaluation latency. | |
syn_component: openshift4-monitoring | |
- expr: "(\n sum by (job, pod, rule_group) (prometheus_rule_group_last_duration_seconds{job=\"\ | |
- thanos-ruler\"})\n>\n sum by (job, pod, rule_group) (prometheus_rule_group_interval_seconds{job=\"\ | |
+ expr: "(\n sum by (job, instance, rule_group) (prometheus_rule_group_last_duration_seconds{job=\"\ | |
+ thanos-ruler\"})\n>\n sum by (job, instance, rule_group) (prometheus_rule_group_interval_seconds{job=\"\ | |
thanos-ruler\"})\n)\n" | |
for: 5m | |
labels: | |
@@ -2855,11 +2976,11 @@ | |
syn: 'true' | |
- alert: SYN_ThanosRuleSenderIsFailingAlerts | |
annotations: | |
- description: Thanos Rule {{$labels.job}} {{$labels.pod}} is failing to | |
- send alerts to alertmanager. | |
+ description: Thanos Rule {{$labels.instance}} is failing to send alerts | |
+ to alertmanager. | |
summary: Thanos Rule is failing to send alerts to alertmanager. | |
syn_component: openshift4-monitoring | |
- expr: 'sum by (job) (rate(thanos_alert_sender_alerts_dropped_total{job="thanos-ruler"}[5m])) | |
+ expr: 'sum by (job, instance) (rate(thanos_alert_sender_alerts_dropped_total{job="thanos-ruler"}[5m])) | |
> 0 | |
' | |
@@ -2867,47 +2988,3 @@ | |
labels: | |
severity: warning | |
syn: 'true' | |
- - name: syn-thanos-sidecar | |
- rules: | |
- - alert: SYN_ThanosSidecarBucketOperationsFailed | |
- annotations: | |
- description: Thanos Sidecar {{$labels.job}} {{$labels.pod}} bucket operations | |
- are failing | |
- summary: Thanos Sidecar bucket operations are failing | |
- syn_component: openshift4-monitoring | |
- expr: 'rate(thanos_objstore_bucket_operation_failures_total{job=~"prometheus-(k8s|user-workload)-thanos-sidecar"}[5m]) | |
- > 0 | |
- | |
- ' | |
- for: 1h | |
- labels: | |
- severity: warning | |
- syn: 'true' | |
- - alert: SYN_ThanosSidecarPrometheusDown | |
- annotations: | |
- description: Thanos Sidecar {{$labels.job}} {{$labels.pod}} cannot connect | |
- to Prometheus. | |
- summary: Thanos Sidecar cannot connect to Prometheus | |
- syn_component: openshift4-monitoring | |
- expr: 'sum by (job, pod) (thanos_sidecar_prometheus_up{job=~"prometheus-(k8s|user-workload)-thanos-sidecar"} | |
- == 0) | |
- | |
- ' | |
- for: 1h | |
- labels: | |
- severity: warning | |
- syn: 'true' | |
- - alert: SYN_ThanosSidecarUnhealthy | |
- annotations: | |
- description: Thanos Sidecar {{$labels.job}} {{$labels.pod}} is unhealthy | |
- for {{ $value }} seconds. | |
- summary: Thanos Sidecar is unhealthy. | |
- syn_component: openshift4-monitoring | |
- expr: 'time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job=~"prometheus-(k8s|user-workload)-thanos-sidecar"}) | |
- by (job,pod) >= 240 | |
- | |
- ' | |
- for: 1h | |
- labels: | |
- severity: warning | |
- syn: 'true' |
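
Note: the alerts newly added in 4.9 above (for example SYN_PrometheusTargetSyncFailure) can be checked ad hoc against a running cluster by sending their expression to the Thanos querier's Prometheus-compatible HTTP API. The sketch below is a minimal, hypothetical Python example; the route hostname and the token environment variable are assumptions for illustration and are not part of the diff above.

import os
import requests

# Expression copied from the new SYN_PrometheusTargetSyncFailure alert above.
QUERY = ('increase(prometheus_target_sync_failed_total{'
         'job=~"prometheus-k8s|prometheus-user-workload"}[30m]) > 0')

# Hypothetical route and token source; adjust for the actual cluster.
THANOS_URL = "https://thanos-querier-openshift-monitoring.apps.example.com/api/v1/query"
TOKEN = os.environ["OCP_TOKEN"]

resp = requests.get(
    THANOS_URL,
    params={"query": QUERY},
    headers={"Authorization": f"Bearer {TOKEN}"},
    timeout=10,
)
resp.raise_for_status()
# An empty result vector means no scrape pools failed to sync in the last 30 minutes.
print(resp.json()["data"]["result"])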