Created
April 12, 2022 17:33
-
-
Save mattmattox/9d442945a40e7c0b06de875a6e14cf6c to your computer and use it in GitHub Desktop.
Custom PrometheusRule to page out to Slack
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
---
# PrometheusRule for the prometheus-operator defining one alert group,
# `slack.rules`. It contains host-level alerts (node_* metrics) followed by
# Kubernetes-level alerts (kube_*, kubelet_*, apiserver_*, rest_client_*).
#
# Every rule carries the label `severity: slack-alert`.
# NOTE(review): presumably an Alertmanager route matches that label and posts
# to Slack; the routing configuration is not part of this manifest - confirm.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  # NOTE(review): these annotations and labels mark the object as owned by the
  # Helm release `monitoring`. Confirm that is intended for a custom rule that
  # is not shipped by the chart itself.
  annotations:
    meta.helm.sh/release-name: monitoring
    meta.helm.sh/release-namespace: monitoring
    prometheus-operator-validated: "true"
  labels:
    app: kube-prometheus-stack
    app.kubernetes.io/instance: monitoring
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/part-of: kube-prometheus-stack
    # Quoted so the version can never be re-typed by a YAML parser.
    app.kubernetes.io/version: "34.9.0"
    chart: kube-prometheus-stack-34.9.0
    cluster: a-rke2-devops
    heritage: Helm
    release: monitoring
  name: monitoring-kube-prometheus-slack.rules
  namespace: monitoring
spec:
  groups:
    - name: slack.rules
      rules:
        # ------------------------------------------------------------------
        # Host alerts (node_* metrics)
        # ------------------------------------------------------------------

        # Less than 10% of memory available.
        - alert: HostOutOfMemory
          expr: 'node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10'
          for: 2m
          labels:
            severity: slack-alert
        # More than 1000 major page faults per second.
        - alert: HostMemoryUnderMemoryPressure
          expr: 'rate(node_vmstat_pgmajfault[1m]) > 1000'
          for: 2m
          labels:
            severity: slack-alert
        # Receive rate summed per instance; bytes/s divided by 1024 twice, threshold 100.
        - alert: HostUnusualNetworkThroughputIn
          expr: 'sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100'
          for: 2m
          labels:
            severity: slack-alert
        # Transmit rate summed per instance; same scaling and threshold as above.
        - alert: HostUnusualNetworkThroughputOut
          expr: 'sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100'
          for: 2m
          labels:
            severity: slack-alert
        # Disk read rate summed per instance; bytes/s divided by 1024 twice, threshold 50.
        - alert: HostUnusualDiskReadRate
          expr: 'sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50'
          for: 5m
          labels:
            severity: slack-alert
        # NOTE(review): node_exporter names the write counter
        # node_disk_written_bytes_total; verify node_disk_write_bytes_total
        # exists in this cluster, otherwise this rule never fires.
        - alert: HostUnusualDiskWriteRate
          expr: 'sum by (instance) (rate(node_disk_write_bytes_total[2m])) / 1024 / 1024 > 50'
          for: 5m
          labels:
            severity: slack-alert
        # Less than 10% space available on a filesystem that is not read-only.
        - alert: HostOutOfDiskSpace
          expr: '(node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0'
          for: 2m
          labels:
            severity: slack-alert
        # Below 10% available AND a linear projection over the last hour
        # reaches zero within 24 hours (tmpfs excluded from the projection).
        - alert: HostDiskWillFillIn24Hours
          expr: '(node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0'
          for: 2m
          labels:
            severity: slack-alert
        # Same as above with a 48 hour projection horizon.
        - alert: HostDiskWillFillIn48Hours
          expr: '(node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 48 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0'
          for: 2m
          labels:
            severity: slack-alert
        # Less than 10% free inodes; only the mountpoint "/rootfs" is checked.
        # NOTE(review): confirm "/rootfs" is the mountpoint label reported in
        # this cluster; if not, the selector matches nothing.
        - alert: HostOutOfInodes
          expr: 'node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0'
          for: 2m
          labels:
            severity: slack-alert
        # Average read latency above 0.1s, evaluated only while reads happen.
        - alert: HostUnusualDiskReadLatency
          expr: 'rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0'
          for: 2m
          labels:
            severity: slack-alert
        # Average write latency above 0.1s, evaluated only while writes happen.
        - alert: HostUnusualDiskWriteLatency
          expr: 'rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0'
          for: 2m
          labels:
            severity: slack-alert
        # CPU busy percentage (100 minus idle) above 80, averaged per instance.
        - alert: HostHighCpuLoad
          expr: '100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80'
          for: 2m
          labels:
            severity: slack-alert
        # CPU steal above 10%, averaged per instance.
        - alert: HostCpuStealNoisyNeighbor
          expr: 'avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10'
          for: 2m
          labels:
            severity: slack-alert
        # More than 1000 context switches per second per CPU.
        - alert: HostContextSwitching
          expr: '(rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 1000'
          for: 2m
          labels:
            severity: slack-alert

        # ------------------------------------------------------------------
        # Kubernetes node alerts
        # ------------------------------------------------------------------

        # Fixed: the original selected status="false" == 0, which is true for
        # every healthy node. A node is not ready when the status="true"
        # series of the Ready condition is 0.
        - alert: KubernetesNodeNotReady
          expr: 'kube_node_status_condition{condition="Ready",status="true"} == 0'
          for: 10m
          labels:
            severity: slack-alert
        # Fixed: "Schedulable" is not a node condition, so the original
        # selector matched no series. Cordoned nodes are reported by
        # kube_node_spec_unschedulable.
        - alert: KubernetesNodeNotSchedulable
          expr: 'kube_node_spec_unschedulable == 1'
          for: 10m
          labels:
            severity: slack-alert
        - alert: KubernetesNodeMemoryPressure
          expr: 'kube_node_status_condition{condition="MemoryPressure",status="true"} == 1'
          for: 10m
          labels:
            severity: slack-alert
        - alert: KubernetesNodeDiskPressure
          expr: 'kube_node_status_condition{condition="DiskPressure",status="true"} == 1'
          for: 10m
          labels:
            severity: slack-alert
        - alert: KubernetesNodeNetworkUnavailable
          expr: 'kube_node_status_condition{condition="NetworkUnavailable",status="true"} == 1'
          for: 10m
          labels:
            severity: slack-alert
        # NOTE(review): the OutOfDisk node condition was removed from
        # Kubernetes long ago; this rule likely never fires - confirm and
        # consider dropping it (KubernetesNodeDiskPressure covers the case).
        - alert: KubernetesNodeOutOfDisk
          expr: 'kube_node_status_condition{condition="OutOfDisk",status="true"} == 1'
          for: 10m
          labels:
            severity: slack-alert
        # Running pods per node above 90% of the node's allocatable pod count.
        - alert: KubernetesOutOfCapacity
          expr: 'sum by (node) ((kube_pod_status_phase{phase="Running"} == 1) + on(uid) group_left(node) (0 * kube_pod_info{pod_template_hash=""})) / sum by (node) (kube_node_status_allocatable{resource="pods"}) * 100 > 90'
          for: 10m
          labels:
            severity: slack-alert
        # NOTE(review): kube_container_status_last_seen_seconds_ago does not
        # look like a kube-state-metrics or cAdvisor metric, and the regex
        # "/kubelet.*" requires a leading slash. Verify this series exists;
        # otherwise the rule never fires.
        - alert: KubernetesContainerOomKiller
          expr: 'kube_container_status_last_seen_seconds_ago{state="OOMKilled",container=~"/kubelet.*"} > 10'
          for: 10m
          labels:
            severity: slack-alert

        # ------------------------------------------------------------------
        # Kubernetes storage alerts
        # ------------------------------------------------------------------

        # Fixed (all PVC/PV phase rules below): the original filtered
        # kube_*_info on a `status` label those metrics do not expose, so the
        # rules could never fire. The phase is reported by the
        # *_status_phase metrics via the `phase` label (value 1 = current).
        - alert: KubernetesPersistentVolumeClaimPending
          expr: 'kube_persistentvolumeclaim_status_phase{phase="Pending"} == 1'
          for: 10m
          labels:
            severity: slack-alert
        - alert: KubernetesPersistentVolumeClaimLost
          expr: 'kube_persistentvolumeclaim_status_phase{phase="Lost"} == 1'
          for: 10m
          labels:
            severity: slack-alert
        # NOTE(review): PVC phases are documented as Pending/Bound/Lost, so a
        # "Failed" phase is not expected to exist; consider removing this rule.
        - alert: KubernetesPersistentVolumeClaimFailed
          expr: 'kube_persistentvolumeclaim_status_phase{phase="Failed"} == 1'
          for: 10m
          labels:
            severity: slack-alert
        # NOTE(review): left unchanged. kube_persistentvolume_info is not
        # documented with a `status` label and "OutOfDisk" is not a PV phase,
        # so this likely never fires. The alert name is also reused by the
        # kubelet_volume_stats_* rule further down - confirm which is wanted.
        - alert: KubernetesVolumeOutOfDiskSpace
          expr: 'kube_persistentvolume_info{status="OutOfDisk"} > 0'
          for: 10m
          labels:
            severity: slack-alert
        - alert: KubernetesVolumeFailed
          expr: 'kube_persistentvolume_status_phase{phase="Failed"} == 1'
          for: 10m
          labels:
            severity: slack-alert
        # NOTE(review): PV phases are documented as
        # Pending/Available/Bound/Released/Failed, so "Lost" is not expected
        # to exist; consider removing this rule.
        - alert: KubernetesVolumeLost
          expr: 'kube_persistentvolume_status_phase{phase="Lost"} == 1'
          for: 10m
          labels:
            severity: slack-alert
        - alert: KubernetesVolumePending
          expr: 'kube_persistentvolume_status_phase{phase="Pending"} == 1'
          for: 10m
          labels:
            severity: slack-alert
        # Less than 10% of a mounted volume's capacity available.
        - alert: KubernetesVolumeOutOfDiskSpace
          expr: 'kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes * 100 < 10'
          for: 2m
          labels:
            severity: slack-alert

        # ------------------------------------------------------------------
        # Kubernetes workload alerts
        # ------------------------------------------------------------------

        # NOTE(review): kube_statefulset_status_replicas is not documented
        # with a `status` label; the two rules below likely never fire.
        - alert: KubernetesStatefulSetDown
          expr: 'kube_statefulset_status_replicas{status="Failed"} > 0'
          for: 10m
          labels:
            severity: slack-alert
        - alert: KubernetesStatefulSetPending
          expr: 'kube_statefulset_status_replicas{status="Pending"} > 0'
          for: 10m
          labels:
            severity: slack-alert
        # Pod continuously in Pending/Unknown/Failed for the whole 15m window.
        - alert: KubernetesPodNotHealthy
          expr: 'min_over_time(sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"})[15m:1m]) > 0'
          for: 10m
          labels:
            severity: slack-alert
        # NOTE(review): kube_pod_container_status_restarts_total is a
        # cumulative counter and is not documented with a `reason` label.
        # As written, the rule below likely matches nothing, and
        # KubernetesPodRestarting fires for any "kube-.*" container that has
        # ever restarted. Confirm intent (increase()/rate() may be wanted).
        - alert: KubernetesPodCrashLooping
          expr: 'min_over_time(sum by (namespace, pod) (kube_pod_container_status_restarts_total{container=~"kube-.*",reason="CrashLooping"})[15m:1m]) > 0'
          for: 10m
          labels:
            severity: slack-alert
        - alert: KubernetesPodRestarting
          expr: 'min_over_time(sum by (namespace, pod) (kube_pod_container_status_restarts_total{container=~"kube-.*"})[15m:1m]) > 0'
          for: 10m
          labels:
            severity: slack-alert
        # NOTE(review): several *_status_replicas_current,
        # *_status_replicas_available and kube_daemonset_status_replicas
        # metrics used in the mismatch/failed rules below may not exist in the
        # deployed kube-state-metrics version. A comparison against a missing
        # series yields no result, so such a rule silently never fires.
        # Verify each metric name against the running exporter.
        - alert: KubernetesReplicasSetMismatch
          expr: 'kube_replicationcontroller_status_replicas != kube_replicationcontroller_status_replicas_current'
          for: 10m
          labels:
            severity: slack-alert
        - alert: KubernetesDeploymentReplicasMismatch
          expr: 'kube_deployment_status_replicas != kube_deployment_status_replicas_current'
          for: 10m
          labels:
            severity: slack-alert
        - alert: KubernetesDeploymentFailed
          expr: 'kube_deployment_status_replicas != kube_deployment_status_replicas_current and kube_deployment_status_replicas_available == 0'
          for: 10m
          labels:
            severity: slack-alert
        - alert: KubernetesStatefulSetReplicasMismatch
          expr: 'kube_statefulset_status_replicas != kube_statefulset_status_replicas_current'
          for: 10m
          labels:
            severity: slack-alert
        - alert: KubernetesStatefulSetFailed
          expr: 'kube_statefulset_status_replicas != kube_statefulset_status_replicas_current and kube_statefulset_status_replicas_available == 0'
          for: 10m
          labels:
            severity: slack-alert
        - alert: KubernetesDaemonSetReplicasMismatch
          expr: 'kube_daemonset_status_replicas != kube_daemonset_status_replicas_current'
          for: 10m
          labels:
            severity: slack-alert
        - alert: KubernetesDaemonSetFailed
          expr: 'kube_daemonset_status_replicas != kube_daemonset_status_replicas_current and kube_daemonset_status_replicas_available == 0'
          for: 10m
          labels:
            severity: slack-alert
        # At least one failed pod recorded for a Job.
        - alert: KubernetesJobFailed
          expr: 'kube_job_status_failed > 0'
          for: 10m
          labels:
            severity: slack-alert
        # Fixed: the original compared against
        # kube_deployment_status_replicas_current_generation, which is not a
        # kube-state-metrics metric. The desired generation is exposed as
        # kube_deployment_metadata_generation.
        - alert: KubernetesDeploymentGenerationMismatch
          expr: 'kube_deployment_status_observed_generation != kube_deployment_metadata_generation'
          for: 10m
          labels:
            severity: slack-alert
        # Current and update revisions differ while not all replicas are updated.
        - alert: KubernetesStatefulSetUpdateNotRolledOut
          expr: 'max without (revision) (kube_statefulset_status_current_revision unless kube_statefulset_status_update_revision) * (kube_statefulset_replicas != kube_statefulset_status_replicas_updated)'
          for: 10m
          labels:
            severity: slack-alert
        # Not all desired pods are ready, or fewer pods are scheduled than desired.
        - alert: KubernetesDaemonSetRolloutStuck
          expr: 'kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled * 100 < 100 or kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0'
          for: 10m
          labels:
            severity: slack-alert
        - alert: KubernetesDaemonSetMisscheduled
          expr: 'kube_daemonset_status_number_misscheduled > 0'
          for: 1m
          labels:
            severity: slack-alert
        # The CronJob's next scheduled time is more than 3600 seconds in the past.
        - alert: KubernetesCronJobTooLong
          expr: 'time() - kube_cronjob_next_schedule_time > 3600'
          for: 1m
          labels:
            severity: slack-alert
        # Job has not reached its requested number of completions for 12 hours.
        - alert: KubernetesJobSlowCompletion
          expr: 'kube_job_spec_completions - kube_job_status_succeeded > 0'
          for: 12h
          labels:
            severity: slack-alert

        # ------------------------------------------------------------------
        # Kubernetes API server / client alerts
        # ------------------------------------------------------------------

        # More than 3% of API server requests answered with a 5xx code.
        - alert: KubernetesApiServerErrors
          expr: 'sum(rate(apiserver_request_total{job="apiserver",code=~"^(?:5..)$"}[1m])) / sum(rate(apiserver_request_total{job="apiserver"}[1m])) * 100 > 3'
          for: 2m
          labels:
            severity: slack-alert
        # More than 1% of client requests answered with 4xx/5xx, per instance and job.
        - alert: KubernetesApiClientErrors
          expr: '(sum(rate(rest_client_requests_total{code=~"(4|5).."}[1m])) by (instance, job) / sum(rate(rest_client_requests_total[1m])) by (instance, job)) * 100 > 1'
          for: 2m
          labels:
            severity: slack-alert
        # The remaining rules have no `for:` clause, so they fire on the first
        # evaluation in which the expression returns a result.
        # 1st-percentile client certificate lifetime below 7 days.
        - alert: KubernetesClientCertificateExpiresNextWeek
          expr: 'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60'
          labels:
            severity: slack-alert
        # 1st-percentile client certificate lifetime below 24 hours.
        - alert: KubernetesClientCertificateExpiresSoon
          expr: 'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 24*60*60'
          labels:
            severity: slack-alert
        # 1st-percentile client certificate lifetime below zero.
        - alert: KubernetesClientCertificateExpired
          expr: 'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 0'
          labels:
            severity: slack-alert
        # 99th-percentile API server request duration above 0.5s.
        # NOTE(review): the alert name contains a typo ("ApiSserver"). It is
        # kept as-is because routes or silences may match on the name.
        - alert: KubernetesApiSserverLatency
          expr: 'apiserver_request_duration_seconds_sum{job="apiserver"} > 0 and histogram_quantile(0.99, sum by (job, le) (rate(apiserver_request_duration_seconds_bucket{job="apiserver"}[5m]))) > 0.5'
          labels:
            severity: slack-alert
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment