Last active
June 2, 2020 19:14
-
-
Save rvanbutselaar/8e1d569478a1779c18800d0a97fd22c3 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Docker daemon | |
systemctl is-active docker | |
# Check that Docker volume group has adequate space | |
echo $(echo \"$(docker info 2>/dev/null | awk '/Data Space Available/ {print $4}') / $(docker info 2>/dev/null | awk '/Data Space Total/ {print $4}')\" | bc -l) '>' 0.3 | bc -l | |
# Check that Docker volume group has adequate metadata space | |
echo $(echo \"$(docker info 2>/dev/null | awk '/Metadata Space Available/ {print $4}') / $(docker info 2>/dev/null | awk '/Metadata Space Total/ {print $4}')\" | bc -l) '>' 0.3 | bc -l | |
# etcd is active | |
systemctl is-active etcd | |
# etcd volume is not too full (the check prints 1 when usage is above 70%, i.e. too full; 0 is healthy) | |
echo "$(lvs | awk '/etcd/ {print $5}') > 70" | bc | |
# Master API service is active | |
master:443/healthz | |
systemctl is-active atomic-openshift-master | |
# Master API service is active (multi-master) | |
systemctl is-active atomic-openshift-master-api | |
# Master controller service is active (multi-master) | |
systemctl is-active atomic-openshift-master-controller | |
# Node service is active | |
systemctl is-active atomic-openshift-node | |
# Node’s local data storage volume is not too full (the check prints 1 when usage is above 70%, i.e. too full; 0 is healthy) | |
echo "$(lvs | awk '/origin/ {print $5}') > 70" | bc | |
# openvswitch service is active | |
systemctl is-active openvswitch | |
# OpenShift Components | |
Service bus | |
Routers | |
Registry | |
DNSmasq | |
# Logging | |
Elasticsearch | |
Kibana | |
Fluentd | |
# Metrics | |
Hawkular | |
Heapster | |
Cassandra | |
# Health of master API endpoint | |
curl -H "Authorization: Bearer $(oc whoami -t)" https://<my_cluster_api>:8443/healthz | grep ok | |
# Health of router | |
curl http://router.default.svc.cluster.local:1936/healthz | grep 200 | |
# Health of registry | |
curl -I https://docker-registry.default.svc.cluster.local:5000/healthz | grep 200 | |
# Health of EFK logging stack | |
https://github.com/redhat-cop/openshift-toolkit/blob/master/health_check/elasticsearch-health-check-ocp34.sh | |
# Health of metrics stack | |
https://github.com/redhat-cop/openshift-toolkit/blob/master/health_check/metrics-health-check.sh | |
# CPU usage | |
Requests | |
Load | |
# Memory usage | |
Requests | |
Used | |
Memory reserved Total in cluster (aggregate over all nodes) | |
# | |
# Prometheus | |
# | |
https://github.com/wkulhanek/openshift-prometheus | |
# Number of cores each machine in cluster has | |
machine_cpu_cores | |
# Total number of cores in cluster | |
sum(machine_cpu_cores) | |
# Percentage of total cluster CPU in use | |
sum(rate(container_cpu_usage_seconds_total{id="/"}[5m])) / sum(machine_cpu_cores) | |
# Percentage of total cluster memory in use | |
sum(container_memory_rss) / sum(machine_memory_bytes) | |
# Total number of consumed cores in cluster: | |
sum(sort_desc(rate(container_cpu_usage_seconds_total{id="/"}[5m]))) | |
# Number of containers that start or restart over previous 10 minutes | |
sum(changes(container_start_time_seconds[10m])) | |
# Number of mutating API requests being made to control plane | |
sort_desc(drop_common_labels(sum without (instance,type,code) (rate(apiserver_request_count{verb=~"POST|PUT|DELETE|PATCH"}[5m])))) | |
# Number of non-mutating API requests being made to control plane | |
sort_desc(drop_common_labels(sum without (instance,type,code) (rate(apiserver_request_count{verb=~"GET|LIST|WATCH"}[5m])))) | |
# Top 10 pods doing most receive network traffic | |
topk(10, (sum by (pod_name) (rate(container_network_receive_bytes_total[5m])))) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment