Last active
October 7, 2024 11:57
-
-
Save pamtrak06/dee4471bd1158e8b32c40fc8a2eb5a75 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# OKD 3.x Cluster Health Check Script
# ===================================
#
# This script is designed to perform a comprehensive health check on an OKD 3.x cluster.
# It covers various aspects of the cluster including nodes, pods, services, and resources.
#
# Compatibility:
# - Generally suitable for OKD version 3.x
# - Uses commands and concepts compatible with OKD 3.x (oc CLI, etcd, router, registry)
# - References OKD 3.x specific services (atomic-openshift-master, atomic-openshift-node)
# - File paths are consistent with OKD 3.x structure
# - Checks resources like quotas, limits, and persistent volumes
#
# Potential Adjustments for Specific OKD 3.x Versions:
# 1. Verify all `oc` commands are available in your target 3.x version
# 2. Confirm namespaces (openshift-infra, openshift-node) are correct for your version
# 3. Consider adding checks for OKD 3.x specific components:
#    - SDN (Software Defined Networking) configuration
#    - Templates and builder images
# 4. Ensure file paths and service names match your specific OKD 3.x version
#
# Note: While this script provides a solid foundation for OKD 3.x health checks,
# minor adjustments may be needed for perfect compatibility with specific 3.x versions.
#
# Usage: ./okd_health_check.sh
#
# This script will generate both JSON and HTML reports of the cluster health status.

# Script starts here

# Bash strict mode: abort on unhandled errors, on unset variables and on a
# failure in any stage of a pipeline.
set -euo pipefail

# Split words on newlines and tabs only, so values containing spaces stay intact.
IFS=$'\n\t'

# ANSI colour sequences for better readability (expanded by `echo -e`/`printf %b`).
# They are constants, so protect them against accidental reassignment.
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly NC='\033[0m' # No Color

# Unique, unpredictable report file names created safely by mktemp.
JSON_FILE=$(mktemp -t okd_health_report_XXXXXX.json)
HTML_FILE=$(mktemp -t okd_health_report_XXXXXX.html)
# Exit handler for the report files.
#
# The reports are the product of this script, so they must survive a
# successful run; they are removed only when the script exits with a non-zero
# status, to avoid leaving half-written reports behind.
#
# Globals:   JSON_FILE, HTML_FILE (read)
# Arguments: none (reads the exit status that triggered the handler)
cleanup() {
  local exit_code=$? # must be captured first: any command would overwrite $?
  if ((exit_code != 0)); then
    rm -f -- "$JSON_FILE" "$HTML_FILE"
  fi
}
trap cleanup EXIT
# Validate an identifier (for example a node name) before it is used.
#
# Arguments: $1 - value to check
# Outputs:   an error message on stderr when the value is rejected
# Returns:   0 when the value consists only of letters, digits, '_', '.' and
#            '-'; otherwise the whole shell exits with status 1.
validate_input() {
  local candidate="$1"
  local allowed_pattern='^[a-zA-Z0-9_.-]+$'

  # Guard clause: accepted values leave immediately.
  [[ "$candidate" =~ $allowed_pattern ]] && return 0

  echo "Invalid input: $candidate" >&2
  exit 1
}
# Run a command on a node over SSH with strict host-key checking.
#
# The known-hosts file defaults to the invoking user's ~/.ssh/known_hosts and
# can be overridden with the OKD_KNOWN_HOSTS_FILE environment variable (the
# previous hard-coded placeholder path made every connection fail, because
# strict checking rejects hosts that are not listed in the file).
#
# Globals:   OKD_KNOWN_HOSTS_FILE (read, optional), HOME (read)
# Arguments: $1 - node name (validated with validate_input)
#            $@ - remaining arguments: the remote command
# Returns:   exit status of ssh
safe_ssh() {
  local node="$1"
  shift
  validate_input "$node"
  # "--" ends option parsing so a node name can never be taken as an ssh option.
  ssh -o StrictHostKeyChecking=yes \
    -o "UserKnownHostsFile=${OKD_KNOWN_HOSTS_FILE:-${HOME:-}/.ssh/known_hosts}" \
    -- "$node" "$@"
}
# Append one {component, status, details} entry to the JSON report.
#
# The details argument is normalised before it is stored:
#   * a single JSON value is stored as is;
#   * a stream of several JSON values (or empty text) is stored as an array;
#   * anything else is stored as a JSON string.
# The report is rewritten through a temporary file so that a jq failure never
# truncates it, and that temporary file is removed when jq fails.
#
# Globals:   JSON_FILE (read, rewritten)
# Arguments: $1 - component name
#            $2 - status label
#            $3 - details (JSON text or plain text)
# Returns:   0 on success, 1 when the report could not be updated
write_to_json() {
  local component="$1"
  local status="$2"
  local details="${3-}"
  local details_json temp_file

  if ! details_json=$(jq -c -n --argjson d "$details" '$d' 2>/dev/null); then
    # Not a single JSON value: try to slurp it as a stream of values.
    if ! details_json=$(jq -c -s '.' <<<"$details" 2>/dev/null); then
      # Not JSON at all: keep the text, encoded as a JSON string.
      details_json=$(jq -c -n --arg d "$details" '$d')
    fi
  fi

  temp_file=$(mktemp) || return 1
  if jq --arg component "$component" --arg status "$status" \
    --argjson details "$details_json" \
    '. += [{"component": $component, "status": $status, "details": $details}]' \
    "$JSON_FILE" > "$temp_file"; then
    mv -- "$temp_file" "$JSON_FILE"
  else
    rm -f -- "$temp_file"
    return 1
  fi
}
# Print a highlighted section title and record it in the JSON report.
#
# Globals:   YELLOW, NC (read)
# Arguments: $1 - section title
# Outputs:   the title, framed by "====", on stdout
print_message() {
  local title="$1"
  local banner="${YELLOW}==== ${title} ====${NC}"

  # %b expands backslash escapes in the same way as `echo -e`.
  printf '%b\n' "$banner"
  write_to_json "$title" "Info" '""'
}
# Print a status value in colour and record it in the JSON report.
#
# "True", "Running" and "active" are treated as healthy (green, status "OK");
# every other value is reported in red with status "Problem".
#
# Globals:   GREEN, RED, NC (read)
# Arguments: $1 - status value to evaluate
#            $2 - component name used in the report
# Outputs:   the coloured status value on stdout
check_status() {
  local value="$1"
  local component="$2"
  local colour verdict details

  case "$value" in
    True | Running | active)
      colour="$GREEN"
      verdict="OK"
      ;;
    *)
      colour="$RED"
      verdict="Problem"
      ;;
  esac

  # Only the colour codes are escape-expanded; the value is printed verbatim.
  printf '%b%s%b\n' "$colour" "$value" "$NC"

  # Let jq do the JSON string encoding so quotes and backslashes in the value
  # cannot produce invalid JSON.
  details=$(jq -n --arg v "$value" '$v')
  write_to_json "$component" "$verdict" "$details"
}
# Verify that every external tool used by the checks is available.
#
# The section title is normally printed with print_message, which records it
# in the JSON report and therefore needs jq. When jq itself is missing the
# title is printed directly, so the user gets the list of missing tools
# instead of a "command not found" abort.
#
# Globals:   RED, GREEN, YELLOW, NC (read)
# Outputs:   one line per tool on stdout (installed) or stderr (missing)
# Returns:   0 when all tools are present; exits the script with status 1
#            otherwise.
check_prerequisites() {
  local -a prerequisites=("jq" "oc" "etcdctl" "openssl")
  local -a missing_tools=()
  local tool

  if command -v jq &> /dev/null; then
    print_message "Checking prerequisites"
  else
    echo -e "${YELLOW}==== Checking prerequisites ====${NC}"
  fi

  for tool in "${prerequisites[@]}"; do
    if ! command -v "$tool" &> /dev/null; then
      echo -e "${RED}$tool is not installed or not in PATH${NC}" >&2
      missing_tools+=("$tool")
    else
      echo -e "${GREEN}$tool is installed${NC}"
    fi
  done

  if ((${#missing_tools[@]} != 0)); then
    echo -e "${RED}Please install the missing tools before running this script:${NC}" >&2
    for tool in "${missing_tools[@]}"; do
      echo "  - $tool" >&2
    done
    exit 1
  fi

  echo -e "${GREEN}All prerequisites are met${NC}"
  write_to_json "Prerequisites" "OK" '"All required tools are installed"'
}
# Report the last status condition of every node.
#
# Prints a table (name, type, reason and message of the last condition) and
# records the same data in the JSON report as an array with one object per
# node.
#
# Outputs: the node table on stdout
check_nodes() {
  local columns node_status node_json

  print_message "Detailed verification of node status"

  # Kept in a quoted variable: the "[-1]" subscripts are glob characters.
  columns='NAME:.metadata.name,STATUS:.status.conditions[-1].type,REASON:.status.conditions[-1].reason,MESSAGE:.status.conditions[-1].message'
  node_status=$(oc get nodes -o "custom-columns=${columns}")
  echo "$node_status"

  # Wrapped in [...] so that any number of nodes yields one valid JSON value.
  node_json=$(oc get nodes -o json \
    | jq '[.items[] | {name: .metadata.name, status: .status.conditions[-1].type, reason: .status.conditions[-1].reason, message: .status.conditions[-1].message}]')
  write_to_json "Nodes" "Info" "$node_json"
  echo ""
}
# Inspect every node whose status starts with "NotReady".
#
# For each such node the memory, root-disk and CPU usage are collected over
# SSH; nodes that cannot be reached are recorded with an error entry.
#
# Outputs: progress and results on stdout, connection errors on stderr
# Returns: 0; the report gets "Node Readiness"/"OK" when every node is ready,
#          otherwise a "NotReady Nodes"/"Warning" entry with one object per node.
check_node_readiness() {
  local not_ready_nodes node mem_usage disk_usage cpu_usage node_info
  local -a not_ready_info=()

  print_message "Detailed verification of NotReady nodes"

  # Prefix match so that "NotReady,SchedulingDisabled" is caught as well.
  not_ready_nodes=$(oc get nodes | awk '$2 ~ /^NotReady/ {print $1}')
  if [[ -z "$not_ready_nodes" ]]; then
    echo "All nodes are Ready"
    write_to_json "Node Readiness" "OK" '"All nodes are Ready"'
    return
  fi

  while IFS= read -r node; do
    echo "Analyzing NotReady node: $node"
    validate_input "$node"
    echo "Test: Checking system resources"
    echo "Explanation: We'll check memory, disk, and CPU usage. High utilization can cause NotReady state."
    # Every ssh call reads from /dev/null: otherwise ssh would swallow the
    # rest of the node list feeding this loop and only the first node would
    # ever be analysed.
    if safe_ssh "$node" "true" < /dev/null &> /dev/null; then
      # A failing probe must not abort the whole health check.
      mem_usage=$(safe_ssh "$node" "free -m | awk '/^Mem:/ {print \$3/\$2 * 100.0}'" < /dev/null) \
        || mem_usage="unknown"
      disk_usage=$(safe_ssh "$node" "df -h / | awk 'NR==2 {print \$5}' | tr -d '%'" < /dev/null) \
        || disk_usage="unknown"
      cpu_usage=$(safe_ssh "$node" "top -bn1 | awk '/^%Cpu/ {print \$2 + \$4}'" < /dev/null) \
        || cpu_usage="unknown"
      echo "Results: Memory: ${mem_usage}%, Disk: ${disk_usage}%, CPU: ${cpu_usage}%"
      node_info=$(jq -n \
        --arg node "$node" \
        --arg mem "$mem_usage" \
        --arg disk "$disk_usage" \
        --arg cpu "$cpu_usage" \
        '{node: $node, memory: $mem, disk: $disk, cpu: $cpu}')
      not_ready_info+=("$node_info")
    else
      echo "ERROR: Unable to connect to the node via SSH" >&2
      not_ready_info+=("$(jq -n --arg node "$node" '{node: $node, error: "Unable to SSH"}')")
    fi
    echo "--------------------"
  done <<< "$not_ready_nodes"

  write_to_json "NotReady Nodes" "Warning" \
    "$(printf '%s\n' "${not_ready_info[@]}" | jq -s '.')"
}
# Report phase and restart count of the pods in the system namespaces.
#
# Namespaces inspected: default, openshift-infra, openshift-node, kube-system.
# The restart count is the one of the first container of each pod.
#
# Outputs: one line per pod, grouped by namespace, on stdout
check_system_pods() {
  # All working variables are local so nothing leaks into the global scope.
  local ns pods_json
  local -a system_pods_info=()

  print_message "Detailed verification of system pods"

  for ns in default openshift-infra openshift-node kube-system; do
    echo "Namespace: $ns"
    pods_json=$(oc get pods -n "$ns" -o json \
      | jq '.items[] | {name: .metadata.name, status: .status.phase, restarts: .status.containerStatuses[0].restartCount}')
    system_pods_info+=("$pods_json")
    jq -r '.name + " - Status: " + .status + ", Restarts: " + (.restarts | tostring)' <<< "$pods_json"
    echo ""
  done

  # Each element holds a stream of pod objects; slurp them into one array.
  write_to_json "System Pods" "Info" \
    "$(printf '%s\n' "${system_pods_info[@]}" | jq -s '.')"
}
# Report the state of the key systemd services of an OKD 3.x host.
#
# Services inspected: atomic-openshift-master, atomic-openshift-node, docker.
# For each one the `is-active` state is evaluated with check_status, and the
# "Active:", "Memory:" and "Tasks:" lines of `systemctl status` are reported.
#
# Outputs: a short summary per service on stdout
check_key_services() {
  local service status service_details active_status memory_usage tasks service_info
  local -a services_info=()

  print_message "Detailed verification of key services"

  for service in atomic-openshift-master atomic-openshift-node docker; do
    echo "Service: $service"

    # `systemctl is-active` and `systemctl status` return non-zero for any
    # unit that is not running. That is a result to report, not a reason to
    # abort the health check under `set -e`.
    status=$(systemctl is-active "$service" 2> /dev/null) || true
    [[ -n "$status" ]] || status="unknown"
    echo -n "Status: "
    check_status "$status" "Service $service"

    service_details=$(systemctl status "$service" --no-pager 2> /dev/null) || true
    # sed -n ... p yields an empty string (not a failure) when a line is absent.
    active_status=$(sed -n 's/^ *Active: //p' <<< "$service_details")
    memory_usage=$(sed -n 's/^ *Memory: //p' <<< "$service_details")
    tasks=$(sed -n 's/^ *Tasks: //p' <<< "$service_details")
    echo "Active: $active_status"
    echo "Memory: $memory_usage"
    echo "Tasks: $tasks"
    echo ""

    service_info=$(jq -n \
      --arg name "$service" \
      --arg status "$status" \
      --arg active "$active_status" \
      --arg memory "$memory_usage" \
      --arg tasks "$tasks" \
      '{name: $name, status: $status, active: $active, memory: $memory, tasks: $tasks}')
    services_info+=("$service_info")
  done

  write_to_json "Key Services" "Info" \
    "$(printf '%s\n' "${services_info[@]}" | jq -s '.')"
}
# Report etcd cluster health and membership.
#
# The etcd settings are read from the etcd configuration file
# (/etc/etcd/etcd.conf unless ETCD_CONF_FILE is set). A missing configuration
# file or a failing etcdctl call is recorded with status "Problem" instead of
# aborting the remaining checks.
#
# Globals:   ETCD_CONF_FILE (read, optional); the variables defined by the
#            sourced configuration file (ETCD_PEER_CERT_FILE,
#            ETCD_PEER_KEY_FILE, ETCD_LISTEN_CLIENT_URLS) become global
#            RED, NC (read)
# Outputs:   etcdctl output on stdout
# Returns:   0
check_etcd() {
  local etcd_conf="${ETCD_CONF_FILE:-/etc/etcd/etcd.conf}"
  local etcd_health etcd_members etcd_info
  local etcd_state="Info"
  local -a etcd_args

  print_message "Detailed verification of etcd"

  if [[ ! -r "$etcd_conf" ]]; then
    echo -e "${RED}Cannot read $etcd_conf; skipping etcd checks${NC}" >&2
    write_to_json "etcd" "Problem" \
      "$(jq -n --arg file "$etcd_conf" '{error: ("Cannot read " + $file)}')"
    echo ""
    return 0
  fi

  # shellcheck source=/dev/null
  source "$etcd_conf"

  etcd_args=(
    --cert-file="${ETCD_PEER_CERT_FILE:-}"
    --key-file="${ETCD_PEER_KEY_FILE:-}"
    --ca-file=/etc/etcd/ca.crt
    --endpoints="${ETCD_LISTEN_CLIENT_URLS:-}"
  )

  # etcdctl exits non-zero for an unhealthy or unreachable cluster; keep its
  # output and flag the entry instead of aborting under `set -e`.
  etcd_health=$(etcdctl "${etcd_args[@]}" cluster-health 2>&1) || etcd_state="Problem"
  echo "$etcd_health"

  etcd_members=$(etcdctl "${etcd_args[@]}" member list 2>&1) || etcd_state="Problem"
  echo "etcd members:"
  echo "$etcd_members"

  etcd_info=$(jq -n \
    --arg health "$etcd_health" \
    --arg members "$etcd_members" \
    '{health: $health, members: $members}')
  write_to_json "etcd" "$etcd_state" "$etcd_info"
  echo ""
}
# Report the most recent error-level journal entries of the master service.
#
# Reads up to 50 entries of priority "err" for atomic-openshift-master.
# journalctl's own banner lines (for example "-- Logs begin at ... --" or
# "-- No entries --") are filtered out, so an empty result really means
# "no recent errors".
#
# Globals: RED, NC (read)
# Outputs: the error lines on stdout
# Returns: 0
check_master_logs() {
  local raw_log errors

  print_message "Checking master logs (last errors)"

  if ! raw_log=$(journalctl -u atomic-openshift-master -p err --no-pager -n 50 2>&1); then
    # Not being able to read the journal is itself worth reporting.
    echo -e "${RED}Unable to read the journal of atomic-openshift-master${NC}" >&2
    write_to_json "Master Logs" "Problem" "$(jq -n --arg msg "$raw_log" '$msg')"
    echo ""
    return 0
  fi

  # grep exits 1 when it prints nothing; that is the healthy case here.
  errors=$(grep -v '^-- .* --$' <<< "$raw_log" || true)

  if [[ -z "$errors" ]]; then
    write_to_json "Master Logs" "OK" '"No recent errors"'
  else
    # One array element per log line.
    write_to_json "Master Logs" "Warning" "$(jq -R -n -c '[inputs]' <<< "$errors")"
  fi
  echo "$errors"
  echo ""
}
# Print at most $2 non-blank rows read from stdin, sorted numerically in
# descending order on whitespace-separated column $1. awk consumes the whole
# input, so `sort` never receives SIGPIPE (which would trip `pipefail`).
_okd_top_rows() {
  sort -k"$1,$1" -rn | awk -v limit="$2" 'NF && ++seen <= limit'
}

# Convert whitespace-separated rows read from stdin into a JSON array holding
# one array of fields per row.
_okd_rows_to_json() {
  jq -R -n '[inputs | split(" ") | map(select(length > 0)) | select(length > 0)]'
}

# Report the biggest CPU and memory consumers among nodes and pods.
#
# Uses `oc adm top`: top 5 nodes by CPU% and by MEMORY%, top 10 pods (all
# namespaces) by CPU and by memory. The header line is removed before sorting
# so it can neither be sorted into the data nor push a real row out.
#
# Outputs: the four rankings on stdout (data rows only)
check_resource_usage() {
  local node_rows pod_rows cpu_usage mem_usage pod_cpu_usage pod_mem_usage resource_usage

  print_message "Detailed verification of resource usage"

  # Query each resource once and drop the header row.
  node_rows=$(oc adm top nodes | tail -n +2)
  pod_rows=$(oc adm top pods --all-namespaces | tail -n +2)

  # Node columns: NAME CPU(cores) CPU% MEMORY(bytes) MEMORY%
  echo "Top 5 nodes by CPU usage:"
  cpu_usage=$(_okd_top_rows 3 5 <<< "$node_rows")
  echo "$cpu_usage"

  echo "Top 5 nodes by memory usage:"
  mem_usage=$(_okd_top_rows 5 5 <<< "$node_rows")
  echo "$mem_usage"

  # Pod columns: NAMESPACE NAME CPU(cores) MEMORY(bytes)
  echo "Top 10 pods by CPU usage:"
  pod_cpu_usage=$(_okd_top_rows 3 10 <<< "$pod_rows")
  echo "$pod_cpu_usage"

  echo "Top 10 pods by memory usage:"
  pod_mem_usage=$(_okd_top_rows 4 10 <<< "$pod_rows")
  echo "$pod_mem_usage"

  # Let jq build the JSON: every ranking becomes a proper array of objects.
  resource_usage=$(jq -n \
    --argjson node_cpu "$(_okd_rows_to_json <<< "$cpu_usage")" \
    --argjson node_mem "$(_okd_rows_to_json <<< "$mem_usage")" \
    --argjson pod_cpu "$(_okd_rows_to_json <<< "$pod_cpu_usage")" \
    --argjson pod_mem "$(_okd_rows_to_json <<< "$pod_mem_usage")" \
    '{
      node_cpu: ($node_cpu | map({node: .[0], cpu: .[1], cpu_percent: .[2]})),
      node_memory: ($node_mem | map({node: .[0], memory: .[3], memory_percent: .[4]})),
      pod_cpu: ($pod_cpu | map({namespace: .[0], pod: .[1], cpu: .[2]})),
      pod_memory: ($pod_mem | map({namespace: .[0], pod: .[1], memory: .[3]}))
    }')
  write_to_json "Resource Usage" "Info" "$resource_usage"
  echo ""
}
# Report the router deployment and the state of its pods.
#
# Prints `oc describe dc/router` for the default namespace, then phase and
# restart count (first container) of every pod labelled router=router. The
# pods are recorded in the JSON report as an array.
#
# Outputs: deployment description and pod summary on stdout
check_router() {
  local router_dc router_status

  print_message "Detailed verification of the router"

  router_dc=$(oc describe dc/router -n default)
  echo "$router_dc"
  echo ""

  # Wrapped in [...] so zero, one or several router pods all give valid JSON.
  router_status=$(oc get pods -n default -l router=router -o json \
    | jq '[.items[] | {name: .metadata.name, status: .status.phase, restarts: .status.containerStatuses[0].restartCount}]')
  echo "Router status:"
  jq -r '.[] | .name + " - Status: " + .status + ", Restarts: " + (.restarts | tostring)' <<< "$router_status"
  write_to_json "Router" "Info" "$router_status"
  echo ""
}
# Report the docker-registry deployment and the state of its pods.
#
# Prints `oc describe dc/docker-registry` for the default namespace, then
# phase and restart count (first container) of every pod labelled
# deploymentconfig=docker-registry. The pods are recorded in the JSON report
# as an array.
#
# Outputs: deployment description and pod summary on stdout
check_registry() {
  local registry_dc registry_status

  print_message "Detailed verification of the registry"

  registry_dc=$(oc describe dc/docker-registry -n default)
  echo "$registry_dc"
  echo ""

  # Wrapped in [...] so zero, one or several registry pods all give valid JSON.
  registry_status=$(oc get pods -n default -l deploymentconfig=docker-registry -o json \
    | jq '[.items[] | {name: .metadata.name, status: .status.phase, restarts: .status.containerStatuses[0].restartCount}]')
  echo "Registry status:"
  jq -r '.[] | .name + " - Status: " + .status + ", Restarts: " + (.restarts | tostring)' <<< "$registry_status"
  write_to_json "Registry" "Info" "$registry_status"
  echo ""
}
# Report resource quotas and limit ranges of all namespaces.
#
# Prints a one-line summary per quota and per limit range, and records the
# complete `oc get ... -o json` answers in the JSON report.
#
# Outputs: the summaries on stdout
check_quotas_and_limits() {
  # Working variables are local so nothing leaks into the global scope.
  local quotas limitranges quotas_and_limits

  print_message "Verification of quotas and limits"

  # Get quotas
  quotas=$(oc get quota --all-namespaces -o json)
  echo "Cluster Quotas:"
  jq -r '.items[] | "Namespace: \(.metadata.namespace), Name: \(.metadata.name), Used: \(.status.used), Hard: \(.status.hard)"' <<< "$quotas"

  # Get limit ranges
  limitranges=$(oc get limitrange --all-namespaces -o json)
  echo "Limit Ranges:"
  jq -r '.items[] | "Namespace: \(.metadata.namespace), Name: \(.metadata.name), Limits: \(.spec.limits)"' <<< "$limitranges"

  # Add to JSON
  quotas_and_limits=$(jq -n \
    --argjson quotas "$quotas" \
    --argjson limitranges "$limitranges" \
    '{quotas: $quotas, limitranges: $limitranges}')
  write_to_json "Quotas and Limits" "Info" "$quotas_and_limits"
  echo ""
}
# Report persistent volumes and persistent volume claims.
#
# Prints a one-line summary per PV and per PVC (all namespaces), and records
# the complete `oc get ... -o json` answers in the JSON report.
#
# Outputs: the summaries on stdout
check_storage() {
  # Working variables are local so nothing leaks into the global scope.
  local pv_status pvc_status storage_info

  print_message "Verification of persistent storage"

  pv_status=$(oc get pv -o json)
  pvc_status=$(oc get pvc --all-namespaces -o json)

  echo "Persistent Volumes:"
  jq -r '.items[] | "Name: \(.metadata.name), Capacity: \(.spec.capacity.storage), Status: \(.status.phase)"' <<< "$pv_status"
  echo ""
  echo "Persistent Volume Claims:"
  jq -r '.items[] | "Namespace: \(.metadata.namespace), Name: \(.metadata.name), Status: \(.status.phase), Volume: \(.spec.volumeName)"' <<< "$pvc_status"

  storage_info=$(jq -n \
    --argjson pv "$pv_status" \
    --argjson pvc "$pvc_status" \
    '{persistent_volumes: $pv, persistent_volume_claims: $pvc}')
  write_to_json "Persistent Storage" "Info" "$storage_info"
  echo ""
}
# Report pending certificate signing requests and certificate expiry dates.
#
# A CSR counts as pending when it carries no status condition yet. For each of
# the four main certificate files the remaining validity is computed:
# less than 30 days is "Critical", less than 90 days is "Warning", otherwise
# "OK". Missing or unreadable files are recorded with status "Error".
#
# Globals: RED, YELLOW, GREEN, NC (read)
# Outputs: the CSR count and one paragraph per certificate on stdout
# Returns: 0
check_certificates() {
  local pending_csr pending_csr_count certificates_data
  local cert_idx cert_file cert_name cert_status
  local expiry_date expiry_epoch current_epoch days_left
  local -a cert_files=(
    "/etc/origin/master/master.server.crt"
    "/etc/origin/master/etcd.server.crt"
    "/etc/origin/master/ca.crt"
    "/etc/origin/node/server.crt"
  )
  local -a cert_names=(
    "Master Server"
    "etcd Server"
    "Cluster CA"
    "Node Server"
  )
  local -a cert_info=()

  print_message "Detailed verification of certificates"

  # Check pending CSRs. The result is always a JSON array, so the healthy
  # case (no pending CSR) yields [] instead of empty text.
  if ! pending_csr=$(oc get csr -o json \
    | jq '[.items[] | select((.status.conditions // []) | length == 0)]'); then
    echo "WARNING: unable to list certificate signing requests" >&2
    pending_csr='[]'
  fi
  [[ -n "$pending_csr" ]] || pending_csr='[]'
  pending_csr_count=$(jq 'length' <<< "$pending_csr")
  echo "Pending Certificate Signing Requests (CSRs): $pending_csr_count"

  # Check main certificates
  current_epoch=$(date +%s)
  for cert_idx in "${!cert_files[@]}"; do
    cert_file="${cert_files[cert_idx]}"
    cert_name="${cert_names[cert_idx]}"

    if [[ ! -f "$cert_file" ]]; then
      echo "File $cert_file does not exist."
      cert_info+=("$(jq -n \
        --arg name "$cert_name" \
        '{name: $name, status: "Error", message: "File not found"}')")
      continue
    fi

    # An unreadable or malformed certificate is reported, not fatal. The
    # emptiness test matters: `date -d ""` succeeds and would return today.
    expiry_date=$(openssl x509 -in "$cert_file" -noout -enddate 2> /dev/null | cut -d= -f2) \
      || expiry_date=""
    if [[ -z "$expiry_date" ]] \
      || ! expiry_epoch=$(date -d "$expiry_date" +%s 2> /dev/null); then
      echo "Unable to read the expiry date of $cert_file." >&2
      cert_info+=("$(jq -n \
        --arg name "$cert_name" \
        '{name: $name, status: "Error", message: "Unable to read expiry date"}')")
      continue
    fi

    days_left=$(((expiry_epoch - current_epoch) / 86400))
    echo "Certificate: $cert_name"
    echo "Expires on: $expiry_date"
    echo "Days left: $days_left"

    cert_status="OK"
    if ((days_left < 30)); then
      echo -e "${RED}ALERT: Certificate expires in less than 30 days!${NC}"
      cert_status="Critical"
    elif ((days_left < 90)); then
      echo -e "${YELLOW}WARNING: Certificate expires in less than 90 days.${NC}"
      cert_status="Warning"
    else
      echo -e "${GREEN}OK: Certificate is valid for more than 90 days.${NC}"
    fi

    cert_info+=("$(jq -n \
      --arg name "$cert_name" \
      --arg expiry "$expiry_date" \
      --arg days_left "$days_left" \
      --arg status "$cert_status" \
      '{name: $name, expiry: $expiry, days_left: ($days_left | tonumber), status: $status}')")
    echo ""
  done

  certificates_data=$(jq -n \
    --argjson pending "$pending_csr" \
    --argjson certs "$(printf '%s\n' "${cert_info[@]}" | jq -s '.')" \
    '{pending_csr: $pending, certificates: $certs}')
  write_to_json "Certificates" "Info" "$certificates_data"
}
# Render the JSON report as an HTML table.
#
# Every value taken from the report (component, status, details) is
# HTML-escaped with jq's @html filter, so cluster data containing characters
# such as "<", "&" or quotes cannot break or inject markup. Non-string details
# are rendered as their JSON text.
#
# Globals: JSON_FILE (read), HTML_FILE (overwritten)
generate_html_report() {
  # Quoted delimiter: the template is static, nothing in it must be expanded.
  cat << 'EOF' > "$HTML_FILE"
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>OKD Cluster Health Report</title>
<style>
body { font-family: Arial, sans-serif; line-height: 1.6; color: #333; max-width: 800px; margin: 0 auto; padding: 20px; }
h1 { color: #2c3e50; }
table { width: 100%; border-collapse: collapse; margin-bottom: 20px; }
th, td { padding: 12px; text-align: left; border-bottom: 1px solid #ddd; }
th { background-color: #f2f2f2; }
.ok { color: green; }
.problem { color: red; }
.warning { color: orange; }
.info { color: blue; }
</style>
</head>
<body>
<h1>OKD Cluster Health Report</h1>
<table>
<tr>
<th>Component</th>
<th>Status</th>
<th>Details</th>
</tr>
EOF
  # @html applies to each \(...) interpolation, not to the literal markup.
  jq -r '.[] | @html "<tr><td>\(.component)</td><td class=\"\(.status | ascii_downcase)\">\(.status)</td><td>\(.details)</td></tr>"' \
    "$JSON_FILE" >> "$HTML_FILE"
  echo "</table></body></html>" >> "$HTML_FILE"
}
# Entry point: run every health check in order, then render the HTML report.
#
# Globals: JSON_FILE (initialised, then filled by the checks), HTML_FILE
# Outputs: the output of each check, followed by the location of both reports
main() {
  local _stage
  # Checks in execution order; the HTML rendering comes last because it reads
  # the JSON produced by all the others.
  local -a _stages=(
    check_prerequisites
    check_nodes
    check_node_readiness
    check_system_pods
    check_key_services
    check_etcd
    check_master_logs
    check_resource_usage
    check_router
    check_registry
    check_quotas_and_limits
    check_storage
    check_certificates
    generate_html_report
  )

  # Start from an empty JSON array that every check appends to.
  printf '%s\n' "[]" > "$JSON_FILE"

  for _stage in "${_stages[@]}"; do
    "$_stage"
  done

  printf 'JSON report generated: %s\n' "$JSON_FILE"
  printf 'HTML report generated: %s\n' "$HTML_FILE"

  # Restrict both reports to the owner.
  chmod 600 "$JSON_FILE" "$HTML_FILE"
}

# Run the entry point
main
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment