Skip to content

Instantly share code, notes, and snippets.

@pamtrak06
Last active October 7, 2024 11:57
Show Gist options
  • Save pamtrak06/dee4471bd1158e8b32c40fc8a2eb5a75 to your computer and use it in GitHub Desktop.
Save pamtrak06/dee4471bd1158e8b32c40fc8a2eb5a75 to your computer and use it in GitHub Desktop.
#!/bin/bash
# OKD 3.x Cluster Health Check Script
# ===================================
#
# This script is designed to perform a comprehensive health check on an OKD 3.x cluster.
# It covers various aspects of the cluster including nodes, pods, services, and resources.
#
# Compatibility:
# - Generally suitable for OKD version 3.x
# - Uses commands and concepts compatible with OKD 3.x (oc CLI, etcd, router, registry)
# - References OKD 3.x specific services (atomic-openshift-master, atomic-openshift-node)
# - File paths are consistent with OKD 3.x structure
# - Checks resources like quotas, limits, and persistent volumes
#
# Potential Adjustments for Specific OKD 3.x Versions:
# 1. Verify all `oc` commands are available in your target 3.x version
# 2. Confirm namespaces (openshift-infra, openshift-node) are correct for your version
# 3. Consider adding checks for OKD 3.x specific components:
# - SDN (Software Defined Networking) configuration
# - Templates and builder images
# 4. Ensure file paths and service names match your specific OKD 3.x version
#
# Note: While this script provides a solid foundation for OKD 3.x health checks,
# minor adjustments may be needed for perfect compatibility with specific 3.x versions.
#
# Usage: ./okd_health_check.sh
#
# This script will generate both JSON and HTML reports of the cluster health status.
# Script starts here
# Enable Bash strict mode: abort on command errors, on unset variables and on
# any failing pipeline stage.
set -o errexit -o nounset -o pipefail
# Restrict word splitting to newlines and tabs to avoid word-splitting issues.
IFS=$'\n\t'
# Colour escape sequences for better readability (expanded later by `echo -e`).
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No colour (reset)
# Create unique, safely named report files.
JSON_FILE="$(mktemp -t okd_health_report_XXXXXX.json)"
HTML_FILE="$(mktemp -t okd_health_report_XXXXXX.html)"
#######################################
# Exit handler for the report files.
# The files are removed only when the script exits with a non-zero status
# (partial reports); on success they are kept, because they are the reports
# whose paths main() prints to the user.
# Globals:   JSON_FILE, HTML_FILE (read)
# Arguments: none (reads the pending exit status from $?)
#######################################
cleanup() {
  # Must be read first: $? still holds the status the script is exiting with.
  local exit_code=$?
  if (( exit_code != 0 )); then
    rm -f -- "${JSON_FILE:-}" "${HTML_FILE:-}"
  fi
}
trap cleanup EXIT
#######################################
# Validate an identifier (such as a node name) before it is used.
# Accepted: letters, digits, "_", "." and "-", starting with a letter or digit.
# Arguments: $1 - value to validate
# Outputs:   "Invalid input: ..." on stderr when the value is rejected
# Returns:   0 when valid; otherwise exits the shell with status 1
#######################################
validate_input() {
  local input="$1"
  # The first character must be alphanumeric so the value can never be parsed
  # as a command-line option (e.g. "-oProxyCommand=...") by the program that
  # receives it.
  if [[ ! "$input" =~ ^[a-zA-Z0-9][a-zA-Z0-9_.-]*$ ]]; then
    echo "Invalid input: $input" >&2
    exit 1
  fi
}
#######################################
# Run a command on a remote node over SSH with strict host-key checking.
# ssh's standard known_hosts files are used unless OKD_KNOWN_HOSTS_FILE names
# an alternative file.
# Globals:   OKD_KNOWN_HOSTS_FILE (read, optional)
# Arguments: $1 - node name (checked with validate_input)
#            remaining - command and arguments to run on the node
# Returns:   exit status of ssh
#######################################
safe_ssh() {
  local node="$1"
  shift
  validate_input "$node"
  local -a ssh_opts=(-o StrictHostKeyChecking=yes)
  if [[ -n "${OKD_KNOWN_HOSTS_FILE:-}" ]]; then
    ssh_opts+=(-o "UserKnownHostsFile=${OKD_KNOWN_HOSTS_FILE}")
  fi
  # "--" ends option parsing so the node name is always taken as the host.
  ssh "${ssh_opts[@]}" -- "$node" "$@"
}
#######################################
# Append one entry to the JSON report.
# Globals:   JSON_FILE (read and replaced)
# Arguments: $1 - component name
#            $2 - status label
#            $3 - details as JSON text. A single JSON value is stored as-is;
#                 a stream of several values (or an empty stream) is stored
#                 as an array of those values.
# Returns:   0 on success; non-zero when the details are not valid JSON or
#            the report file cannot be updated
#######################################
write_to_json() {
  local component="$1" status="$2" details="$3"
  local details_file temp_file rc=0
  details_file=$(mktemp)
  temp_file=$(mktemp)
  # The details travel through stdin and a file rather than through a jq
  # command-line argument, so large payloads are not limited by the maximum
  # argument size of the operating system.
  jq -c -s 'if length == 1 then .[0] else . end' <<< "$details" > "$details_file" \
    && jq --arg component "$component" --arg status "$status" \
      --slurpfile details "$details_file" \
      '. += [{"component": $component, "status": $status, "details": $details[0]}]' \
      "$JSON_FILE" > "$temp_file" \
    && mv -- "$temp_file" "$JSON_FILE" \
    || rc=$?
  # Never leave temporary files behind, whatever the outcome.
  rm -f -- "$details_file" "$temp_file"
  return "$rc"
}
#######################################
# Print a highlighted section heading and record it in the JSON report.
# Globals:   YELLOW, NC (read)
# Arguments: $1 - heading text
#######################################
print_message() {
  local heading="$1"
  # %b expands backslash escapes the same way `echo -e` does.
  printf '%b\n' "${YELLOW}==== ${heading} ====${NC}"
  write_to_json "${heading}" "Info" '""'
}
#######################################
# Print a status value in colour and record the verdict in the JSON report.
# "True", "Running" and "active" count as healthy; any other value is
# reported as a problem.
# Globals:   GREEN, RED, NC (read)
# Arguments: $1 - status value
#            $2 - component name used in the report
#######################################
check_status() {
  local colour verdict
  case "$1" in
    True | Running | active)
      colour="$GREEN"
      verdict="OK"
      ;;
    *)
      colour="$RED"
      verdict="Problem"
      ;;
  esac
  echo -e "${colour}$1${NC}"
  write_to_json "$2" "$verdict" "\"$1\""
}
#######################################
# Verify that every external tool the script relies on is available.
# The tools are checked BEFORE anything is written to the JSON report,
# because writing the report itself requires jq.
# Globals:   RED, GREEN, YELLOW, NC (read)
# Outputs:   one line per tool on stdout; missing tools are listed on stderr
# Returns:   0 when all tools are present; otherwise exits with status 1
#######################################
check_prerequisites() {
  # systemctl and journalctl are included because the service and log checks
  # call them unconditionally.
  local -a prerequisites=("jq" "oc" "etcdctl" "openssl" "systemctl" "journalctl")
  local -a missing_tools=()
  local tool

  # Same heading print_message would show; the matching report entry is
  # written further down, once jq is known to exist.
  echo -e "${YELLOW}==== Checking prerequisites ====${NC}"
  for tool in "${prerequisites[@]}"; do
    if command -v "$tool" &> /dev/null; then
      echo -e "${GREEN}$tool is installed${NC}"
    else
      echo -e "${RED}$tool is not installed or not in PATH${NC}" >&2
      missing_tools+=("$tool")
    fi
  done

  if (( ${#missing_tools[@]} != 0 )); then
    echo -e "${RED}Please install the missing tools before running this script:${NC}" >&2
    for tool in "${missing_tools[@]}"; do
      echo " - $tool" >&2
    done
    exit 1
  fi

  write_to_json "Checking prerequisites" "Info" '""'
  echo -e "${GREEN}All prerequisites are met${NC}"
  write_to_json "Prerequisites" "OK" '"All required tools are installed"'
}
#######################################
# Show the last reported condition of every node and record it in the report.
# Outputs:   node table on stdout
# Report:    "Nodes" entry whose details are an array with one object per
#            node (name, status, reason, message)
#######################################
check_nodes() {
  print_message "Detailed verification of node status"
  local node_status node_json
  # Quoted so the shell never treats the "[-1]" subscripts as a glob pattern.
  node_status=$(oc get nodes -o 'custom-columns=NAME:.metadata.name,STATUS:.status.conditions[-1].type,REASON:.status.conditions[-1].reason,MESSAGE:.status.conditions[-1].message')
  echo "$node_status"
  # The per-node objects are collected into ONE array so the details form a
  # single JSON document, however many nodes the cluster has.
  node_json=$(oc get nodes -o json \
    | jq '[.items[] | {name: .metadata.name, status: .status.conditions[-1].type, reason: .status.conditions[-1].reason, message: .status.conditions[-1].message}]')
  write_to_json "Nodes" "Info" "$node_json"
  echo ""
}
#######################################
# Analyse every node that is not Ready: connect over SSH and collect memory,
# disk and CPU usage.
# Outputs:   progress and results on stdout, connection errors on stderr
# Report:    "Node Readiness" (OK) when all nodes are Ready, otherwise
#            "NotReady Nodes" (Warning) with one object per analysed node
#######################################
check_node_readiness() {
  print_message "Detailed verification of NotReady nodes"
  local not_ready_nodes node mem_usage disk_usage cpu_usage node_info
  local -a not_ready_info=()

  # STATUS can be a comma-separated list such as "NotReady,SchedulingDisabled",
  # so match NotReady as one element of that list rather than the whole field.
  not_ready_nodes=$(oc get nodes | awk '$2 ~ /(^|,)NotReady(,|$)/ {print $1}')
  if [[ -z "$not_ready_nodes" ]]; then
    echo "All nodes are Ready"
    write_to_json "Node Readiness" "OK" '"All nodes are Ready"'
    return
  fi

  while IFS= read -r node; do
    echo "Analyzing NotReady node: $node"
    validate_input "$node"
    echo "Test: Checking system resources"
    echo "Explanation: We'll check memory, disk, and CPU usage. High utilization can cause NotReady state."
    # Every SSH call reads from /dev/null: otherwise ssh would consume the
    # rest of the node list that feeds this loop on stdin, and only the first
    # node would ever be analysed.
    if safe_ssh "$node" "true" < /dev/null &> /dev/null; then
      mem_usage=$(safe_ssh "$node" "free -m | awk '/^Mem:/ {print \$3/\$2 * 100.0}'" < /dev/null)
      disk_usage=$(safe_ssh "$node" "df -h / | awk 'NR==2 {print \$5}' | tr -d '%'" < /dev/null)
      cpu_usage=$(safe_ssh "$node" "top -bn1 | awk '/^%Cpu/ {print \$2 + \$4}'" < /dev/null)
      echo "Results: Memory: ${mem_usage}%, Disk: ${disk_usage}%, CPU: ${cpu_usage}%"
      node_info=$(jq -n \
        --arg node "$node" \
        --arg mem "$mem_usage" \
        --arg disk "$disk_usage" \
        --arg cpu "$cpu_usage" \
        '{node: $node, memory: $mem, disk: $disk, cpu: $cpu}')
      not_ready_info+=("$node_info")
    else
      echo "ERROR: Unable to connect to the node via SSH" >&2
      not_ready_info+=("$(jq -n --arg node "$node" '{node: $node, error: "Unable to SSH"}')")
    fi
    echo "--------------------"
  done <<< "$not_ready_nodes"

  write_to_json "NotReady Nodes" "Warning" \
    "$(printf '%s\n' "${not_ready_info[@]}" | jq -s '.')"
}
#######################################
# List the pods of the system namespaces with their phase and the restart
# count of their first container.
# Outputs:   one line per pod on stdout, grouped by namespace
# Report:    "System Pods" entry whose details are an array of pod objects
#######################################
check_system_pods() {
  print_message "Detailed verification of system pods"
  # All working variables are local so nothing leaks into the global scope.
  local ns pods_json
  local -a system_pods_info=()
  for ns in default openshift-infra openshift-node kube-system; do
    echo "Namespace: $ns"
    pods_json=$(oc get pods -n "$ns" -o json \
      | jq '.items[] | {name: .metadata.name, status: .status.phase, restarts: .status.containerStatuses[0].restartCount}')
    system_pods_info+=("$pods_json")
    jq -r '.name + " - Status: " + .status + ", Restarts: " + (.restarts | tostring)' <<< "$pods_json"
    echo ""
  done
  # One element per line, then slurped by jq into a single array.
  write_to_json "System Pods" "Info" \
    "$(printf '%s\n' "${system_pods_info[@]}" | jq -s '.')"
}
#######################################
# Report the systemd state of the key OKD services on the local host.
# Outputs:   status, active state, memory and task count per service
# Report:    one "Service <name>" entry per service (via check_status) and a
#            "Key Services" entry with an array of service objects
#######################################
check_key_services() {
  print_message "Detailed verification of key services"
  local service status service_details active_status memory_usage tasks service_info
  local -a services_info=()
  for service in atomic-openshift-master atomic-openshift-node docker; do
    echo "Service: $service"
    # systemctl exits non-zero for a unit that is not running; that is a
    # finding to report, not a reason to abort the whole script under set -e.
    status=$(systemctl is-active "$service") || true
    echo -n "Status: "
    check_status "$status" "Service $service"
    service_details=$(systemctl status "$service" --no-pager) || true
    # awk yields an empty string when a field is absent from the output,
    # where a non-matching grep would fail the pipeline under pipefail.
    active_status=$(awk '/^ *Active: / { sub(/^ *Active: /, ""); print; exit }' <<< "$service_details")
    memory_usage=$(awk '/^ *Memory: / { sub(/^ *Memory: /, ""); print; exit }' <<< "$service_details")
    tasks=$(awk '/^ *Tasks: / { sub(/^ *Tasks: /, ""); print; exit }' <<< "$service_details")
    echo "Active: $active_status"
    echo "Memory: $memory_usage"
    echo "Tasks: $tasks"
    echo ""
    service_info=$(jq -n \
      --arg name "$service" \
      --arg status "$status" \
      --arg active "$active_status" \
      --arg memory "$memory_usage" \
      --arg tasks "$tasks" \
      '{name: $name, status: $status, active: $active, memory: $memory, tasks: $tasks}')
    services_info+=("$service_info")
  done
  write_to_json "Key Services" "Info" \
    "$(printf '%s\n' "${services_info[@]}" | jq -s '.')"
}
#######################################
# Check etcd cluster health and list its members.
# Globals:   ETCD_CONF_FILE (read, optional) - etcd configuration file to
#            source, default /etc/etcd/etcd.conf
#            ETCD_PEER_CERT_FILE, ETCD_PEER_KEY_FILE, ETCD_LISTEN_CLIENT_URLS
#            (set by sourcing the configuration file)
#            RED, NC (read)
# Outputs:   etcdctl output on stdout
# Report:    "etcd" entry with status "Info", or "Problem" when the
#            configuration is unreadable or an etcdctl command fails
#######################################
check_etcd() {
  print_message "Detailed verification of etcd"
  local etcd_conf="${ETCD_CONF_FILE:-/etc/etcd/etcd.conf}"
  local etcd_health etcd_members etcd_info
  local etcd_status="Info"
  local -a etcdctl_opts

  if [[ ! -r "$etcd_conf" ]]; then
    echo -e "${RED}etcd configuration $etcd_conf is not readable${NC}" >&2
    write_to_json "etcd" "Problem" \
      "$(jq -n --arg file "$etcd_conf" '{error: "etcd configuration file is not readable", file: $file}')"
    echo ""
    return 0
  fi
  # shellcheck source=/dev/null
  source "$etcd_conf"

  etcdctl_opts=(
    --cert-file="$ETCD_PEER_CERT_FILE"
    --key-file="$ETCD_PEER_KEY_FILE"
    --ca-file=/etc/etcd/ca.crt
    --endpoints="$ETCD_LISTEN_CLIENT_URLS"
  )
  # A failing etcdctl is recorded as a problem instead of aborting the run
  # under set -e; whatever it printed is still captured and reported.
  if ! etcd_health=$(etcdctl "${etcdctl_opts[@]}" cluster-health); then
    etcd_status="Problem"
  fi
  echo "$etcd_health"
  if ! etcd_members=$(etcdctl "${etcdctl_opts[@]}" member list); then
    etcd_status="Problem"
  fi
  echo "etcd members:"
  echo "$etcd_members"

  etcd_info=$(jq -n \
    --arg health "$etcd_health" \
    --arg members "$etcd_members" \
    '{health: $health, members: $members}')
  write_to_json "etcd" "$etcd_status" "$etcd_info"
  echo ""
}
#######################################
# Report the most recent error-level journal entries of the master service.
# Outputs:   the error lines (possibly none) on stdout
# Report:    "Master Logs" entry: "OK" when there are no error lines,
#            otherwise "Warning" with the lines as a JSON array of strings
#######################################
check_master_logs() {
  print_message "Checking master logs (last errors)"
  local raw_log errors
  raw_log=$(journalctl -u atomic-openshift-master -p err --no-pager -q -n 50)
  # journalctl can print informational lines of the form "-- ... --" (for
  # example "-- No entries --"). They are not errors, so they are removed;
  # otherwise the output would never be empty.
  errors=$(sed '/^-- .* --$/d' <<< "$raw_log")
  if [[ -z "$errors" ]]; then
    write_to_json "Master Logs" "OK" '"No recent errors"'
  else
    # Empty strings (e.g. from the trailing newline) are dropped.
    write_to_json "Master Logs" "Warning" \
      "$(jq -R -s -c 'split("\n") | map(select(length > 0))' <<< "$errors")"
  fi
  echo "$errors"
  echo ""
}
#######################################
# Convert whitespace-separated table rows read from stdin into a JSON array.
# Arguments: $1 - space-separated JSON key names
#            $2 - space-separated 1-based column numbers, one per key
# Outputs:   compact JSON array of objects ("[]" when there are no rows)
#######################################
_resource_rows_to_json() {
  awk -v cols="$2" '
    NF {
      n = split(cols, col, " ")
      row = ""
      for (i = 1; i <= n; i++) {
        row = row (i > 1 ? "\t" : "") $(col[i])
      }
      print row
    }' \
    | jq -R -n -c --arg keys "$1" '
        ($keys | split(" ")) as $names
        | [inputs
           | split("\t") as $cells
           | reduce range(0; $names | length) as $i ({}; .[$names[$i]] = $cells[$i])]'
}

#######################################
# Show the top resource consumers: 5 nodes and 10 pods, by CPU and by memory.
# Outputs:   the four rankings on stdout (data rows only, highest first)
# Report:    "Resource Usage" entry with node_cpu, node_memory, pod_cpu and
#            pod_memory arrays
#######################################
check_resource_usage() {
  print_message "Detailed verification of resource usage"
  local node_rows pod_rows
  local cpu_usage mem_usage pod_cpu_usage pod_mem_usage resource_usage

  # Each table is queried once, and its header row is removed BEFORE sorting
  # so that it can neither be ranked nor push a real row out of the result.
  node_rows=$(oc adm top nodes | awk 'NR > 1 && NF')
  pod_rows=$(oc adm top pods --all-namespaces | awk 'NR > 1 && NF')

  # awk (not head) takes the first N rows: it reads all of sort's output, so
  # sort cannot be terminated by SIGPIPE and fail the pipeline under pipefail.
  echo "Top 5 nodes by CPU usage:"
  cpu_usage=$(sort -k3,3 -rn <<< "$node_rows" | awk 'NF && ++n <= 5')
  echo "$cpu_usage"
  echo "Top 5 nodes by memory usage:"
  mem_usage=$(sort -k5,5 -rn <<< "$node_rows" | awk 'NF && ++n <= 5')
  echo "$mem_usage"
  echo "Top 10 pods by CPU usage:"
  pod_cpu_usage=$(sort -k3,3 -rn <<< "$pod_rows" | awk 'NF && ++n <= 10')
  echo "$pod_cpu_usage"
  echo "Top 10 pods by memory usage:"
  pod_mem_usage=$(sort -k4,4 -rn <<< "$pod_rows" | awk 'NF && ++n <= 10')
  echo "$pod_mem_usage"

  # Each ranking becomes one JSON array, so every --argjson value is a single
  # JSON document.
  resource_usage=$(jq -n \
    --argjson node_cpu "$(_resource_rows_to_json 'node cpu cpu_percent' '1 2 3' <<< "$cpu_usage")" \
    --argjson node_mem "$(_resource_rows_to_json 'node memory memory_percent' '1 4 5' <<< "$mem_usage")" \
    --argjson pod_cpu "$(_resource_rows_to_json 'namespace pod cpu' '1 2 3' <<< "$pod_cpu_usage")" \
    --argjson pod_mem "$(_resource_rows_to_json 'namespace pod memory' '1 2 4' <<< "$pod_mem_usage")" \
    '{node_cpu: $node_cpu, node_memory: $node_mem, pod_cpu: $pod_cpu, pod_memory: $pod_mem}')
  write_to_json "Resource Usage" "Info" "$resource_usage"
  echo ""
}
#######################################
# Describe the router deployment config and list the router pods.
# Outputs:   `oc describe` output and one status line per router pod
# Report:    "Router" entry whose details are an array of pod objects
#            (empty array when no router pod exists)
#######################################
check_router() {
  print_message "Detailed verification of the router"
  local router_dc router_status
  router_dc=$(oc describe dc/router -n default)
  echo "$router_dc"
  echo ""
  # Collected into one array so the details are a single JSON document for
  # zero, one or many pods.
  router_status=$(oc get pods -n default -l router=router -o json \
    | jq '[.items[] | {name: .metadata.name, status: .status.phase, restarts: .status.containerStatuses[0].restartCount}]')
  echo "Router status:"
  jq -r '.[] | .name + " - Status: " + .status + ", Restarts: " + (.restarts | tostring)' <<< "$router_status"
  write_to_json "Router" "Info" "$router_status"
  echo ""
}
#######################################
# Describe the docker-registry deployment config and list the registry pods.
# Outputs:   `oc describe` output and one status line per registry pod
# Report:    "Registry" entry whose details are an array of pod objects
#            (empty array when no registry pod exists)
#######################################
check_registry() {
  print_message "Detailed verification of the registry"
  local registry_dc registry_status
  registry_dc=$(oc describe dc/docker-registry -n default)
  echo "$registry_dc"
  echo ""
  # Collected into one array so the details are a single JSON document for
  # zero, one or many pods.
  registry_status=$(oc get pods -n default -l deploymentconfig=docker-registry -o json \
    | jq '[.items[] | {name: .metadata.name, status: .status.phase, restarts: .status.containerStatuses[0].restartCount}]')
  echo "Registry status:"
  jq -r '.[] | .name + " - Status: " + .status + ", Restarts: " + (.restarts | tostring)' <<< "$registry_status"
  write_to_json "Registry" "Info" "$registry_status"
  echo ""
}
#######################################
# List resource quotas and limit ranges of all namespaces.
# Outputs:   one line per quota and per limit range on stdout
# Report:    "Quotas and Limits" entry containing both API lists
#######################################
check_quotas_and_limits() {
  print_message "Verification of quotas and limits"
  local quotas limitranges quotas_and_limits

  quotas=$(oc get quota --all-namespaces -o json)
  echo "Cluster Quotas:"
  jq -r '.items[] | "Namespace: \(.metadata.namespace), Name: \(.metadata.name), Used: \(.status.used), Hard: \(.status.hard)"' <<< "$quotas"

  limitranges=$(oc get limitrange --all-namespaces -o json)
  echo "Limit Ranges:"
  jq -r '.items[] | "Namespace: \(.metadata.namespace), Name: \(.metadata.name), Limits: \(.spec.limits)"' <<< "$limitranges"

  # The two lists are fed to jq on stdin rather than as command-line
  # arguments, so their size is not capped by the operating system's limit on
  # argument length.
  quotas_and_limits=$(printf '%s\n' "$quotas" "$limitranges" \
    | jq -s '{quotas: .[0], limitranges: .[1]}')
  write_to_json "Quotas and Limits" "Info" "$quotas_and_limits"
  echo ""
}
#######################################
# List persistent volumes and persistent volume claims.
# Outputs:   one line per volume and per claim on stdout
# Report:    "Persistent Storage" entry containing both API lists
#######################################
check_storage() {
  print_message "Verification of persistent storage"
  local pv_status pvc_status storage_info

  pv_status=$(oc get pv -o json)
  pvc_status=$(oc get pvc --all-namespaces -o json)
  echo "Persistent Volumes:"
  jq -r '.items[] | "Name: \(.metadata.name), Capacity: \(.spec.capacity.storage), Status: \(.status.phase)"' <<< "$pv_status"
  echo ""
  echo "Persistent Volume Claims:"
  jq -r '.items[] | "Namespace: \(.metadata.namespace), Name: \(.metadata.name), Status: \(.status.phase), Volume: \(.spec.volumeName)"' <<< "$pvc_status"

  # The two lists are fed to jq on stdin rather than as command-line
  # arguments, so their size is not capped by the operating system's limit on
  # argument length.
  storage_info=$(printf '%s\n' "$pv_status" "$pvc_status" \
    | jq -s '{persistent_volumes: .[0], persistent_volume_claims: .[1]}')
  write_to_json "Persistent Storage" "Info" "$storage_info"
  echo ""
}
#######################################
# Count pending certificate signing requests and check the expiry date of the
# main cluster certificates on the local host.
# Globals:   RED, YELLOW, GREEN, NC (read)
# Outputs:   CSR count and, per certificate, expiry date, days left and an
#            alert (< 30 days), warning (< 90 days) or OK line
# Report:    "Certificates" entry with `pending_csr` (array of CSR objects)
#            and `certificates` (array, one object per certificate file)
#######################################
check_certificates() {
  print_message "Detailed verification of certificates"
  local pending_csr pending_csr_count certificates_data
  local i expiry_date expiry_epoch current_epoch days_left cert_status
  local -a cert_info=()
  local -a cert_files=(
    "/etc/origin/master/master.server.crt"
    "/etc/origin/master/etcd.server.crt"
    "/etc/origin/master/ca.crt"
    "/etc/origin/node/server.crt"
  )
  local -a cert_names=(
    "Master Server"
    "etcd Server"
    "Cluster CA"
    "Node Server"
  )

  # A CSR counts as pending while it carries no condition (neither approved
  # nor denied); its status may be missing or an empty object. The matches
  # are collected into ONE array so counting and embedding both work for
  # zero, one or many requests.
  pending_csr=$(oc get csr -o json \
    | jq -c '[.items[] | select((.status.conditions // []) | length == 0)]')
  pending_csr_count=$(jq 'length' <<< "$pending_csr")
  echo "Pending Certificate Signing Requests (CSRs): $pending_csr_count"

  for i in "${!cert_files[@]}"; do
    if [[ -f "${cert_files[i]}" ]]; then
      expiry_date=$(openssl x509 -in "${cert_files[i]}" -noout -enddate | cut -d= -f2)
      expiry_epoch=$(date -d "$expiry_date" +%s)
      current_epoch=$(date +%s)
      days_left=$(( (expiry_epoch - current_epoch) / 86400 ))
      echo "Certificate: ${cert_names[i]}"
      echo "Expires on: $expiry_date"
      echo "Days left: $days_left"
      cert_status="OK"
      if (( days_left < 30 )); then
        echo -e "${RED}ALERT: Certificate expires in less than 30 days!${NC}"
        cert_status="Critical"
      elif (( days_left < 90 )); then
        echo -e "${YELLOW}WARNING: Certificate expires in less than 90 days.${NC}"
        cert_status="Warning"
      else
        echo -e "${GREEN}OK: Certificate is valid for more than 90 days.${NC}"
      fi
      cert_info+=("$(jq -n \
        --arg name "${cert_names[i]}" \
        --arg expiry "$expiry_date" \
        --arg days_left "$days_left" \
        --arg status "$cert_status" \
        '{name: $name, expiry: $expiry, days_left: ($days_left | tonumber), status: $status}')")
      echo ""
    else
      echo "File ${cert_files[i]} does not exist."
      cert_info+=("$(jq -n \
        --arg name "${cert_names[i]}" \
        '{name: $name, status: "Error", message: "File not found"}')")
    fi
  done

  # First document on stdin: the pending CSR array; the remaining documents:
  # one object per certificate.
  certificates_data=$({
    printf '%s\n' "$pending_csr"
    printf '%s\n' "${cert_info[@]}"
  } | jq -s '{pending_csr: .[0], certificates: .[1:]}')
  write_to_json "Certificates" "Info" "$certificates_data"
}
#######################################
# Render the JSON report as an HTML table.
# Every value taken from the report is HTML-escaped (jq's @html), so text
# such as log lines cannot break or inject markup into the page.
# Globals:   JSON_FILE (read), HTML_FILE (overwritten)
#######################################
generate_html_report() {
  {
    cat << 'EOF'
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>OKD Cluster Health Report</title>
<style>
body { font-family: Arial, sans-serif; line-height: 1.6; color: #333; max-width: 800px; margin: 0 auto; padding: 20px; }
h1 { color: #2c3e50; }
table { width: 100%; border-collapse: collapse; margin-bottom: 20px; }
th, td { padding: 12px; text-align: left; border-bottom: 1px solid #ddd; }
th { background-color: #f2f2f2; }
.ok { color: green; }
.problem { color: red; }
.warning { color: orange; }
.info { color: blue; }
</style>
</head>
<body>
<h1>OKD Cluster Health Report</h1>
<table>
<tr>
<th>Component</th>
<th>Status</th>
<th>Details</th>
</tr>
EOF
    # In an @html format string only the interpolated values are escaped; the
    # literal markup is emitted unchanged.
    jq -r '.[] | @html "<tr><td>\(.component)</td><td class=\"\(.status | ascii_downcase)\">\(.status)</td><td>\(.details)</td></tr>"' "$JSON_FILE"
    echo "</table></body></html>"
  } > "$HTML_FILE"
}
#######################################
# Entry point: run every health check in order, then build the HTML report.
# Globals:   JSON_FILE, HTML_FILE (written)
# Outputs:   the output of each check, followed by the report locations
#######################################
main() {
  # Checks in execution order.
  local -a _health_checks=(
    check_prerequisites
    check_nodes
    check_node_readiness
    check_system_pods
    check_key_services
    check_etcd
    check_master_logs
    check_resource_usage
    check_router
    check_registry
    check_quotas_and_limits
    check_storage
    check_certificates
  )
  local _health_check

  # Start from an empty JSON array that the checks append to.
  printf '%s\n' '[]' > "$JSON_FILE"

  for _health_check in "${_health_checks[@]}"; do
    "$_health_check"
  done

  generate_html_report

  printf '%s\n' \
    "JSON report generated: $JSON_FILE" \
    "HTML report generated: $HTML_FILE"

  # Restrict the report files to their owner.
  chmod 600 "$JSON_FILE" "$HTML_FILE"
}
# Run the main function
main
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment