Created
May 12, 2026 12:28
-
-
Save mcornea/095d99b0d5b383d3dcd0f9cf858189c2 to your computer and use it in GitHub Desktop.
run_karpenter_pprof_analysis
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/bin/bash | |
| # ============================================================================= | |
| # Karpenter / kube-apiserver / etcd — pprof Continuous Capture | |
| # ============================================================================= | |
| # Usage: ./run_karpenter_pprof_analysis.sh [OPTIONS] | |
| # | |
| # Captures CPU and heap pprof profiles from karpenter, kube-apiserver, and etcd | |
| # every interval (default 60s) until stopped with Ctrl+C. | |
| # On exit, produces a summary analysis of all captured data. | |
| # | |
| # This is for ROSA HCP clusters where karpenter, etcd, and kube-apiserver all | |
| # run in the management cluster's HCP namespace, and kube-apiserver is accessed | |
| # via the hosted cluster kubeconfig. | |
| # | |
| # Options: | |
| # --output-dir DIR Output directory (default: ./pprof-analysis-YYYYMMDD-HHMMSS) | |
| # --interval SECS Capture interval in seconds (default: 60) | |
| # --cpu-duration SECS CPU profile duration in seconds (default: 30) | |
| # --mc-kubeconfig PATH Management cluster kubeconfig | |
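| # --hc-kubeconfig PATH Hosted cluster kubeconfig | |
| # --max-captures N Stop after N captures and run the analysis (default: 0 = run until Ctrl+C) | |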
| # | |
| # Requirements: | |
| # - oc | |
| # - curl (on the local machine; used to fetch karpenter profiles through the port-forward) | |
| # - go (optional, for pprof analysis at the end) | |
| # ============================================================================= | |
set -uo pipefail

# --------------- Configuration ---------------
# Defaults. Every value can be overridden on the command line; the two
# kubeconfig defaults can additionally be supplied through the MC_KUBECONFIG /
# HC_KUBECONFIG environment variables.
OUTPUT_DIR=""
CAPTURE_INTERVAL=60
CPU_DURATION=30
MAX_CAPTURES=0
MC_KUBECONFIG="${MC_KUBECONFIG:-/home/marius/rosa-create/autonode/mc_kubeconfig}"
HC_KUBECONFIG="${HC_KUBECONFIG:-/home/marius/rosa-create/autonode/hc_kubeconfig}"

#######################################
# Parse command-line options into the configuration globals.
# Globals:   OUTPUT_DIR, CAPTURE_INTERVAL, CPU_DURATION, MAX_CAPTURES,
#            MC_KUBECONFIG, HC_KUBECONFIG (written)
# Arguments: the script's command-line arguments
# Exits:     1 on an unknown option, a missing option value, or a
#            non-numeric value for a numeric option
#######################################
parse_args() {
  local opt name
  while [[ $# -gt 0 ]]; do
    opt="$1"
    case "$opt" in
      --output-dir | --interval | --cpu-duration | --max-captures | \
        --mc-kubeconfig | --hc-kubeconfig)
        # Report a missing value explicitly instead of tripping `set -u`.
        if [[ $# -lt 2 ]]; then
          echo "ERROR: option $opt requires a value" >&2
          exit 1
        fi
        ;;
      *)
        echo "Unknown option: $1" >&2
        exit 1
        ;;
    esac
    case "$opt" in
      --output-dir) OUTPUT_DIR="$2" ;;
      --interval) CAPTURE_INTERVAL="$2" ;;
      --cpu-duration) CPU_DURATION="$2" ;;
      --max-captures) MAX_CAPTURES="$2" ;;
      --mc-kubeconfig) MC_KUBECONFIG="$2" ;;
      --hc-kubeconfig) HC_KUBECONFIG="$2" ;;
    esac
    shift 2
  done

  # These values are later used in arithmetic contexts, so only plain
  # non-negative integers are accepted.
  for name in CAPTURE_INTERVAL CPU_DURATION MAX_CAPTURES; do
    if [[ ! "${!name}" =~ ^[0-9]+$ ]]; then
      echo "ERROR: $name must be a non-negative integer (got '${!name}')" >&2
      exit 1
    fi
    # Force base 10 so a value such as "010" is not read as octal later on.
    printf -v "$name" '%d' "$((10#${!name}))"
  done
}

parse_args "$@"
OUTPUT_DIR="${OUTPUT_DIR:-$(pwd)/pprof-analysis-$(date +%Y%m%d-%H%M%S)}"

# Shorthand for oc with the right kubeconfig
mc_oc() { oc --kubeconfig="$MC_KUBECONFIG" "$@"; }
hc_oc() { oc --kubeconfig="$HC_KUBECONFIG" "$@"; }
# --------------- Preflight checks ---------------
# Print the banner, then confirm that the oc client is installed and that both
# kubeconfigs can answer `oc whoami` before any capture work starts.
printf '%s\n' \
  "==============================================" \
  " Karpenter/KAS/etcd pprof Continuous Capture" \
  "==============================================" \
  ""

command -v oc &>/dev/null || {
  echo "ERROR: 'oc' not found in PATH" >&2
  exit 1
}

echo "[*] Verifying kubeconfigs..."

# Management cluster credentials
mc_oc whoami &>/dev/null || {
  echo "ERROR: MC kubeconfig failed (${MC_KUBECONFIG})" >&2
  exit 1
}
MC_USER=$(mc_oc whoami 2>/dev/null)
echo " MC: logged in as $MC_USER"

# Hosted cluster credentials
hc_oc whoami &>/dev/null || {
  echo "ERROR: HC kubeconfig failed (${HC_KUBECONFIG})" >&2
  exit 1
}
HC_USER=$(hc_oc whoami 2>/dev/null)
echo " HC: logged in as $HC_USER"
# --------------- Discover HCP namespace ---------------
echo ""
echo "[*] Discovering HCP namespace..."
# An HCP_NS value exported by the caller wins. Otherwise the namespace is taken
# from the karpenter pods visible through the management cluster kubeconfig.
HCP_NS="${HCP_NS:-}"
if [[ -z "$HCP_NS" ]]; then
  mapfile -t HCP_NS_CANDIDATES < <(
    mc_oc get pods -A -l app=karpenter --no-headers \
      -o custom-columns=NS:.metadata.namespace 2>/dev/null | sort -u
  )
  if [[ ${#HCP_NS_CANDIDATES[@]} -eq 0 ]]; then
    echo "ERROR: Could not find karpenter pod / HCP namespace" >&2
    exit 1
  fi
  HCP_NS="${HCP_NS_CANDIDATES[0]}"
  # Several namespaces match when more than one namespace runs karpenter pods;
  # say which one was picked instead of choosing silently.
  if [[ ${#HCP_NS_CANDIDATES[@]} -gt 1 ]]; then
    echo "WARNING: karpenter pods found in ${#HCP_NS_CANDIDATES[@]} namespaces (${HCP_NS_CANDIDATES[*]}); using $HCP_NS. Export HCP_NS to select another." >&2
  fi
fi
echo " HCP namespace: $HCP_NS"

# --------------- Discover pods ---------------
echo ""
echo "[*] Discovering pods..."

# Karpenter (first pod carrying the app=karpenter label)
KARPENTER_POD=$(mc_oc get pods -n "$HCP_NS" -l app=karpenter --no-headers \
  -o custom-columns=NAME:.metadata.name 2>/dev/null | head -1)
if [[ -z "$KARPENTER_POD" ]]; then
  echo "ERROR: No karpenter pod found" >&2
  exit 1
fi
echo " Karpenter: $KARPENTER_POD"

# etcd (sorted so the etcd_<index> file names stay stable within a run)
mapfile -t ETCD_PODS < <(
  mc_oc get pods -n "$HCP_NS" -l app=etcd --no-headers \
    -o custom-columns=NAME:.metadata.name 2>/dev/null | sort
)
if [[ ${#ETCD_PODS[@]} -eq 0 ]]; then
  echo "ERROR: No etcd pods found" >&2
  exit 1
fi
echo " etcd: ${ETCD_PODS[*]}"

# kube-apiserver
mapfile -t KAS_PODS < <(
  mc_oc get pods -n "$HCP_NS" -l app=kube-apiserver --no-headers \
    -o custom-columns=NAME:.metadata.name 2>/dev/null | sort
)
if [[ ${#KAS_PODS[@]} -eq 0 ]]; then
  echo "ERROR: No kube-apiserver pods found" >&2
  exit 1
fi
echo " kube-apiserver: ${KAS_PODS[*]}"
# --------------- etcd TLS cert paths ---------------
# Client certificate paths handed to the curl that runs inside the etcd
# container (via `oc exec`), so they are resolved in that container.
ETCD_CACERT="/etc/etcd/tls/etcd-ca/ca.crt"
ETCD_CERT="/etc/etcd/tls/client/etcd-client.crt"
ETCD_KEY="/etc/etcd/tls/client/etcd-client.key"

# --------------- Verify pprof endpoints ---------------
echo ""
echo "[*] Verifying pprof endpoints..."

# etcd: probe the heap endpoint on the first etcd pod and keep only the HTTP
# status code. stderr is folded into the captured value, so a failure message
# ends up in the error line below.
FIRST_ETCD="${ETCD_PODS[0]}"
ETCD_PROBE=(
  curl -s -o /dev/null -w '%{http_code}'
  --cacert "$ETCD_CACERT" --cert "$ETCD_CERT" --key "$ETCD_KEY"
  https://localhost:2379/debug/pprof/heap
)
ETCD_HTTP=$(mc_oc exec -n "$HCP_NS" "$FIRST_ETCD" -c etcd -- "${ETCD_PROBE[@]}" 2>&1)
[[ "$ETCD_HTTP" == "200" ]] || {
  echo "ERROR: etcd pprof returned HTTP $ETCD_HTTP" >&2
  exit 1
}
echo " etcd pprof: OK (HTTP 200)"

# kube-apiserver: only the byte count is kept (wc -c), which avoids storing
# binary data containing null bytes in a shell variable.
KAS_SIZE=$(hc_oc get --raw /debug/pprof/heap 2>/dev/null | wc -c)
if ((KAS_SIZE < 100)); then
  echo "ERROR: kube-apiserver pprof not accessible (got $KAS_SIZE bytes)" >&2
  exit 1
fi
echo " kube-apiserver pprof: OK ($KAS_SIZE bytes)"
# --------------- Enable karpenter profiling ---------------
# NOTE: when --enable-profiling is missing, this section PATCHES the karpenter
# deployment (which restarts the pod); the script never reverts that change.
echo ""
echo "[*] Checking karpenter profiling..."
KARPENTER_ARGS=$(mc_oc get deployment karpenter -n "$HCP_NS" \
  -o jsonpath='{.spec.template.spec.containers[0].args}' 2>/dev/null)
PPROF_LOCAL_PORT=16060
if grep -q -- '--enable-profiling' <<<"$KARPENTER_ARGS"; then
  echo " --enable-profiling already set"
else
  echo " Patching karpenter deployment to add --enable-profiling..."
  if mc_oc patch deployment karpenter -n "$HCP_NS" --type=json \
    -p '[{"op":"add","path":"/spec/template/spec/containers/0/args/-","value":"--enable-profiling"}]'; then
    echo " Waiting for rollout..."
    mc_oc rollout status deployment/karpenter -n "$HCP_NS" --timeout=120s ||
      echo "WARNING: karpenter rollout did not complete within 120s" >&2
    # Re-discover the pod name after the rollout. Taking the most recently
    # created pod avoids selecting the old pod while it is still listed.
    NEW_KARPENTER_POD=$(mc_oc get pods -n "$HCP_NS" -l app=karpenter \
      --sort-by=.metadata.creationTimestamp --no-headers \
      -o custom-columns=NAME:.metadata.name 2>/dev/null | tail -1)
    # Keep the previously discovered name if the lookup returns nothing.
    if [[ -n "$NEW_KARPENTER_POD" ]]; then
      KARPENTER_POD="$NEW_KARPENTER_POD"
    fi
    echo " New karpenter pod: $KARPENTER_POD"
  else
    # Karpenter capture is best-effort: etcd and kube-apiserver profiles can
    # still be collected, so a failed patch is a warning, not an abort.
    echo "WARNING: could not patch the karpenter deployment; karpenter profiles may be unavailable" >&2
  fi
fi

# Verify karpenter pprof via port-forward (container may not have curl)
mc_oc port-forward -n "$HCP_NS" "pod/$KARPENTER_POD" "${PPROF_LOCAL_PORT}:8080" &>/dev/null &
VALIDATE_PF_PID=$!
# Poll instead of a single fixed sleep: the port-forward needs a moment to
# start listening, and how long that takes varies.
KARP_HTTP="000"
for _ in 1 2 3 4 5 6 7 8 9 10; do
  KARP_HTTP=$(curl -s -o /dev/null --max-time 5 -w '%{http_code}' \
    "http://localhost:${PPROF_LOCAL_PORT}/debug/pprof/heap" 2>/dev/null)
  [[ "$KARP_HTTP" == "200" ]] && break
  sleep 1
done
kill "$VALIDATE_PF_PID" 2>/dev/null
wait "$VALIDATE_PF_PID" 2>/dev/null
if [[ "$KARP_HTTP" != "200" ]]; then
  echo "WARNING: karpenter pprof returned HTTP $KARP_HTTP (profiling may not be enabled yet)" >&2
else
  echo " karpenter pprof: OK (HTTP 200)"
fi
# --------------- Create output directories ---------------
echo ""
echo "[*] Output directory: $OUTPUT_DIR"
# Abort early when the directory cannot be created: the capture functions
# discard their errors, so a missing directory would otherwise go unnoticed.
# The :? guard refuses an empty OUTPUT_DIR (which would resolve to /profiles).
if ! mkdir -p -- "${OUTPUT_DIR:?output directory is not set}/profiles"; then
  echo "ERROR: cannot create output directory $OUTPUT_DIR/profiles" >&2
  exit 1
fi

# Save metadata describing this capture run
cat > "$OUTPUT_DIR/metadata.txt" <<METADATA
Date: $(date -u '+%Y-%m-%dT%H:%M:%SZ')
HCP Namespace: $HCP_NS
MC Kubeconfig: $MC_KUBECONFIG
HC Kubeconfig: $HC_KUBECONFIG
Karpenter Pod: $KARPENTER_POD
etcd Pods: ${ETCD_PODS[*]}
KAS Pods: ${KAS_PODS[*]}
Capture Interval: ${CAPTURE_INTERVAL}s
CPU Profile Duration: ${CPU_DURATION}s
Script: $0
METADATA
# =============================================================================
# Capture functions
# =============================================================================

#######################################
# Run a command and store its stdout as a profile file.
# Output goes to "<dest>.part" first and is moved into place only when the
# command succeeded and produced data, so a failed download never leaves an
# empty or truncated .pb.gz behind for the analysis step to pick up.
# Arguments: $1 - destination file; $2... - command (and arguments) to run
# Returns:   0 when the profile was saved, 1 otherwise
#######################################
_save_profile() {
  local dest="$1"
  shift
  local part="${dest}.part"
  if "$@" > "$part" 2>/dev/null && [[ -s "$part" ]]; then
    mv -f -- "$part" "$dest"
  else
    rm -f -- "$part"
    return 1
  fi
}

#######################################
# Capture karpenter heap and CPU profiles through a temporary port-forward.
# Globals:   HCP_NS, KARPENTER_POD, PPROF_LOCAL_PORT, CPU_DURATION (read)
# Arguments: $1 - output directory; $2 - capture label used in file names
# Outputs:   karpenter_heap_<label>.pb.gz and karpenter_cpu_<label>.pb.gz
#######################################
capture_karpenter() {
  local outdir="$1" label="$2"
  local base="http://localhost:${PPROF_LOCAL_PORT}/debug/pprof"

  # Start port-forward for this capture round (container may not have curl)
  mc_oc port-forward -n "$HCP_NS" "pod/$KARPENTER_POD" "${PPROF_LOCAL_PORT}:8080" &>/dev/null &
  local pf_pid=$!
  sleep 2

  # -f makes curl fail on HTTP errors instead of saving the error body.
  echo " [karpenter] heap..."
  _save_profile "${outdir}/karpenter_heap_${label}.pb.gz" \
    curl -sf "${base}/heap" ||
    echo " [karpenter] heap capture failed" >&2

  echo " [karpenter] cpu (${CPU_DURATION}s)..."
  _save_profile "${outdir}/karpenter_cpu_${label}.pb.gz" \
    curl -sf --max-time "$((CPU_DURATION + 10))" \
    "${base}/profile?seconds=${CPU_DURATION}" ||
    echo " [karpenter] cpu capture failed" >&2

  kill "$pf_pid" 2>/dev/null
  wait "$pf_pid" 2>/dev/null
}

#######################################
# Capture heap and CPU profiles from every etcd pod by running curl inside
# the etcd container.
# Globals:   ETCD_PODS, HCP_NS, ETCD_CACERT, ETCD_CERT, ETCD_KEY,
#            CPU_DURATION (read)
# Arguments: $1 - output directory; $2 - capture label used in file names
# Outputs:   etcd_<index>_heap_<label>.pb.gz and etcd_<index>_cpu_<label>.pb.gz
#######################################
capture_etcd() {
  local outdir="$1" label="$2"
  local idx pod
  for idx in "${!ETCD_PODS[@]}"; do
    pod="${ETCD_PODS[$idx]}"

    echo " [etcd-${idx}] heap..."
    _save_profile "${outdir}/etcd_${idx}_heap_${label}.pb.gz" \
      mc_oc exec -n "$HCP_NS" "$pod" -c etcd -- \
      curl -sf --cacert "$ETCD_CACERT" --cert "$ETCD_CERT" --key "$ETCD_KEY" \
      https://localhost:2379/debug/pprof/heap ||
      echo " [etcd-${idx}] heap capture failed" >&2

    # Same timeout rule as the karpenter CPU capture, so a stuck request
    # cannot block the capture round indefinitely.
    echo " [etcd-${idx}] cpu (${CPU_DURATION}s)..."
    _save_profile "${outdir}/etcd_${idx}_cpu_${label}.pb.gz" \
      mc_oc exec -n "$HCP_NS" "$pod" -c etcd -- \
      curl -sf --max-time "$((CPU_DURATION + 10))" \
      --cacert "$ETCD_CACERT" --cert "$ETCD_CERT" --key "$ETCD_KEY" \
      "https://localhost:2379/debug/pprof/profile?seconds=${CPU_DURATION}" ||
      echo " [etcd-${idx}] cpu capture failed" >&2
  done
}

#######################################
# Capture kube-apiserver heap and CPU profiles through the hosted cluster
# kubeconfig (`oc get --raw`).
# Globals:   CPU_DURATION (read)
# Arguments: $1 - output directory; $2 - capture label used in file names
# Outputs:   kas_heap_<label>.pb.gz and kas_cpu_<label>.pb.gz
#######################################
capture_kas() {
  local outdir="$1" label="$2"

  echo " [kas] heap..."
  _save_profile "${outdir}/kas_heap_${label}.pb.gz" \
    hc_oc get --raw /debug/pprof/heap ||
    echo " [kas] heap capture failed" >&2

  echo " [kas] cpu (${CPU_DURATION}s)..."
  _save_profile "${outdir}/kas_cpu_${label}.pb.gz" \
    hc_oc get --raw "/debug/pprof/profile?seconds=${CPU_DURATION}" ||
    echo " [kas] cpu capture failed" >&2
}
# =============================================================================
# Inline summary helper
# =============================================================================

#######################################
# Print the total in-use heap of a .pb.gz heap profile in MiB (one decimal).
# The total is read from the "Showing nodes accounting for X, N% of Y total"
# line of `go tool pprof -top -inuse_space`; Y may carry a B, kB, MB, GB or TB
# suffix and is converted accordingly.
# Arguments: $1 - path to the heap profile
# Outputs:   the value on stdout, or nothing when the file is missing, `go`
#            is not installed, or the summary line is absent
#######################################
_heap_inuse_mib() {
  local pb="$1"
  [[ -f "$pb" ]] || return 0
  command -v go &>/dev/null || return 0
  go tool pprof -top -inuse_space "$pb" 2>/dev/null | awk '
    /^Showing/ {
      # Example: "Showing nodes accounting for 61.64MB, 100% of 61.64MB total"
      for (i = 1; i < NF; i++) {
        if ($i == "of") {
          val = $(i + 1)
          num = val + 0              # numeric prefix of e.g. "61.64MB"
          unit = val
          sub(/^[0-9.]+/, "", unit)  # what is left is the unit suffix
          if (unit == "B") num = num / 1048576
          else if (unit == "kB") num = num / 1024
          else if (unit == "GB") num = num * 1024
          else if (unit == "TB") num = num * 1048576
          printf "%.1f", num
          exit
        }
      }
    }'
}
#######################################
# Print a one-line summary of the in-use heap sizes for one capture round.
# A component whose size cannot be determined is shown as "?".
# Globals:   ETCD_PODS (read; one etcd-<index> entry per element)
# Arguments: $1 - profiles directory; $2 - capture label
# Outputs:   a single summary line on stdout
#######################################
print_heap_summary() {
  local outdir="$1" label="$2"
  # Everything is local so the caller's variables (e.g. a global `i`) are not
  # overwritten by the loop below.
  local idx mib
  local line=" Heap sizes (inuse): "

  # Karpenter
  mib=$(_heap_inuse_mib "${outdir}/karpenter_heap_${label}.pb.gz")
  line+="karpenter=${mib:-?}MiB "

  # etcd
  for idx in "${!ETCD_PODS[@]}"; do
    mib=$(_heap_inuse_mib "${outdir}/etcd_${idx}_heap_${label}.pb.gz")
    line+="etcd-${idx}=${mib:-?}MiB "
  done

  # KAS
  mib=$(_heap_inuse_mib "${outdir}/kas_heap_${label}.pb.gz")
  line+="kas=${mib:-?}MiB"

  printf '%s\n' "$line"
}
# =============================================================================
# Analysis helpers (run on Ctrl+C exit)
# =============================================================================

#######################################
# Write a text rendering (`go tool pprof -text -inuse_space`) next to every
# captured heap profile.
# Globals:   OUTPUT_DIR (read)
# Outputs:   <profile>.txt files in $OUTPUT_DIR/profiles; progress on stdout
#######################################
generate_heap_texts() {
  echo " Generating heap text dumps from .pb.gz profiles..."
  # Without go every redirect below would only create an empty file.
  if ! command -v go &>/dev/null; then
    echo " SKIP: 'go' not in PATH, cannot generate heap text dumps"
    return 0
  fi

  local pb txt
  for pb in "$OUTPUT_DIR"/profiles/*_heap_*.pb.gz; do
    [[ -f "$pb" ]] || continue
    txt="${pb%.pb.gz}.txt"
    # Skip dumps that already exist; an empty one is treated as missing so a
    # previously failed conversion is retried.
    [[ -s "$txt" ]] && continue
    if ! go tool pprof -text -inuse_space "$pb" > "$txt" 2>/dev/null; then
      rm -f -- "$txt"
    fi
  done
  return 0
}
#######################################
# Build memstats_summary.tsv with one row per captured heap profile.
# Component and capture label are parsed from the file name
# (karpenter_heap_<label>, etcd_<n>_heap_<label>, kas_heap_<label>); files
# that match none of these patterns are ignored. A profile whose size cannot
# be determined is recorded as 0.
# Globals:   OUTPUT_DIR (read)
# Outputs:   $OUTPUT_DIR/memstats_summary.tsv; its path on stdout
#######################################
extract_memstats() {
  local summary="$OUTPUT_DIR/memstats_summary.tsv"
  # All working variables are local so nothing leaks into the script's globals.
  local pb fname component label inuse

  printf 'component\tcapture\tinuse_space_MiB\n' > "$summary"
  for pb in "$OUTPUT_DIR"/profiles/*_heap_*.pb.gz; do
    [[ -f "$pb" ]] || continue
    fname=$(basename "$pb" .pb.gz)
    if [[ "$fname" =~ ^karpenter_heap_(.+)$ ]]; then
      component="karpenter"
      label="${BASH_REMATCH[1]}"
    elif [[ "$fname" =~ ^etcd_([0-9]+)_heap_(.+)$ ]]; then
      component="etcd-${BASH_REMATCH[1]}"
      label="${BASH_REMATCH[2]}"
    elif [[ "$fname" =~ ^kas_heap_(.+)$ ]]; then
      component="kas"
      label="${BASH_REMATCH[1]}"
    else
      continue
    fi
    inuse=$(_heap_inuse_mib "$pb")
    printf '%s\t%s\t%s\n' "$component" "$label" "${inuse:-0}" >> "$summary"
  done
  echo " MemStats summary: $summary"
}
#######################################
# List the component prefixes used in profile file names: karpenter, one
# etcd_<index> per entry of ETCD_PODS, and kas.
# Globals:   ETCD_PODS (read)
# Outputs:   one component per line on stdout
#######################################
_profile_components() {
  local idx
  echo "karpenter"
  for idx in "${!ETCD_PODS[@]}"; do
    echo "etcd_${idx}"
  done
  echo "kas"
}

#######################################
# Find the largest captured profile file of a component.
# File size is used as the "peak" heuristic. It is measured with `wc -c`
# because `stat -c%s` is GNU-specific.
# Globals:   OUTPUT_DIR (read)
# Arguments: $1 - component prefix (e.g. etcd_0); $2 - profile kind (heap|cpu)
# Outputs:   path of the largest non-empty profile on stdout
# Returns:   0 when a profile was found, 1 otherwise
#######################################
_peak_profile() {
  local component="$1" kind="$2"
  local candidate size best="" best_size=0
  for candidate in "$OUTPUT_DIR"/profiles/"${component}_${kind}_"*.pb.gz; do
    [[ -f "$candidate" ]] || continue
    size=$(wc -c < "$candidate" 2>/dev/null) || size=0
    if ((${size:-0} > best_size)); then
      best_size=$((${size:-0}))
      best="$candidate"
    fi
  done
  if [[ -z "$best" ]]; then
    return 1
  fi
  printf '%s\n' "$best"
}

#######################################
# Print a `go tool pprof` top listing for the peak profile of every component.
# Arguments: $1 - profile kind (heap|cpu); $2 - report title;
#            $3 - wording used in the per-component header;
#            $4... - flags passed to `go tool pprof`
# Outputs:   the report on stdout
#######################################
_pprof_top_report() {
  local kind="$1" title="$2" peak_label="$3"
  shift 3
  local -a components
  local component peak
  mapfile -t components < <(_profile_components)

  echo "================================================================================"
  echo " ${title}"
  echo "================================================================================"
  for component in "${components[@]}"; do
    peak=$(_peak_profile "$component" "$kind") || continue
    echo ""
    echo "--- ${component} ${peak_label}: $(basename "$peak") ---"
    echo ""
    go tool pprof "$@" "$peak" 2>&1 || echo "(pprof failed)"
  done
}

#######################################
# Write top_allocations.txt: top heap allocators of each component's peak
# heap profile.
# Globals:   OUTPUT_DIR, ETCD_PODS (read)
# Outputs:   $OUTPUT_DIR/top_allocations.txt; a status line on stdout
#######################################
run_pprof_top_heap() {
  local topfile="$OUTPUT_DIR/top_allocations.txt"
  if ! command -v go &>/dev/null; then
    echo " SKIP: 'go' not in PATH, cannot run pprof top analysis" | tee "$topfile"
    return
  fi
  _pprof_top_report heap \
    "Top Heap Allocators (go tool pprof -top -inuse_space)" "peak" \
    -top -inuse_space > "$topfile" 2>&1
  echo " Top allocations: $topfile"
}

#######################################
# Write top_cpu.txt: top CPU consumers of each component's largest CPU profile.
# Globals:   OUTPUT_DIR, ETCD_PODS (read)
# Outputs:   $OUTPUT_DIR/top_cpu.txt; a status line on stdout
#######################################
run_pprof_top_cpu() {
  local topfile="$OUTPUT_DIR/top_cpu.txt"
  if ! command -v go &>/dev/null; then
    echo " SKIP: 'go' not in PATH, cannot run pprof CPU analysis" | tee "$topfile"
    return
  fi
  _pprof_top_report cpu \
    "Top CPU Consumers (go tool pprof -top)" "peak CPU" \
    -top > "$topfile" 2>&1
  echo " Top CPU: $topfile"
}

#######################################
# Render SVG graphs for each component's peak heap profile (in-use and
# allocated space) and largest CPU profile.
# Globals:   OUTPUT_DIR, ETCD_PODS (read)
# Outputs:   SVG files in $OUTPUT_DIR/svgs; progress on stdout
#######################################
generate_svgs() {
  local svgdir="$OUTPUT_DIR/svgs"
  mkdir -p "$svgdir"
  if ! command -v go &>/dev/null; then
    echo " SKIP: 'go' not in PATH, cannot generate SVGs"
    return
  fi

  local -a components
  local component peak bname
  mapfile -t components < <(_profile_components)
  for component in "${components[@]}"; do
    # Heap SVG — pick the largest (peak) profile. A failed render removes its
    # output so no empty .svg is left behind.
    if peak=$(_peak_profile "$component" heap); then
      bname=$(basename "$peak" .pb.gz)
      echo " [svg] ${bname} heap..."
      go tool pprof -svg -inuse_space "$peak" > "${svgdir}/${bname}_inuse.svg" 2>/dev/null ||
        rm -f -- "${svgdir}/${bname}_inuse.svg"
      go tool pprof -svg -alloc_space "$peak" > "${svgdir}/${bname}_alloc.svg" 2>/dev/null ||
        rm -f -- "${svgdir}/${bname}_alloc.svg"
    fi
    # CPU SVG — pick the largest profile
    if peak=$(_peak_profile "$component" cpu); then
      bname=$(basename "$peak" .pb.gz)
      echo " [svg] ${bname} cpu..."
      go tool pprof -svg "$peak" > "${svgdir}/${bname}.svg" 2>/dev/null ||
        rm -f -- "${svgdir}/${bname}.svg"
    fi
  done
  echo " SVGs: $svgdir"
}
#######################################
# Assemble analysis_report.txt from the run metadata and the previously
# generated summary files, then print where everything is and how to explore
# the profiles interactively.
# Globals:   OUTPUT_DIR, HCP_NS, START_TS, CAPTURE_INTERVAL, CPU_DURATION,
#            KARPENTER_POD, ETCD_PODS, KAS_PODS (read)
# Outputs:   $OUTPUT_DIR/analysis_report.txt; a summary on stdout
#######################################
generate_report() {
  local report="$OUTPUT_DIR/analysis_report.txt"
  local rule="================================================================================"

  # Count karpenter heap profiles with a glob instead of parsing `ls`.
  local f capture_count=0
  for f in "$OUTPUT_DIR"/profiles/karpenter_heap_*.pb.gz; do
    [[ -f "$f" ]] && capture_count=$((capture_count + 1))
  done

  {
    echo "$rule"
    echo " Karpenter / kube-apiserver / etcd — pprof Analysis Report"
    echo "$rule"
    echo ""
    echo "HCP Namespace: $HCP_NS"
    echo "Start: ${START_TS:-unknown}"
    echo "End: $(date -u '+%Y-%m-%dT%H:%M:%SZ')"
    echo "Interval: ${CAPTURE_INTERVAL}s"
    echo "CPU Duration: ${CPU_DURATION}s"
    echo "Captures: $capture_count"
    echo "Karpenter Pod: $KARPENTER_POD"
    echo "etcd Pods: ${ETCD_PODS[*]}"
    echo "KAS Pods: ${KAS_PODS[*]}"
    echo ""
    echo "$rule"
    echo " Go Heap MemStats"
    echo "$rule"
    echo ""
    # Prefer an aligned table; fall back to the raw TSV, then to a placeholder.
    column -t -s$'\t' "$OUTPUT_DIR/memstats_summary.tsv" 2>/dev/null ||
      cat "$OUTPUT_DIR/memstats_summary.tsv" 2>/dev/null || echo "(not available)"
    echo ""
    echo "$rule"
    echo " Top Heap Allocators"
    echo "$rule"
    echo ""
    cat "$OUTPUT_DIR/top_allocations.txt" 2>/dev/null || echo "(not available)"
    echo ""
    echo "$rule"
    echo " Top CPU Consumers"
    echo "$rule"
    echo ""
    cat "$OUTPUT_DIR/top_cpu.txt" 2>/dev/null || echo "(not available)"
    echo ""
    echo "$rule"
    echo " Files"
    echo "$rule"
    echo ""
    # List paths relative to the output directory by running find from inside
    # it, so the directory name is never used as a sed pattern.
    (cd "$OUTPUT_DIR" && find . -type f | sort | sed 's|^\./||')
  } > "$report"

  echo ""
  echo "=============================================="
  echo " Analysis complete!"
  echo "=============================================="
  echo " Report: $report"
  echo " Output: $OUTPUT_DIR"
  echo " Captures: $capture_count"
  echo ""
  echo " Interactive exploration:"
  # %q shell-quotes the directory so the suggested commands can be pasted
  # even when the path contains spaces or other special characters.
  echo " # Karpenter heap (peak)"
  printf ' go tool pprof -http=:8080 $(ls -S %q/profiles/karpenter_heap_*.pb.gz | head -1)\n' "$OUTPUT_DIR"
  echo ""
  echo " # Karpenter CPU (peak)"
  printf ' go tool pprof -http=:8080 $(ls -S %q/profiles/karpenter_cpu_*.pb.gz | head -1)\n' "$OUTPUT_DIR"
  echo ""
  echo " # etcd heap (peak)"
  printf ' go tool pprof -http=:8080 $(ls -S %q/profiles/etcd_0_heap_*.pb.gz | head -1)\n' "$OUTPUT_DIR"
  echo ""
  echo " # KAS heap (peak)"
  printf ' go tool pprof -http=:8080 $(ls -S %q/profiles/kas_heap_*.pb.gz | head -1)\n' "$OUTPUT_DIR"
  echo ""
}
# =============================================================================
# Cleanup on exit (Ctrl+C or SIGTERM)
# =============================================================================
RUNNING=true
CAPTURE_PIDS=""

#######################################
# Run every analysis step, in order, on the data captured so far.
#######################################
run_analysis() {
  echo ""
  echo "[*] Running analysis on captured data..."
  generate_heap_texts
  extract_memstats
  run_pprof_top_heap
  run_pprof_top_cpu
  generate_svgs
  generate_report
}

#######################################
# Send SIGTERM to a process and all of its descendants.
# Killing only a capture subshell would leave the commands it started (the
# oc port-forward, a running curl / oc exec) alive as orphans. Children are
# listed BEFORE the parent is signalled, because they can no longer be found
# through their parent PID once it has exited. Without pgrep only the given
# PID is signalled.
# Arguments: $1 - PID at the root of the tree
#######################################
_kill_tree() {
  local pid="$1" child
  local -a children=()
  if command -v pgrep &>/dev/null; then
    mapfile -t children < <(pgrep -P "$pid" 2>/dev/null)
  fi
  kill -TERM "$pid" 2>/dev/null
  for child in ${children[@]+"${children[@]}"}; do
    _kill_tree "$child"
  done
}

#######################################
# Signal handler: stop the running captures, run the analysis, exit 0.
# Globals:   RUNNING (written), CAPTURE_PIDS (read)
#######################################
cleanup() {
  # Guard against re-entrant signals
  trap '' SIGINT SIGTERM
  echo ""
  echo ""
  echo "[*] Caught signal — stopping..."
  RUNNING=false
  # Kill any running capture subprocesses (use SIGTERM since children ignore SIGINT)
  if [[ -n "$CAPTURE_PIDS" ]]; then
    echo "[*] Killing capture subprocesses..."
    local pid
    # CAPTURE_PIDS is a space-separated list; word splitting is intended.
    for pid in $CAPTURE_PIDS; do
      _kill_tree "$pid"
    done
    wait $CAPTURE_PIDS 2>/dev/null
  fi
  run_analysis
  exit 0
}
trap cleanup SIGINT SIGTERM
# =============================================================================
# MAIN: continuous capture loop
# =============================================================================
START_TS=$(date -u '+%Y-%m-%dT%H:%M:%SZ')
START_EPOCH=$(date +%s)

printf '%s\n' \
  "" \
  "==============================================" \
  " Capturing profiles every ${CAPTURE_INTERVAL}s" \
  " CPU profiles: ${CPU_DURATION}s each" \
  " Press Ctrl+C to stop and run analysis" \
  "==============================================" \
  ""

CAPTURE_NUM=0
# RUNNING only ever holds "true" or "false"; it is set to false by the signal
# handler and by the max-captures check below.
while [[ "$RUNNING" == true ]]; do
  CAPTURE_NUM=$((CAPTURE_NUM + 1))
  ELAPSED=$(($(date +%s) - START_EPOCH))
  ELAPSED_MIN=$((ELAPSED / 60))
  ELAPSED_SEC=$((ELAPSED % 60))
  printf -v LABEL '%04d' "$CAPTURE_NUM"
  echo "[$(date -u '+%H:%M:%S')] Capture #${CAPTURE_NUM} (T+${ELAPSED_MIN}m${ELAPSED_SEC}s)..."

  # The three captures run concurrently, each in a subshell that ignores
  # SIGINT, so that only this parent shell reacts to Ctrl+C.
  (
    trap '' SIGINT
    capture_karpenter "$OUTPUT_DIR/profiles" "$LABEL"
  ) &
  PID_KARP=$!
  (
    trap '' SIGINT
    capture_etcd "$OUTPUT_DIR/profiles" "$LABEL"
  ) &
  PID_ETCD=$!
  (
    trap '' SIGINT
    capture_kas "$OUTPUT_DIR/profiles" "$LABEL"
  ) &
  PID_KAS=$!
  CAPTURE_PIDS="$PID_KARP $PID_ETCD $PID_KAS"

  # Block until the round is finished. A signal arriving here runs the trap
  # handler, which uses CAPTURE_PIDS to stop the children; RUNNING is checked
  # again right after the wait.
  wait $CAPTURE_PIDS 2>/dev/null
  CAPTURE_PIDS=""
  [[ "$RUNNING" == true ]] || break

  print_heap_summary "$OUTPUT_DIR/profiles" "$LABEL"

  # Stop after max captures if set (0 means no limit)
  if ((MAX_CAPTURES > 0 && CAPTURE_NUM >= MAX_CAPTURES)); then
    echo ""
    echo "[*] Reached max captures ($MAX_CAPTURES), stopping..."
    RUNNING=false
    break
  fi

  # Pause before the next round in one-second steps, re-checking RUNNING
  # before every step.
  for ((WAITED = 0; WAITED < CAPTURE_INTERVAL; WAITED++)); do
    [[ "$RUNNING" == true ]] || break
    sleep 1
  done
done

# If we exited the loop via max-captures (not signal), run analysis
if [[ "$RUNNING" != true ]]; then
  run_analysis
fi
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment