@ryran
Created June 10, 2019 21:19
OCPv4: recover from expired control plane certificates
#!/bin/bash
#
# This extremely rough nonsense is an attempt to automate the disaster recovery
# expired certs documentation published at
# https://docs.openshift.com/container-platform/4.1/disaster_recovery/scenario-3-expired-certs.html
# ... Which was last reviewed on 2019/06/10
#
# Please contact [email protected] with suggestions or corrections
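#
# Assumptions (inferred from what this script does, not spelled out in the doc):
#   - it runs from a host with non-interactive SSH access to every master/worker
#   - the SSH user (typically 'core') has passwordless sudo on those nodes
#   - 'oc' and 'podman' are available on the bootstrapping master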
# CUSTOMIZE THESE:
MASTER=master0 # SSH hostname of master to use for bootstrapping everything
ALTMASTERS="master1 master2" # SSH hostnames of additional masters (comment if none)
WORKERS="worker0 worker1" # SSH hostnames of workers
RELEASE=4.1.0
RELEASE_IMAGE=quay.io/openshift-release-dev/ocp-release:${RELEASE}
c_YELLOW='\033[1;33m'
c_BLUE='\033[1;34m'
c_MAGENTA='\033[1;35m'
c_CYAN='\033[1;36m'
c_GREEN='\033[1;32m'
c_RED='\033[1;31m'
c_reset='\033[0;0m'
c_err=${c_RED}
c_msg=${c_YELLOW}
print() {
    local c_host host=$1
    shift
    [[ ${host} == local ]] && c_host=${c_BLUE} || c_host=${c_MAGENTA}
    printf "${c_host}[${host}] \t${c_msg}%s ...${c_reset}\n" "${*}"
}
errquit() {
    # Use %s + "${*}" so unquoted multi-word messages print intact
    printf "${c_err}ERROR: %s${c_reset}\n" "${*}"
    exit 1
}
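# Optional preflight (a sketch, not part of the documented recovery procedure):
# confirm every host configured above answers over SSH before changing anything
for node in ${MASTER} ${ALTMASTERS} ${WORKERS}; do
    print local "Checking SSH connectivity to ${node}"
    ssh -o BatchMode=yes -o ConnectTimeout=10 ${node} true || errquit "Cannot reach ${node} over SSH"
done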
print ${MASTER} "Inspecting ${RELEASE_IMAGE}"
KAO_IMAGE=$( ssh ${MASTER} oc adm release info --registry-config=/var/lib/kubelet/config.json ${RELEASE_IMAGE} --image-for=cluster-kube-apiserver-operator )
# Make sure master can pull the cluster-kube-apiserver-operator image
print ${MASTER} "Pulling ${KAO_IMAGE} w/podman"
ssh ${MASTER} sudo podman pull --authfile=/var/lib/kubelet/config.json ${KAO_IMAGE} || errquit "Unable to pull image '${KAO_IMAGE}' on ${MASTER}"
# Create recovery API server and grab stdout/stderr
print ${MASTER} "Kicking off 'recovery-apiserver create' w/podman"
podrun_output=$( ssh ${MASTER} sudo podman run --net=host -v /etc/kubernetes/:/etc/kubernetes/:Z --entrypoint=/usr/bin/cluster-kube-apiserver-operator ${KAO_IMAGE} recovery-apiserver create 2>&1 )
# Grab kubeconfig filename path from output
kubeconfig_file=$( grep -o "export KUBECONFIG=.*" <<<"${podrun_output}" )
if ! [[ $kubeconfig_file ]]; then
    echo --
    echo "For debugging, here's all the output from the podman run cmd:"
    echo "${podrun_output}"
    echo --
    errquit "Unable to get new kubeconfig credentials from recovery apiserver"
fi
destroy_command="ssh ${MASTER} sudo podman run --net=host -v /etc/kubernetes/:/etc/kubernetes/:Z --entrypoint=/usr/bin/cluster-kube-apiserver-operator ${KAO_IMAGE} recovery-apiserver destroy"
cleanup() {
    print local "If there were no problems OR if you want to run this script a second time, you should destroy the recovery-apiserver with the following command:"
    printf "${c_GREEN}${destroy_command}${c_reset}\n"
}
trap cleanup EXIT INT
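# (The EXIT/INT trap guarantees the destroy reminder is printed even if a later step fails.)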
# Strip the "export KUBECONFIG=" prefix so we're left with just the file path
kubeconfig_file=${kubeconfig_file#*=}
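# kubeconfig_file now holds the path printed by 'recovery-apiserver create',
# typically something like (exact path may differ by release):
#   /etc/kubernetes/static-pod-resources/recovery-kube-apiserver-pod/admin.kubeconfig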
# Create script which comes straight from docs
cat >restore_kubeconfig.sh <<\EOF
#!/bin/bash
set -eou pipefail
# context
intapi=$(oc get infrastructures.config.openshift.io cluster -o "jsonpath={.status.apiServerURL}")
context="$(oc config current-context)"
# cluster
cluster="$(oc config view -o "jsonpath={.contexts[?(@.name==\"$context\")].context.cluster}")"
server="$(oc config view -o "jsonpath={.clusters[?(@.name==\"$cluster\")].cluster.server}")"
# token
ca_crt_data="$(oc get secret -n openshift-machine-config-operator node-bootstrapper-token -o "jsonpath={.data.ca\.crt}" | base64 --decode)"
namespace="$(oc get secret -n openshift-machine-config-operator node-bootstrapper-token -o "jsonpath={.data.namespace}" | base64 --decode)"
token="$(oc get secret -n openshift-machine-config-operator node-bootstrapper-token -o "jsonpath={.data.token}" | base64 --decode)"
export KUBECONFIG="$(mktemp)"
kubectl config set-credentials "kubelet" --token="$token" >/dev/null
ca_crt="$(mktemp)"; echo "$ca_crt_data" > $ca_crt
kubectl config set-cluster $cluster --server="$intapi" --certificate-authority="$ca_crt" --embed-certs >/dev/null
kubectl config set-context kubelet --cluster="$cluster" --user="kubelet" >/dev/null
kubectl config use-context kubelet >/dev/null
cat "$KUBECONFIG"
EOF
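# The restore_kubeconfig.sh script above (verbatim from the docs) builds a fresh
# bootstrap kubeconfig: it pulls the node-bootstrapper token and CA cert out of the
# openshift-machine-config-operator namespace, points them at the API server URL from
# the cluster Infrastructure object, and prints the resulting kubeconfig to stdout.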
cat >tmpscript.sh <<-EOF
#!/bin/bash
# Bail as soon as anything fails
set -e
print() {
    printf "${c_MAGENTA}[${MASTER} tmpscript.sh] \t${c_msg}\${@} ...${c_reset}\n"
}
errquit() {
    printf "${c_err}ERROR: \${@}${c_reset}\n"
    exit 1
}
# Since we're gonna do oc cmds as core, we need to make this readable
print "Making ${kubeconfig_file} world-readable and exporting it as KUBECONFIG"
sudo chmod +r ${kubeconfig_file}
export KUBECONFIG=${kubeconfig_file}
# Wait for recovery API server to come up (shouldn't take long at all)
print "Waiting a bit for recovery apiserver"
sleep 20
print "DEBUG: checking auth/nodes"
oc whoami || :
oc get nodes || :
oc get namespace kube-system || :
timeout=5m
print "Waiting \${timeout} for 'oc get namespace kube-system' to succeed"
timeout \${timeout} bash -c 'until oc get namespace kube-system &>/dev/null; do echo Waiting for recovery apiserver to come up...; sleep 2; done'
# Run the regenerate-certificates command, fixing certs in API, overwriting old certs on local drive, and restarting static pods to pick them up
cmd="sudo podman run --net=host -v /etc/kubernetes/:/etc/kubernetes/:Z --entrypoint=/usr/bin/cluster-kube-apiserver-operator ${KAO_IMAGE} regenerate-certificates"
print "Executing cmd: \${cmd}"
\${cmd}
# Force new rollouts for control plane
# ("it will reinstall itself on the other nodes because the kubelet is connected to API servers using an internal load balancer")
print "Patching kubeapiserver to force redployment"
oc patch kubeapiserver cluster -p='{"spec": {"forceRedeploymentReason": "recovery-'"$( date --rfc-3339=ns )"'"}}' --type=merge
print "Patching kubecontrollermanager to force redployment"
oc patch kubecontrollermanager cluster -p='{"spec": {"forceRedeploymentReason": "recovery-'"$( date --rfc-3339=ns )"'"}}' --type=merge
print "Patching kubescheduler to force redployment"
oc patch kubescheduler cluster -p='{"spec": {"forceRedeploymentReason": "recovery-'"$( date --rfc-3339=ns )"'"}}' --type=merge
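# (Optional) Once the main apiservers are healthy again, you can check that new revisions
# rolled out with something like 'oc get pods -n openshift-kube-apiserver' against a
# working kubeconfig; the recovery apiserver is only a temporary bootstrap and is
# destroyed at the end.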
# Create a bootstrap kubeconfig
print "Executing restore_kubeconfig.sh from https://docs.openshift.com/container-platform/4.1/disaster_recovery/scenario-3-expired-certs.html"
bash restore_kubeconfig.sh >kubeconfig
print "Copying ./kubeconfig to /etc/kubernetes/kubeconfig"
sudo cp kubeconfig /etc/kubernetes/kubeconfig
# Get the CA certificate used to validate connections from the API server
print "Grabbing CA cert (cm kube-apiserver-to-kubelet-client-ca in ns openshift-kube-apiserver-operator)"
oc get configmap kube-apiserver-to-kubelet-client-ca -n openshift-kube-apiserver-operator --template='{{ index .data "ca-bundle.crt" }}' >ca.crt
print "Copying ./ca.crt to /etc/kubernetes/ca.crt"
sudo cp ca.crt /etc/kubernetes/ca.crt
# Cleanup
rm -f restore_kubeconfig.sh kubeconfig ca.crt
# Recover the kubelet service (delete stale kubelet data)
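# (Wiping /var/lib/kubelet/pki and the old kubeconfig forces the kubelet to re-bootstrap
# from /etc/kubernetes/kubeconfig and request fresh certificates, which surface as the
# CSRs approved at the end of the outer script.)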
print "Stopping kubelet.service and clearing out /var/lib/kubelet/{pki,kubeconfig}"
sudo systemctl stop kubelet
sudo rm -rf /var/lib/kubelet/pki /var/lib/kubelet/kubeconfig
sudo systemctl start kubelet
EOF
# Copy to master
print local "Copying tmpscript.sh & restore_kubeconfig.sh to ${MASTER}"
scp tmpscript.sh restore_kubeconfig.sh ${MASTER}:
# Do all our work
print local "Kicking off tmpscript.sh on ${MASTER}"
ssh ${MASTER} bash tmpscript.sh || errquit Aborting due to failure initiating recovery on ${MASTER}
# Grab new kubeconfig & ca.crt
print local "Grabbing /etc/kubernetes/{kubeconfig,ca.crt} from ${MASTER}"
scp ${MASTER}:/etc/kubernetes/kubeconfig . || errquit Aborting due to error grabbing /etc/kubernetes/kubeconfig from ${MASTER}
scp ${MASTER}:/etc/kubernetes/ca.crt . || errquit Aborting due to error grabbing /etc/kubernetes/ca.crt from ${MASTER}
# If we have more than 1 master...
if [[ ${ALTMASTERS} ]]; then
    for altmaster in ${ALTMASTERS}; do
        # Push bootstrap kubeconfig & new ca.crt
        print local "Pushing ./{kubeconfig,ca.crt} to ${altmaster}"
        scp kubeconfig ca.crt ${altmaster}: || errquit Aborting due to error pushing files to ${altmaster}
        print ${altmaster} "Stopping kubelet.service and clearing out /var/lib/kubelet/{pki,kubeconfig}"
        ssh ${altmaster} <<-EOF
# Put new kubeconfig/ca.crt in place
sudo cp kubeconfig ca.crt /etc/kubernetes/
rm -f kubeconfig ca.crt
# Recover the kubelet service (delete stale kubelet data)
sudo systemctl stop kubelet
sudo rm -rf /var/lib/kubelet/pki /var/lib/kubelet/kubeconfig
sudo systemctl start kubelet
EOF
    done
fi
for worker in ${WORKERS}; do
    print local "Pushing ./ca.crt to ${worker}"
    scp ca.crt ${worker}: || errquit Aborting due to error pushing files to ${worker}
    print ${worker} "Stopping kubelet.service and clearing out /var/lib/kubelet/{pki,kubeconfig}"
    ssh ${worker} <<-EOF
# Put new ca.crt in place
sudo cp ca.crt /etc/kubernetes/
rm -f ca.crt
# Recover the kubelet service (delete stale kubelet data)
sudo systemctl stop kubelet
sudo rm -rf /var/lib/kubelet/pki /var/lib/kubelet/kubeconfig
sudo systemctl start kubelet
EOF
done
print ${MASTER} "Approving pending CSRs"
ssh ${MASTER} <<-EOF
export KUBECONFIG=${kubeconfig_file}
# Approve the pending node-bootstrapper CSRs
oc get csr --no-headers | awk '\$4=="Pending"'
oc get csr --no-headers | awk '\$4=="Pending" {system("oc adm certificate approve "\$1)}'
# Destroy the recovery API server and cleanup containers
#${destroy_command}
#sudo podman rm -a
# (Handled by bash trap right now)
EOF
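# NOTE: cert recovery usually produces more than one round of CSRs per node (a client
# CSR, then a serving CSR once the first one is approved), so a single approval pass may
# not catch everything. A hedged follow-up you can repeat a few minutes later, reusing
# the recovery kubeconfig:
#   ssh ${MASTER} "export KUBECONFIG=${kubeconfig_file}; oc get csr"
#   ssh ${MASTER} "export KUBECONFIG=${kubeconfig_file}; oc get csr --no-headers | awk '\$4==\"Pending\" {system(\"oc adm certificate approve \"\$1)}'"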