Skip to content

Instantly share code, notes, and snippets.

@bhamilton
Last active April 27, 2023 21:32
Show Gist options
  • Save bhamilton/90182aa335ed3b0d928c8ca649bbc769 to your computer and use it in GitHub Desktop.
Save bhamilton/90182aa335ed3b0d928c8ca649bbc769 to your computer and use it in GitHub Desktop.
#!/bin/bash
#######################################
# Report whether the Nginx entry in the btool info text says "running".
# Globals:   BTOOL_INFO (read) - text searched for lines containing "Nginx".
# Returns:   0 when the second ':'-separated field of the matching line,
#            trimmed of surrounding whitespace, is exactly "running";
#            1 otherwise (including when no line mentions Nginx).
#######################################
function is_active() {
  local nginx_state

  # xargs with no command echoes its input back, which trims the padding
  # around the extracted field.
  nginx_state=$(echo "$BTOOL_INFO" | grep Nginx | awk -F":" '{print $2}' | xargs)

  case "${nginx_state}" in
    running)
      return 0
      ;;
    *)
      return 1
      ;;
  esac
}
#######################################
# Poll once per second until a path exists or a timeout elapses.
# Arguments: $1 - path to wait for
#            $2 - timeout in whole seconds (optional, default 10, minimum 1)
# Outputs:   prints a message to stdout when the timeout is invalid
# Returns:   0 if the path exists when polling stops; 1 if the timeout is
#            invalid or the path still does not exist after the timeout.
#######################################
function wait_file() {
  local file="$1"
  local wait_seconds="${2:-10}" # 10 seconds as default timeout
  local remaining

  # Reject anything that is not a positive whole number before doing
  # arithmetic on it (10# forces base 10 so "08" is not read as octal).
  if ! [[ "$wait_seconds" =~ ^[0-9]+$ ]] || (( 10#$wait_seconds < 1 )); then
    echo 'At least 1 second is required'
    return 1
  fi

  remaining=$((10#$wait_seconds))
  while (( remaining > 0 )) && [[ ! -e "$file" ]]; do
    sleep 1
    remaining=$((remaining - 1))
  done

  # Decide on the file itself rather than on the counter, so a file that
  # shows up during the very last sleep still counts as success.
  [[ -e "$file" ]]
}
#######################################
# Ask the cos-image-manager socket to unpack an image, then wait for the
# unpacked rootfs directory to appear.
# Globals:   original_php_tag (read)  - names the rootfs directory waited on
#            ERROR_RESPONSE (written) - 11 on missing arguments,
#                                       12 when the rootfs never appears
# Arguments: $1 - repo, $2 - image, $3 - tag (appended to the image name
#            verbatim, so it must carry its own separator, e.g. "@sha256:...")
# Outputs:   progress, the image manager's output, and errors on stdout
# Returns:   0 when the rootfs path exists within 120 seconds, 1 otherwise.
#######################################
function pull_image() {
  local repo=$1
  local image=$2
  local tag=$3
  local imagepath
  local sock
  local response
  local runc_artifact

  if [ $# -lt 3 ]; then
    echo "Missing argument for pull_image <repo> <image> <tag>"
    # Set global error response code to fail job with.
    ERROR_RESPONSE=11
    return 1
  fi

  # Image manager API does a pull, then an unpack.
  echo "Pulling and unpacking image ${image}"
  imagepath="${repo}/${image}${tag}"
  sock="/var/run/cos-image-manager/manager.sock"

  # Capture what the image manager (or socat) says so that the failure
  # message below can actually report it; echo it so the job log keeps it.
  response=$(/chef-container.sh /bin/bash -c "echo 'unpack ${imagepath}' | socat - UNIX-CONNECT:$sock" 2>&1)
  if [ -n "${response}" ]; then
    echo "${response}"
  fi

  # NOTE(review): this path is hard-coded to cos-runtime-php and uses the
  # global original_php_tag rather than this function's arguments; it matches
  # the only caller visible in this file - confirm before reusing elsewhere.
  runc_artifact="/mnt/disks/data/rootfs/quay.io/getpantheon/cos-runtime-php/${original_php_tag}"
  if wait_file "$runc_artifact" 120; then
    echo "Successfully pulled ${imagepath}"
    return 0
  fi

  echo "Error pulling image: ${response}"
  # Set global error response code to fail job with.
  ERROR_RESPONSE=12
  return 1
}
#######################################
# If chef changed the php runtime image and the binding no longer answers
# /pantheon_healthcheck with HTTP 200, re-pull the previous image, point the
# runc config back at it and restart php.
# Globals:   CONFIG_FILE (read/rewritten) - runc config.json of the binding
#            original_php_tag (read)      - image tag recorded before chef ran
#            BTOOL_INFO (read)            - source of the "Binding URL" line
#            binding_id (read)            - binding passed to btool restart
#            ERROR_RESPONSE (written)     - 13 once remediation was attempted
#                                           (pull_image may set 11/12 itself)
# Returns:   0
#######################################
function healthcheck() {
  local binding_url
  local status
  local current_php_tag
  local updated_config

  current_php_tag="$(basename "$(sudo cat "${CONFIG_FILE}" | jq -r .root.path)")"
  if [ "$current_php_tag" = "$original_php_tag" ]; then
    # Bailing out early without performing the /pantheon_healthcheck if a new cos-runtime-php image
    # has not been pushed to prevent the job from emitting noise that is not related to a php update.
    echo "[no-op] The image for cos-runtime-php has not changed since the last chef-solo-bindings run."
    return 0
  fi

  binding_url=$(echo "$BTOOL_INFO" | grep 'Binding URL' | cut -f2- -d: | xargs)
  # Keep the status code of the last HTTP status line; empty when curl got
  # no response at all.
  status=$(curl -ISs "${binding_url}pantheon_healthcheck" \
    | awk '/^HTTP\// {code = $2} END {print code}')

  # String comparison on purpose: an empty status (endpoint unreachable) must
  # count as a failed healthcheck instead of raising a numeric-test error.
  if [ "$status" = "200" ]; then
    return 0
  fi

  # The current image tag placed by chef is failing our healthcheck.
  if pull_image "quay.io/getpantheon" "cos-runtime-php" "@sha256:${original_php_tag}"; then
    # Revert to the image that was present before chef-solo ran.
    # Build the new config in memory first: piping jq straight back into the
    # file it is reading can truncate the file before it has been read.
    # NOTE(review): reads elsewhere in this file use `sudo cat`; this read and
    # the tee below run unprivileged, as before - confirm permissions.
    if updated_config=$(jq --arg current "${current_php_tag}" --arg original "${original_php_tag}" \
        '.root.path |= (sub($current; $original))' < "${CONFIG_FILE}") \
      && [ -n "${updated_config}" ]; then
      printf '%s\n' "${updated_config}" | tee "${CONFIG_FILE}" 2>&1 >/dev/null
      # Restart the container to pickup changes.
      /chef-container.sh timeout -k 30s 600s btool -b "${binding_id}" restart php
    else
      echo "Error rewriting ${CONFIG_FILE}; leaving it untouched and skipping the php restart."
    fi
    # Set global error response code to fail job with.
    ERROR_RESPONSE=13
  fi
  return 0
}
# Main flow: run chef-solo for one binding, then health-check the binding and
# roll the php image back if the run broke it.
# Reads from the environment: comment, infra_image_tag, binding_id, job_id,
# task_id, workflow_id, trace_id, max_bindings, JENKINS_HOME, JOB_NAME,
# BUILD_NUMBER (presumably Jenkins job parameters/variables - confirm).
echo "$comment"
export TAG=$infra_image_tag

# Record previous state, prior to chef-solo run.
CONFIG_FILE="/mnt/disks/data/containers/${binding_id}/runc/php/config.json"
original_php_tag="$(basename "$(sudo cat "${CONFIG_FILE}" | jq -r .root.path)")"

# Merge the contents of /etc/pantheon/settings.json with this job's specific
# JSON values (job values win on key collisions). The values are passed with
# --arg so jq does the JSON escaping; they are all emitted as strings.
CHEF_FILE="jenkins-chef-${JOB_NAME}-${BUILD_NUMBER}-${RANDOM}.json"
cat /etc/pantheon/settings.json \
  | jq -s \
    --arg binding_id "$binding_id" \
    --arg job_id "$job_id" \
    --arg task_id "$task_id" \
    --arg workflow_id "$workflow_id" \
    --arg trace_id "$trace_id" \
    --arg max_bindings "$max_bindings" \
    'add + {
      run_list: ["recipe[endpoint::bindings]"],
      binding_id: $binding_id,
      job_id: $job_id,
      task_id: $task_id,
      workflow_id: $workflow_id,
      trace_id: $trace_id,
      max_bindings: $max_bindings
    }' > "$JENKINS_HOME"/chef/"$CHEF_FILE"
cat "$JENKINS_HOME"/chef/"$CHEF_FILE"

# chef-runner sets up the docker run, and takes the arguments we pass to it.
# /chef will be the $JENKINS_HOME/chef dir inside the container.
# Prefix with the TAG env var to control which release to use when running chef.
/chef-container.sh chef-solo --no-fork -j /chef/"$CHEF_FILE" -l debug || ERROR_RESPONSE=$?

# We do not want to trigger socket activation and resurrect bindings to do
# this check. Similarly, explicitly checking for Nginx to ensure this
# remediation only gets applied to appservers.
BTOOL_INFO=$(/chef-container.sh PYTHONWARNINGS=ignore python /usr/local/bin/btool -b "${binding_id}" info)
if is_active; then
  healthcheck
fi

rm -f "$JENKINS_HOME"/chef/"$CHEF_FILE"

# Error Responses:
# chef: https://github.com/chef/chef/blob/main/docs/dev/design_documents/client_exit_codes.md
#
# 11 - cos-image-manager malformed request
# 12 - cos-image-manager failure pulling/extracting image
# 13 - healthcheck has failed; remediation attempted
#
# ERROR_RESPONSE is only assigned on failure paths, so default to 0: passing
# an empty string to `exit` is itself an error and would fail a clean run.
exit "${ERROR_RESPONSE:-0}"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment