Skip to content

Instantly share code, notes, and snippets.

@yershalom
Created October 23, 2019 14:42
Show Gist options
  • Save yershalom/1c0776d771f0bb4533a3ab9be8bab9ae to your computer and use it in GitHub Desktop.
Save yershalom/1c0776d771f0bb4533a3ab9be8bab9ae to your computer and use it in GitHub Desktop.
Script for ES rolling restart
# Epoch seconds at script start; later elapsed-time checks are measured from here.
START_TIME=$(date +%s)

# ---- Global variables ----
# First two dash-separated fields of the machine's hostname.
HOSTNAME=$(hostname | cut -d"-" -f-2)
# Last "inet" entry reported by `ip addr show`, after dropping lines that
# mention inet6 or 127.0.0.1, with the "/prefix" suffix stripped.
IP=$(
  ip addr show \
    | grep inet \
    | grep "inet6\|127.0.0.1" -v \
    | awk '{ print $2 }' \
    | tail -1 \
    | cut -d"/" -f1
)
REGISTERED_SERVER_COUNT=0
# Becomes "true" once this host has raised the stop flag.
I_RAISED_THE_FLAG="false"
# Verify that elasticsearch is running: the root endpoint's response must
# mention this host's name. An empty HOSTNAME is rejected explicitly, because
# an empty fixed-string pattern would match any response.
if [[ -z "$HOSTNAME" ]] || ! curl -s -XGET localhost:9200 2>&1 | grep -qF -- "$HOSTNAME"; then
  echo "Elasticsearch on localhost:9200 did not answer as '${HOSTNAME}', aborting" >&2
  exit 123
fi

# "Push" this server's details into the 'rolling_restart' attribute; if the
# attribute doesn't exist yet, this creates it.
echo "Adding server details to cluster's 'rolling_restart' attribute"
curl -s -X PUT "localhost:9200/_cluster/settings" \
  -H 'Content-Type: application/json' \
  -d "{ \"transient\" : { \"cluster.routing.allocation.exclude.rolling_restart\" : {\"${HOSTNAME}\": \"${IP}\"} }}" \
  >/dev/null 2>&1
#######################################
# Remove this host's traces from the cluster's transient settings.
# Globals:
#   I_RAISED_THE_FLAG (read) - when "true", the 'exclude.stop' flag is cleared
#   HOSTNAME (read)          - key of this host's 'rolling_restart' entry
# Arguments: none
# Outputs:   a progress message on stdout; curl output is discarded
#######################################
cleanup() {
  local settings_url="localhost:9200/_cluster/settings"

  if [ "$I_RAISED_THE_FLAG" == "true" ]; then
    # The payload must be well-formed JSON (two opening braces, two closing
    # braces), otherwise the request is rejected and the flag stays set.
    curl -s -X PUT "$settings_url" \
      -H 'Content-Type: application/json' \
      -d '{ "transient" : { "cluster.routing.allocation.exclude.stop" : null }}' \
      >/dev/null 2>&1
  fi

  echo "Removing server details from cluster's 'rolling_restart' attribute"
  # Setting the host's key to null removes only this host's entry.
  curl -s -X PUT "$settings_url" \
    -H 'Content-Type: application/json' \
    -d "{ \"transient\" : { \"cluster.routing.allocation.exclude.rolling_restart\" : {\"${HOSTNAME}\": null }}}" \
    >/dev/null 2>&1
}
# Check that this host is registered and that no host has raised the stop flag.
# Allow up to 15 seconds for all servers in the batch to register; if after
# 10 seconds the localhost still is not registered, raise a flag so the other
# servers stop as well.
#   - no flag, registered, more than 15s elapsed  -> proceed with the restart
#   - not registered after more than 10s          -> raise the flag (once)
#   - flag seen, or this host raised it earlier   -> cleanup and exit 123
while true; do
  echo "Checking other hosts in current batch status, this will take upto 20 seconds"
  CURR_TIME=$(date +%s)
  TIMEDIFF=$((CURR_TIME - START_TIME))

  # Take one snapshot per iteration so both checks look at the same state.
  CLUSTER_SETTINGS=$(curl -s -XGET localhost:9200/_cluster/settings)

  LOCALHOST_IS_REGISTERED=""
  if jq -r ".transient.cluster.routing.allocation.exclude.rolling_restart" <<<"$CLUSTER_SETTINGS" \
    | grep -qF -- "$HOSTNAME"; then
    LOCALHOST_IS_REGISTERED="true"
  fi

  # The flag is written to and cleared from 'exclude.stop', so read it from
  # that same key (not from under 'rolling_restart').
  IS_FLAG_RAISED=$(jq -r ".transient.cluster.routing.allocation.exclude.stop" <<<"$CLUSTER_SETTINGS")

  if [ "$IS_FLAG_RAISED" != "true" ] && (( TIMEDIFF > 15 )) && [ "$LOCALHOST_IS_REGISTERED" == "true" ]; then
    break
  elif [ "$I_RAISED_THE_FLAG" != "true" ] && (( TIMEDIFF > 10 )) && [ "$LOCALHOST_IS_REGISTERED" != "true" ]; then
    # Raise the flag only once; without this guard a host that never manages
    # to register would re-enter this branch forever and never clean up.
    echo "Couldn't register to 'rolling_restart' attrebute, raising flag to stop process"
    curl -s -X PUT "localhost:9200/_cluster/settings" \
      -H 'Content-Type: application/json' \
      -d '{ "transient" : { "cluster.routing.allocation.exclude.stop" : "true" }}' \
      >/dev/null 2>&1
    I_RAISED_THE_FLAG="true"
  elif [ "$IS_FLAG_RAISED" == "true" ] || [ "$I_RAISED_THE_FLAG" == "true" ]; then
    # Either another host asked everyone to stop, or this host raised the flag
    # on an earlier pass.
    # NOTE(review): the raiser clears the flag on the pass after raising it,
    # so it is visible for roughly one polling interval - confirm that is
    # long enough for the other hosts to observe it.
    cleanup
    echo "Cleaned up, now exiting.."
    exit 123
  fi
  sleep 1
done
# Preper list of ip's (needs to be one-liner to be saved in var)
echo "Prepering list of IP's to exclude"
REGISTERED_SERVER_COUNT=`curl -s -XGET localhost:9200/_cluster/settings | json_pp | jq -r ".transient.cluster.routing.allocation.exclude.rolling_restart" | grep "[A-Za-z0-9]" | wc -l`
runcount=0
IPLIST=`curl -s -XGET localhost:9200/_cluster/settings | json_pp | jq -r ".transient.cluster.routing.allocation.exclude.rolling_restart" | grep "[A-Za-z0-9]" | cut -d":" -f2 | cut -d"\"" -f2 |
while read line; do
runcount=$((runcount+1))
if [[ runcount -eq REGISTERED_SERVER_COUNT ]]; then
echo -n "$line"
else
echo -n "$line,"
fi
done`
# Write IPLIST to exclude._ip
echo "Writing IP list to cluster's 'cluster.routing.allocation.exclude._ip' attribute"
if [ ! "$IPLIST" == "" ]; then
curl -s -X PUT "localhost:9200/_cluster/settings" -H 'Content-Type: application/json' -d"{ \"transient\" : { \"cluster.routing.allocation.exclude._ip\" : \"${IPLIST}\" }}" 2>&1 >/dev/null
else
exit 123
fi
# Wait for the shards to be evicted from this host.
# curl -s exits 0 even when the health wait times out, so the response body is
# checked for "timed_out": false; an unreachable node produces no body and
# fails the same check.
echo "Waiting for all shared to evict from dest host"
if ! curl -s -X GET "localhost:9200/_cluster/health?wait_for_status=yellow&wait_for_no_relocating_shards=true&wait_for_no_initializing_shards=true&timeout=300s" 2>&1 \
  | grep -Eq '"timed_out"[[:space:]]*:[[:space:]]*false'; then
  echo "Cluster health wait failed or timed out before shards were evicted, aborting" >&2
  exit 123
fi
# Update the plugin via chef. chef-client is invoked twice in a row; each
# time its stdout is discarded and its stderr is captured into $a.
# NOTE(review): the reason for the second run is not visible here - confirm
# it is intentional before removing it.
echo "Running chef, could take a while"
for chef_pass in 1 2; do
  a=$(chef-client 2>&1 >/dev/null)
done

# Restart elasticsearch: stop, pause five seconds, start.
echo "Resarting elasticsearch"
ES_PROC_CTL=/outbrain/elasticsearch/cpuset-split-cores
"$ES_PROC_CTL" proc stop 2>&1 >/dev/null
sleep 5
"$ES_PROC_CTL" proc start 2>&1 >/dev/null
# Check that the host is alive again; if not, keep waiting.
# The full hostname is looked up once and matched as a fixed, quoted string
# against the node listing.
echo "Verifiying the host returned to cluster"
FULL_HOSTNAME=$(hostname)
until curl -s 'localhost:9200/_cat/nodes?v' | grep -qF -- "$FULL_HOSTNAME"; do
  sleep 3
done

# The node is back: remove this host's registration (and the stop flag, if
# this host raised it).
cleanup
# Block until every server in the batch has de-registered: poll the
# 'rolling_restart' attribute every 3 seconds and stop once it yields no
# alphanumeric content, or only the literal "null".
echo "Waiting for all other servers in current batch to complete"
while :; do
  REGISTERED_SERVER_COUNT=$(
    curl -s -XGET localhost:9200/_cluster/settings \
      | json_pp \
      | jq -r ".transient.cluster.routing.allocation.exclude.rolling_restart" \
      | grep "[A-Za-z0-9]"
  )
  case "$REGISTERED_SERVER_COUNT" in
    "" | "null") break ;;
  esac
  sleep 3
done
# Delete IPLIST from exclude._ip
echo "Removing IP list from cluster's 'cluster.routing.allocation.exclude._ip' attribute"
curl -s -X PUT "localhost:9200/_cluster/settings" \
  -H 'Content-Type: application/json' \
  -d '{ "transient" : { "cluster.routing.allocation.exclude._ip" : null }}' \
  >/dev/null 2>&1

# Wait for the shards to return to the host.
# curl -s exits 0 even when the health wait times out, so the response body is
# checked for "timed_out": false; an unreachable node produces no body and
# fails the same check.
echo "Waiting for all shared to return to dest host"
if ! curl -s -X GET "localhost:9200/_cluster/health?wait_for_status=yellow&wait_for_no_relocating_shards=true&wait_for_no_initializing_shards=true&timeout=300s" 2>&1 \
  | grep -Eq '"timed_out"[[:space:]]*:[[:space:]]*false'; then
  echo "Cluster health wait failed or timed out while shards were returning, aborting" >&2
  exit 123
fi
# Check the health of the current index alias rather than the whole cluster,
# to avoid race conditions when the restart runs alongside indexing.
# Polls every 3 seconds until the response contains "status":"green".
echo "Validating the cluster is green again"
until curl -s localhost:9200/_cluster/health/outbrain_read_alias | grep '"status":"green"' 2>&1 >/dev/null; do
  sleep 3
done
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment