Created
October 23, 2019 14:42
-
-
Save yershalom/1c0776d771f0bb4533a3ab9be8bab9ae to your computer and use it in GitHub Desktop.
Script for ES rolling restart
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Epoch second at which this run started; the registration loop below measures
# its timeouts against this value.
START_TIME=$(date +%s)

# Set global vars
# NOTE: this overrides bash's built-in HOSTNAME with the first two
# dash-separated fields of the machine's host name. The rest of the script
# uses this shortened value as the host's key in the 'rolling_restart' attribute.
HOSTNAME=$(hostname | cut -d"-" -f-2)
# Last address printed by `ip addr show` that is neither IPv6 nor 127.0.0.1,
# with the /prefix length stripped.
IP=$(ip addr show | grep inet | grep -v "inet6\|127.0.0.1" | awk '{ print $2 }' | tail -1 | cut -d"/" -f1)
REGISTERED_SERVER_COUNT=0
I_RAISED_THE_FLAG="false"

# An empty key or address would register a meaningless entry in the cluster
# settings, so stop before touching the cluster.
if [[ -z "$HOSTNAME" || -z "$IP" ]]; then
  echo "Could not determine hostname or IP address" >&2
  exit 123
fi

# Verify that elasticsearch is running: the root endpoint's response must
# mention this host. -F matches the name literally, -q keeps grep silent.
curl -s -XGET localhost:9200 2>&1 | grep -qF -- "$HOSTNAME" || exit 123

# "push" server details to rolling_restart attr, if it doesn't exist, this will create it.
echo "Adding server details to cluster's 'rolling_restart' attribute"
curl -s -X PUT "localhost:9200/_cluster/settings" -H 'Content-Type: application/json' \
  -d "{ \"transient\" : { \"cluster.routing.allocation.exclude.rolling_restart\" : {\"${HOSTNAME}\": \"${IP}\"} }}" \
  >/dev/null 2>&1
#######################################
# Remove this host's coordination state from the cluster settings.
# Globals:
#   I_RAISED_THE_FLAG (read) - "true" when this host set the 'stop' flag
#   HOSTNAME          (read) - this host's key under 'rolling_restart'
# Arguments:
#   None
# Outputs:
#   A progress message on stdout; curl output is discarded.
#######################################
cleanup() {
  # Only the host that raised the 'stop' flag clears it again.
  if [[ "$I_RAISED_THE_FLAG" == "true" ]]; then
    curl -s -X PUT "localhost:9200/_cluster/settings" -H 'Content-Type: application/json' \
      -d '{ "transient" : { "cluster.routing.allocation.exclude.stop" : null }}' \
      >/dev/null 2>&1
  fi

  echo "Removing server details from cluster's 'rolling_restart' attribute"
  # Setting the host's key to null deletes just that entry.
  curl -s -X PUT "localhost:9200/_cluster/settings" -H 'Content-Type: application/json' \
    -d "{ \"transient\" : { \"cluster.routing.allocation.exclude.rolling_restart\" : {\"${HOSTNAME}\": null }}}" \
    >/dev/null 2>&1
}
# Checking if host registered and if no other host raised a failed flag.
#
# Each pass takes ONE snapshot of the cluster settings and derives from it:
#   LOCALHOST_IS_REGISTERED - "true" when $HOSTNAME is a key of 'rolling_restart'
#   IS_FLAG_RAISED          - value of 'cluster.routing.allocation.exclude.stop',
#                             the same key this loop writes and cleanup() clears
# Outcomes:
#   * a stop flag is up (ours or a peer's)   -> cleanup and exit 123
#   * not registered after more than 10s     -> raise the stop flag; the next
#                                               pass cleans up and exits
#   * registered and more than 15s elapsed   -> leave the loop and continue
# NOTE(review): the flag is cleared roughly one poll interval after it is
# raised, so a peer polling once per second may miss it; consider holding it
# longer if that proves to be a problem.
while true; do
  echo "Checking other hosts in current batch status, this will take upto 20 seconds"
  CURR_TIME=$(date +%s)
  TIMEDIFF=$((CURR_TIME - START_TIME))

  SETTINGS_JSON=$(curl -s -XGET localhost:9200/_cluster/settings)
  # Exact key test, so a host name that is a prefix of another cannot match.
  LOCALHOST_IS_REGISTERED=$(printf '%s' "$SETTINGS_JSON" \
    | jq -r --arg host "$HOSTNAME" \
        '(.transient.cluster.routing.allocation.exclude.rolling_restart // {})
         | if type == "object" then has($host) else false end' 2>/dev/null)
  IS_FLAG_RAISED=$(printf '%s' "$SETTINGS_JSON" \
    | jq -r '.transient.cluster.routing.allocation.exclude.stop // empty' 2>/dev/null)

  if [[ "$I_RAISED_THE_FLAG" == "true" || "$IS_FLAG_RAISED" == "true" ]]; then
    # Checked first so a host that raised the flag actually exits instead of
    # re-raising it on every pass.
    cleanup
    echo "Cleaned up, now exiting.."
    exit 123
  elif [[ "$LOCALHOST_IS_REGISTERED" != "true" ]] && (( TIMEDIFF > 10 )); then
    echo "Couldn't register to 'rolling_restart' attrebute, raising flag to stop process"
    curl -s -X PUT "localhost:9200/_cluster/settings" -H 'Content-Type: application/json' \
      -d '{ "transient" : { "cluster.routing.allocation.exclude.stop" : "true" }}' \
      >/dev/null 2>&1
    I_RAISED_THE_FLAG="true"
  elif [[ "$LOCALHOST_IS_REGISTERED" == "true" ]] && (( TIMEDIFF > 15 )); then
    break
  fi

  sleep 1
done
# Prepare the list of IPs: the values of every entry registered under
# 'rolling_restart', joined with commas. Count and list are derived from a
# single settings snapshot so they cannot disagree with each other.
echo "Prepering list of IP's to exclude"
ROLLING_RESTART_SETTINGS=$(curl -s -XGET localhost:9200/_cluster/settings)
# A missing attribute (jq null) or a non-object yields an empty list rather
# than the literal string "null".
IPLIST=$(printf '%s' "$ROLLING_RESTART_SETTINGS" \
  | jq -r '(.transient.cluster.routing.allocation.exclude.rolling_restart // {})
           | if type == "object"
             then [.[] | select(type == "string" and . != "")] | join(",")
             else ""
             end' 2>/dev/null)
REGISTERED_SERVER_COUNT=$(printf '%s' "$ROLLING_RESTART_SETTINGS" \
  | jq -r '(.transient.cluster.routing.allocation.exclude.rolling_restart // {})
           | if type == "object"
             then [.[] | select(type == "string" and . != "")] | length
             else 0
             end' 2>/dev/null)

# Write IPLIST to exclude._ip
echo "Writing IP list to cluster's 'cluster.routing.allocation.exclude._ip' attribute"
if [[ -n "$IPLIST" ]]; then
  curl -s -X PUT "localhost:9200/_cluster/settings" -H 'Content-Type: application/json' \
    -d "{ \"transient\" : { \"cluster.routing.allocation.exclude._ip\" : \"${IPLIST}\" }}" \
    >/dev/null 2>&1
else
  # Nothing to exclude: drop this host's registration before aborting so it
  # does not linger in the 'rolling_restart' attribute.
  cleanup
  exit 123
fi
# Wait for eviction
echo "Waiting for all shared to evict from dest host"
# --fail makes curl exit non-zero on an HTTP error status, so a health wait
# that ends with an error response aborts the run instead of being treated as
# success. The host de-registers first so it does not stay in 'rolling_restart'.
if ! curl -sf -X GET "localhost:9200/_cluster/health?wait_for_status=yellow&wait_for_no_relocating_shards=true&wait_for_no_initializing_shards=true&timeout=300s" >/dev/null 2>&1; then
  cleanup
  exit 123
fi

# Updating new plugin (chef run)
echo "Running chef, could take a while"
# chef-client is invoked twice, as in the original script; presumably the
# second converge picks up changes made by the first - TODO confirm.
chef-client >/dev/null 2>&1
chef-client >/dev/null 2>&1

#### restart elastic
echo "Resarting elasticsearch"
/outbrain/elasticsearch/cpuset-split-cores proc stop >/dev/null 2>&1
sleep 5
/outbrain/elasticsearch/cpuset-split-cores proc start >/dev/null 2>&1

#### check that host is alive, if not, wait.
echo "Verifiying the host returned to cluster"
# Uses the full host name (not the shortened $HOSTNAME) and polls every
# 3 seconds until it shows up in the node listing.
FULL_HOSTNAME=$(hostname)
while true; do
  curl -s 'localhost:9200/_cat/nodes?v' | grep -qF -- "$FULL_HOSTNAME" && break
  sleep 3
done
# This host is back in the cluster: remove its registration (and the stop
# flag, if it raised one).
cleanup

# Check if all the expected servers did de-register
echo "Waiting for all other servers in current batch to complete"
while true; do
  # Number of entries still registered under 'rolling_restart'; a missing
  # attribute counts as 0. An empty result (settings could not be fetched or
  # parsed) also ends the wait, as it did before.
  REGISTERED_SERVER_COUNT=$(curl -s -XGET localhost:9200/_cluster/settings \
    | jq -r '(.transient.cluster.routing.allocation.exclude.rolling_restart // {})
             | if type == "object" then length else 0 end' 2>/dev/null)
  if [[ -z "$REGISTERED_SERVER_COUNT" || "$REGISTERED_SERVER_COUNT" == "0" ]]; then
    break
  fi
  sleep 3
done

# Delete IPLIST from exclude._ip
echo "Removing IP list from cluster's 'cluster.routing.allocation.exclude._ip' attribute"
curl -s -X PUT "localhost:9200/_cluster/settings" -H 'Content-Type: application/json' \
  -d '{ "transient" : { "cluster.routing.allocation.exclude._ip" : null }}' \
  >/dev/null 2>&1

# Wait for shards to return to the host
echo "Waiting for all shared to return to dest host"
# --fail turns an HTTP error status from the health wait into a non-zero exit,
# so the `|| exit 123` actually fires when the wait does not succeed.
curl -sf -X GET "localhost:9200/_cluster/health?wait_for_status=yellow&wait_for_no_relocating_shards=true&wait_for_no_initializing_shards=true&timeout=300s" >/dev/null 2>&1 || exit 123

# Changed to check that the current index is green (to avoid race conditions
# of a restart that runs alongside indexing).
echo "Validating the cluster is green again"
while true; do
  curl -s localhost:9200/_cluster/health/outbrain_read_alias | grep -q '"status":"green"' && break
  sleep 3
done
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment