Skip to content

Instantly share code, notes, and snippets.

@gehel
Created June 29, 2016 10:30
Show Gist options
  • Save gehel/3fd17d61ac8eec0326113a33f04340da to your computer and use it in GitHub Desktop.
Save gehel/3fd17d61ac8eec0326113a33f04340da to your computer and use it in GitHub Desktop.
elasticsearch restart
#!/usr/bin/env bash
set -e

# --- Cluster layout ---------------------------------------------------------
# Node names are built as <prefix><zero-padded index><suffix>.
readonly es_server_prefix=elastic10
readonly es_server_suffix=.eqiad.wmnet
readonly first_server_index=1
readonly nb_of_servers_in_cluster=47

# --- Run marker -------------------------------------------------------------
# Token written to each node once it has been processed. On a re-run, nodes
# that already carry this token are skipped, which makes the script mostly
# idempotent and safe to relaunch unchanged after an error.
# WARN: pick a new value by hand before starting a fresh restart campaign.
readonly execution_id=ZplljWNP9hNobookM8fUhFuivkAKm8w3mVlQawXBg5c4JxzuEPcJcLgtY8ms0Pg
# Rolling restart: handle one Elasticsearch node at a time. For each node:
#   1. skip it if it already carries the current execution id,
#   2. schedule an Icinga downtime for it,
#   3. stop replication, then wait for the operator to reboot the host,
#   4. wait for SSH and then Elasticsearch to answer again,
#   5. re-enable replication and wait until the cluster reports green,
#   6. write the execution id on the node so a re-run skips it.

# ssh joins its arguments with spaces and hands the result to the remote
# shell, so a locally quoted multi-word argument is split again remotely.
# Shell-quote the reason once here so it arrives as a single -r argument.
printf -v downtime_reason '%q' "restarting for config change - ${USER}"

for i in $(seq -w "${first_server_index}" "${nb_of_servers_in_cluster}"); do
  hostname="${es_server_prefix}${i}"
  server="${hostname}${es_server_suffix}"

  # Any non-zero status (marker missing, id not found, ssh failure) is
  # treated as "not restarted yet".
  if ssh "${server}" grep -q "${execution_id}" /var/lib/elasticsearch/script_execution_id; then
    echo "restart already executed on ${hostname}, skipping..."
    continue
  fi

  echo "disabling alerts for ${hostname}"
  # NOTE(review): -d 1800 is presumably the downtime duration in seconds -
  # confirm against icinga-downtime.
  ssh neon.wikimedia.org "sudo icinga-downtime -h ${hostname} -d 1800 -r ${downtime_reason}"

  echo "disabling replication"
  until ssh "${server}" es-tool stop-replication; do
    echo "failed to stop replication, trying again"
    # Pause between attempts so a persistent failure does not spin.
    sleep 1
  done

  echo "ready to start restart ${hostname}"
  # echo "rebooting ${hostname}"
  # ssh neodymium.eqiad.wmnet sudo salt ${server} system.reboot
  # For some reason, rebooting as above does not work, host never completes shutdown
  echo "You can now reboot ${server}"
  echo "Press [enter] when done"
  # The typed line is discarded; -r keeps backslashes from being interpreted.
  read -r _

  echo "waiting for server to be up"
  until ssh "${server}" true &> /dev/null; do
    echo -n .
    sleep 1
  done
  echo "server is up"

  echo "waiting for elasticsearch to be started"
  until ssh "${server}" curl -s 127.0.0.1:9200/_cat/health; do
    echo -n '.'
    sleep 1
  done
  echo "elasticsearch is started"

  echo "enabling replication"
  until ssh "${server}" es-tool start-replication; do
    echo "failed to start replication, trying again"
    # Pause between attempts so a persistent failure does not spin.
    sleep 1
  done

  echo "waiting for cluster recovery"
  # The polling loop runs on the remote node; it exits once the health line
  # contains "green".
  ssh "${server}" "until curl -s 127.0.0.1:9200/_cat/health | grep green; do echo -n .; sleep 10; done"
  echo "cluster is green"

  echo "creating file to keep track of script execution"
  ssh "${server}" "echo ${execution_id} | sudo tee /var/lib/elasticsearch/script_execution_id"

  echo "Done for ${hostname}"
  echo "=============================================="
done
echo "Cluster restart completed"
echo "Cleaning up..."
# Every node has been processed: remove the execution marker from each one.
for i in $(seq -w "${first_server_index}" "${nb_of_servers_in_cluster}"); do
  server="${es_server_prefix}${i}${es_server_suffix}"
  # -f: a marker that is already gone (e.g. after a partially completed
  # cleanup) must not fail the command and, under `set -e`, abort the
  # cleanup of the remaining nodes.
  ssh "${server}" "sudo rm -f /var/lib/elasticsearch/script_execution_id"
done
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment