Forked from rodrigogansobarbieri/recover_machine_agents.py
Created
March 30, 2023 03:46
-
-
Save zhouqt/1a2fea530da5ef913af2b87d7bf1d338 to your computer and use it in GitHub Desktop.
Recover juju machine agents
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
""" | |
This is a tool for recovering lost machine units in current model | |
Usage: | |
{0} model-name dest-dir | |
""" | |
# Summary of changes relative to the original script:
# - avoid overwriting /var/lib/juju, so that non-machine (unit) agents are not removed
# - detect the path of the mongo binary (e.g. present in $PATH or under /usr/lib/juju/mongo*/bin)
# - determine the PRIMARY mongo node and run all queries against it
# - also restore the systemd unit files
# - the controller IP no longer has to be specified; it is set when the mongo PRIMARY is detected
import json | |
import os | |
import shlex | |
import shutil | |
import subprocess | |
import sys | |
import tempfile | |
# Shell pipeline that prints the apipassword stored in a controller machine's
# agent.conf. The single %s is the address used as ubuntu@<address>.
MONGOPASS_CMD = (
    'juju ssh ubuntu@%s '
    '"sudo grep ^apipassword: /var/lib/juju/agents/machine*/agent.conf"'
    " | awk -e '{print $2}'"
)

# Query command used by the helpers below; empty until
# determine_primary_mongo() fills it in for the primary controller.
MONGO_CMD = ""

# Template for running a mongo script on a controller machine. The %s slots
# are, in order: ssh target, mongo binary path, machine number (for the
# machine-N user), password, and the script's file name under /home/ubuntu.
MONGO_TEMPLATE = (
    'juju ssh -m controller ubuntu@%s '
    '"sudo %s --port 37017 --sslAllowInvalidCertificates --ssl '
    '--authenticationDatabase admin -u machine-%s -p %s juju '
    '< /home/ubuntu/%s 2>/dev/null"'
    ' | tail -n 2 | head -n 1'
)
def run(cmd, output=True, shell=True):
    """Echo *cmd*, execute it, and return either its output or its exit code.

    Args:
        cmd: Command to execute (a shell string when *shell* is true).
        output: When true, return the command's stdout, decoded and
            stripped of surrounding whitespace. When false, return the
            command's exit code and discard everything it prints.
        shell: Passed straight through to ``subprocess``.

    Returns:
        ``str`` (stdout) when *output* is true, otherwise ``int`` (exit code).

    Raises:
        subprocess.CalledProcessError: in output mode only, when the command
            exits with a non-zero status.
    """
    print(cmd)
    if output:
        return subprocess.check_output(cmd, shell=shell).decode().strip()
    # Discard the streams instead of piping them: subprocess.call() never
    # reads from a PIPE, so a child that prints more than the OS pipe buffer
    # holds would block forever.
    return subprocess.call(cmd, stdout=subprocess.DEVNULL,
                           stderr=subprocess.DEVNULL, shell=shell)
def update_machine_password(controller, model, machine_number, passwordhash):
    """Set the stored password hash of one machine in the juju mongo database.

    Writes a small mongo script to a local temporary file, copies it to
    /home/ubuntu on the controller, and runs it there through MONGO_CMD
    (which determine_primary_mongo() is expected to have filled in).

    Args:
        controller: Address used as ``ubuntu@<controller>`` for juju scp/ssh.
        model: Value matched against the "model-uuid" field.
        machine_number: Value matched against the "machineid" field.
        passwordhash: New value for the "passwordhash" field.
    """
    file_content = """use juju
db.machines.update({"model-uuid": "%s", "machineid": "%s"}, {$set:{"passwordhash": "%s"}})
""" % (model, machine_number, passwordhash)
    # delete=False keeps the file on disk after it is closed (and therefore
    # flushed), so that juju scp can read it.
    with tempfile.NamedTemporaryFile(mode="w", delete=False) as tmp_file:
        tmp_file.write(file_content)
    try:
        run("juju scp {} ubuntu@{}:/home/ubuntu".format(tmp_file.name, controller))
        password = run(MONGOPASS_CMD % controller)
        run(MONGO_CMD % (controller, password, os.path.basename(tmp_file.name)))
    finally:
        # The script contains a password hash; do not leave it behind locally.
        os.unlink(tmp_file.name)
def get_model_uuid(controller, model):
    """Look up a model by name in the juju mongo database and return its id.

    Args:
        controller: Address used as ``ubuntu@<controller>`` for juju scp/ssh.
        model: Model name matched against the "name" field of db.models.

    Returns:
        The "_id" field of the JSON document printed by the query.

    Raises:
        json.JSONDecodeError: when the query output is not valid JSON.
    """
    file_content = """use juju
db.models.find({"name": "%s"}, {"modeluuid": 1})
""" % (model)
    # delete=False keeps the file on disk after it is closed (and therefore
    # flushed), so that juju scp can read it.
    with tempfile.NamedTemporaryFile(mode="w", delete=False) as tmp_file:
        tmp_file.write(file_content)
    try:
        run("juju scp {} ubuntu@{}:/home/ubuntu".format(tmp_file.name, controller))
        password = run(MONGOPASS_CMD % controller)
        uuid_json = run(MONGO_CMD % (controller, password, os.path.basename(
            tmp_file.name)))
    finally:
        # The local copy is no longer needed once the query has run.
        os.unlink(tmp_file.name)
    return json.loads(uuid_json)['_id']
def determine_primary_mongo():
    """Find the primary mongo node and configure MONGO_CMD to target it.

    Asks the first "started" machine of the controller model for
    ``rs.isMaster()['primary']``, maps the reported address back to a
    controller machine number, and rebuilds the module-level MONGO_CMD
    template with the detected mongo binary and that machine number.

    Returns:
        The host part (text before the first ':') of the primary's address.
    """
    global MONGO_CMD
    file_content = """use juju
rs.isMaster()['primary']
"""
    # delete=False keeps the file on disk after it is closed (and therefore
    # flushed), so that juju scp can read it.
    with tempfile.NamedTemporaryFile(mode="w", delete=False) as tmp_file:
        tmp_file.write(file_content)
    try:
        first_controller_num = run('juju machines -m controller |grep started|head -n1|cut -d " " -f1')
        mongo_cmd_tuple = gen_mongo_cmd(first_controller_num)
        temp_mongo_cmd = mongo_cmd_tuple[0] % os.path.basename(tmp_file.name)
        run("juju scp -m controller {} ubuntu@{}:/home/ubuntu".format(tmp_file.name, first_controller_num))
        primary_ip = run(temp_mongo_cmd).split(':')[0]
    finally:
        # The local copy is no longer needed once the query has run.
        os.unlink(tmp_file.name)
    # Now that the primary is known, keep the old MONGO_CMD shape
    # (address, password, script name are filled in by the callers) but
    # hard-code the binary path and the primary's machine number.
    primary_controller_num = run('juju machines -m controller |grep {}|head -n1|cut -d " " -f1'.format(primary_ip))
    MONGO_CMD = (
        "juju ssh ubuntu@%s \"sudo {} --port 37017 --sslAllowInvalidCertificates "
        "--ssl --authenticationDatabase admin -u machine-{} -p %s juju < "
        "/home/ubuntu/%s 2>/dev/null\" | tail -n 2 | head -n 1"
    ).format(mongo_cmd_tuple[1], primary_controller_num)
    return primary_ip
def gen_mongo_cmd(controller_num):
    """Build a mongo query command for one controller machine.

    Args:
        controller_num: Machine number in the controller model.

    Returns:
        A ``(command, binary_path)`` tuple. ``command`` is MONGO_TEMPLATE
        with everything filled in except the script file name, which is left
        as a single ``%s`` placeholder. ``binary_path`` is the mongo
        executable taken from the process list on that machine.
    """
    # Column 11 of `ps aux` is the command. The rev/cut/rev trick drops the
    # last character of the line (presumably a trailing carriage return
    # added by the ssh session - TODO confirm).
    find_binary = '''juju ssh -m controller %s "ps aux|grep mongo"|grep -v grep|awk '{print $11}'|head -n1 | rev | cut -c 2- | rev''' % controller_num
    mongo_binary = run(find_binary)

    read_password = "juju ssh -m controller %s 'sudo grep ^apipassword: /var/lib/juju/agents/machine*/agent.conf' | awk -e '{print $2}'" % controller_num
    api_password = run(read_password)

    # The trailing '%s' survives the substitution so the caller can insert
    # the script name later.
    command = MONGO_TEMPLATE % (
        controller_num, mongo_binary, controller_num, api_password, '%s')
    return (command, mongo_binary)
def get_donor_password(controller, donor, model_uuid):
    """Read the stored password hash of the donor machine from mongo.

    Args:
        controller: Address used as ``ubuntu@<controller>`` for juju scp/ssh.
        donor: Value matched against the "machineid" field.
        model_uuid: Value matched against the "model-uuid" field.

    Returns:
        The "passwordhash" field of the JSON document printed by the query.

    Raises:
        json.JSONDecodeError: when the query output is not valid JSON.
    """
    file_content = """use juju
db.machines.find({"model-uuid": "%s", "machineid": "%s"}, {"passwordhash": 1})
""" % (model_uuid, donor)
    # delete=False keeps the file on disk after it is closed (and therefore
    # flushed), so that juju scp can read it.
    with tempfile.NamedTemporaryFile(mode="w", delete=False) as tmp_file:
        tmp_file.write(file_content)
    try:
        run("juju scp {} ubuntu@{}:/home/ubuntu".format(tmp_file.name, controller))
        password = run(MONGOPASS_CMD % controller)
        attributes_json = run(MONGO_CMD % (controller, password, os.path.basename(
            tmp_file.name)))
    finally:
        # The local copy is no longer needed once the query has run.
        os.unlink(tmp_file.name)
    attributes = json.loads(attributes_json)
    return attributes['passwordhash']
def get_machine_nonce(controller, machine, model_uuid):
    """Read the nonce recorded for a machine in the juju mongo database.

    Args:
        controller: Address used as ``ubuntu@<controller>`` for juju scp/ssh.
        machine: Value matched against the "machineid" field.
        model_uuid: Value matched against the "model-uuid" field.

    Returns:
        The "nonce" field of the JSON document printed by the query.

    Raises:
        json.JSONDecodeError: when the query output is not valid JSON.
    """
    file_content = """use juju
db.machines.find({"model-uuid": "%s", "machineid": "%s"}, {"nonce": 1})
""" % (model_uuid, machine)
    # delete=False keeps the file on disk after it is closed (and therefore
    # flushed), so that juju scp can read it.
    with tempfile.NamedTemporaryFile(mode="w", delete=False) as tmp_file:
        tmp_file.write(file_content)
    print('get machine nonce reached...')
    try:
        run("juju scp {} ubuntu@{}:/home/ubuntu".format(tmp_file.name, controller))
        password = run(MONGOPASS_CMD % controller)
        attributes_json = run(MONGO_CMD % (controller, password, os.path.basename(
            tmp_file.name)))
    finally:
        # The local copy is no longer needed once the query has run.
        os.unlink(tmp_file.name)
    attributes = json.loads(attributes_json)
    return attributes['nonce']
def recover_machine(machine_number, juju_tar, donor, passwordhash, controller, model, systemd_tar):
    """Rebuild the machine agent of one machine from the donor's files.

    Steps, in order: fetch the machine's nonce from mongo, prepare
    /var/lib/juju, unpack the donor's juju tarball, rename the donor's agent
    and tools directories, rewrite agent.conf, restore and rewrite the
    systemd unit files, store the donor's password hash for this machine in
    mongo, relink the tools, then enable and restart the machine agent.

    Args:
        machine_number: Machine id to recover (e.g. "3" or "0/lxd/1").
        juju_tar: Local path of the donor's /var/lib/juju tarball.
        donor: Machine id of the donor whose files are copied.
        passwordhash: Donor's password hash, written to mongo for this machine.
        controller: Address of the controller used for mongo queries.
        model: Model identifier matched against "model-uuid" in mongo.
        systemd_tar: Local path of the donor's systemd unit tarball.
    """
    print("Recovering machine {}".format(machine_number))
    nonce = get_machine_nonce(controller, machine_number, model)
    check_juju_dir_or_create(machine_number)

    # LXD container ids contain '/', which cannot appear in directory or
    # unit names, so they are written with '-' there. The sed expressions
    # for containers use '|' as delimiter because the replacement text may
    # contain '/'.
    is_lxd = 'lxd' in machine_number
    tag = machine_number.replace('/', '-') if is_lxd else machine_number
    delim = '|' if is_lxd else '/'

    def on_target(remote_cmd):
        # Run one command on the machine being recovered.
        run("juju ssh {} {}".format(machine_number, remote_cmd))

    def sed_replace(old, new, path):
        # Replace every occurrence of *old* with *new* in a remote file.
        on_target("'sudo sed -i \"s{d}{}{d}{}{d}g\" {}'".format(
            old, new, path, d=delim))

    # Unpack the donor's files and rename them for this machine.
    run("juju scp {} {}:/home/ubuntu/juju.tar".format(juju_tar, machine_number))
    on_target("'sudo tar -xvf /home/ubuntu/juju.tar -C /var/lib --skip-old-files --keep-directory-symlink --dereference'")
    for subdir in ("agents", "tools"):
        on_target("'sudo mv -f /var/lib/juju/{0}/machine-{1} /var/lib/juju/{0}/machine-{2}'".format(
            subdir, donor, tag))
    on_target("'echo {} | sudo tee /var/lib/juju/nonce.txt'".format(nonce))

    # Update files
    agent_file = "/var/lib/juju/agents/machine-{}/agent.conf".format(tag)
    sed_replace("tag: machine-{}".format(donor),
                "tag: machine-{}".format(tag), agent_file)
    sed_replace("jujud-machine-{}".format(donor),
                "jujud-machine-{}".format(tag), agent_file)
    # The nonce substitution always uses '/' as delimiter.
    on_target("'sudo sed -i \"s/nonce: .*/nonce: {}/g\" {}'".format(nonce, agent_file))

    # Restore unit files
    run("juju scp {} {}:/home/ubuntu/systemd.tar".format(systemd_tar, machine_number))
    on_target("'sudo rm -rf /lib/systemd/system/jujud-machine-{}'".format(tag))
    on_target("'sudo tar -xvf /home/ubuntu/systemd.tar -C / --skip-old-files'")
    on_target("'sudo cp -nrp /lib/systemd/system/jujud-machine-{} /lib/systemd/system/jujud-machine-{}'".format(
        donor, tag))
    on_target("'sudo cp -p /lib/systemd/system/jujud-machine-{0}/jujud-machine-{0}.service /lib/systemd/system/jujud-machine-{1}/jujud-machine-{1}.service'".format(
        donor, tag))
    exec_start_file = "/lib/systemd/system/jujud-machine-{}/exec-start.sh".format(tag)
    jujud_unit_file = "/lib/systemd/system/jujud-machine-{0}/jujud-machine-{0}.service".format(tag)
    sed_replace("machine-{}".format(donor), "machine-{}".format(tag), exec_start_file)
    # --machine-id takes the id in its original form (with '/').
    sed_replace("--machine-id {}".format(donor),
                "--machine-id {}".format(machine_number), exec_start_file)
    sed_replace("machine-{}".format(donor), "machine-{}".format(tag), jujud_unit_file)

    # Update mongo
    update_machine_password(controller, model, machine_number, passwordhash)

    # Restart services
    link_to = determine_juju_version(machine_number)
    on_target("'for u in $(sudo ls /var/lib/juju/agents/|sort); do sudo ln -sf /var/lib/juju/tools/{} /var/lib/juju/tools/$u; done'".format(link_to))
    on_target("sudo systemctl daemon-reload")
    on_target("sudo systemctl enable /lib/systemd/system/jujud-machine-{0}/jujud-machine-{0}.service".format(tag))
    on_target("sudo systemctl restart jujud-machine-{}".format(tag))
def check_juju_dir_or_create(machine_num):
    """Make sure /var/lib/juju exists and holds no stale machine agent dir.

    If /var/lib/juju is missing on the machine it is created. Otherwise an
    existing /var/lib/juju/agents/machine-<id> directory is deleted, so the
    later ``mv`` of the donor's directory does not fail on a non-empty
    target.

    Args:
        machine_num: Machine id (e.g. "3" or "0/lxd/1").
    """
    probe_juju_dir = "juju ssh ubuntu@{} 'sudo test -d /var/lib/juju'".format(machine_num)
    if run(probe_juju_dir, output=False) != 0:
        run("juju ssh {} 'sudo mkdir /var/lib/juju'".format(machine_num))
        return

    # LXD container ids are written with '-' instead of '/' in directory names.
    if 'lxd' in machine_num:
        tag = machine_num.replace('/', '-')
    else:
        tag = machine_num
    agent_dir = "/var/lib/juju/agents/machine-{}".format(tag)

    probe_agent_dir = "juju ssh ubuntu@{} 'sudo test -d {}'".format(machine_num, agent_dir)
    if run(probe_agent_dir, output=False) == 0:
        remove_agent_dir = "juju ssh ubuntu@{} 'sudo rm -rf {}'".format(machine_num, agent_dir)
        run(remove_agent_dir, output=False)
def get_agent_from_donor(donor, destdir):
    """Fetch the donor's /var/lib/juju and reduce it to the machine agent.

    The directory is tarred on the donor, copied into *destdir*, unpacked,
    stripped of unit-specific and per-machine files, and repacked as
    ``<destdir>/juju.tar``. The donor's systemd units are fetched afterwards.

    Args:
        donor: Machine id of the healthy donor machine.
        destdir: Local working directory that receives the tarballs.
    """
    print ("Getting agent from machine {}".format(donor))

    # Get dir from donor
    run("juju ssh {} 'sudo chmod -R a+r /var/lib/juju'".format(donor))
    run("juju ssh {} 'sudo tar -C /var/lib/ -cvf /tmp/juju.tar juju'".format(donor))
    run("juju scp {}:/tmp/juju.tar {}".format(donor, destdir))
    run("tar -xvf {}/juju.tar -C {}".format(destdir, destdir))

    # Remove everything except the machine agent, plus the downloaded
    # tarball, which is rebuilt below.
    discard = (
        "juju/agents/unit*",
        "juju/meter-status.yaml",
        "juju/locks/*",
        "juju/tools/unit*",
        "juju/metricspool",
        "juju/nonce.txt",
        "juju.tar",
    )
    for relative_path in discard:
        run("rm -rf {}/{}".format(destdir, relative_path))

    run("tar -cvf {}/juju.tar -C {} juju".format(destdir, destdir))
    run("rm -rf {}/juju".format(destdir))
    get_donor_systemd_units(donor, destdir)
def get_donor_systemd_units(donor, destdir):
    """Tar the donor's jujud-machine systemd directory and copy it locally.

    Args:
        donor: Machine id of the donor machine.
        destdir: Local directory that receives ``systemd.tar``.
    """
    pack_units = "juju ssh {} 'sudo tar -cvf /tmp/systemd.tar /lib/systemd/system/jujud-machine-{}'".format(donor, donor)
    fetch_units = "juju scp {}:/tmp/systemd.tar {}".format(donor, destdir)
    for command in (pack_units, fetch_units):
        run(command)
def determine_juju_version(machine_number):
    """Return the first tools entry on the machine whose name starts with 2.

    Lists /var/lib/juju/tools on the machine, keeps names matching ``^2.``,
    sorts them numerically and returns the first one.

    Args:
        machine_number: Machine id to inspect.
    """
    list_tools = "juju ssh {} 'ls /var/lib/juju/tools |egrep ^2.|sort -n|head -n1'".format(machine_number)
    tools_version = run(list_tools, output=True)
    return tools_version
def parse_machines_to_recover(status_json):
    """Collect the machines and units that need recovery from a juju status.

    A unit is selected when its juju-status is "lost". A machine is selected
    when it hosts such a unit, or when its own juju-status is "down".

    Args:
        status_json: Parsed ``juju status --format json`` output; must have
            "applications" and "machines" mappings.

    Returns:
        ``(machines, units)``: two lists without duplicates, in discovery
        order (machines of lost units first, then "down" machines).
    """
    machines = []
    units = []
    for app_data in status_json['applications'].values():
        # Application entries are not guaranteed to carry a "units" key;
        # treat a missing or empty one as "no units" rather than failing.
        for unit_name, unit_data in (app_data.get('units') or {}).items():
            if unit_data['juju-status']['current'] != "lost":
                continue
            units.append(unit_name)
            machine = unit_data.get('machine')
            if machine is not None and machine not in machines:
                machines.append(machine)
    for machine_number, machine_data in status_json['machines'].items():
        is_down = machine_data['juju-status']['current'] == "down"
        if is_down and machine_number not in machines:
            machines.append(machine_number)
    return machines, units
def parse_donor(status_json):
    """Pick a healthy machine whose agent files can be copied to others.

    Candidates are the machines whose juju-status is "started"; any machine
    hosting a unit whose juju-status is "lost" is removed from the list.

    Args:
        status_json: Parsed ``juju status --format json`` output; must have
            "applications" and "machines" mappings.

    Returns:
        The first remaining candidate machine id.

    Raises:
        Exception: when no candidate is left.
    """
    candidates = [
        machine_number
        for machine_number, machine_data in status_json['machines'].items()
        if machine_data['juju-status']['current'] == "started"
    ]
    for app_data in status_json['applications'].values():
        # Application entries are not guaranteed to carry a "units" key;
        # treat a missing or empty one as "no units" rather than failing.
        for unit_data in (app_data.get('units') or {}).values():
            if unit_data['juju-status']['current'] != "lost":
                continue
            machine = unit_data.get('machine')
            if machine in candidates:
                candidates.remove(machine)
    if not candidates:
        raise Exception("No donor candidates found")
    return candidates[0]
def disable_units(units):
    """Stop and disable the systemd services of the given juju units.

    For each unit, the unit's machine is asked for systemd entries matching
    the unit name (with '/' replaced by '-'). Units without a match are
    skipped. The service is stopped when the listing mentions "active", and
    is disabled in either case.

    Args:
        units: Iterable of unit names such as "app/0".
    """
    for unit in units:
        service = unit.replace('/', '-')
        # `|| true` keeps run() from raising when grep finds nothing.
        listing = run("juju ssh {} 'sudo systemctl -a | grep {}' || true".format(unit, service))
        if not listing:
            continue
        if 'active' in listing:
            run("juju ssh {} 'sudo systemctl stop jujud-unit-{}.service'".format(unit, service))
        run("juju ssh {} 'sudo systemctl disable jujud-unit-{}.service'".format(unit, service))
def main():
    """Recover every lost or down machine agent in the given model.

    Reads the model name and working directory from ``sys.argv[1:3]``. The
    working directory is deleted and recreated. A healthy donor machine is
    chosen, its agent files and password hash are fetched, and every machine
    reported by parse_machines_to_recover() is rebuilt from them.
    """
    model, destdir = sys.argv[1], sys.argv[2]

    def fetch_status():
        # Current model status as a parsed JSON document.
        return json.loads(run("juju status --format json"))

    print("Determining the Primary MongoDB unit")
    controller = determine_primary_mongo()

    # Start from an empty working directory.
    if os.path.exists(destdir):
        shutil.rmtree(destdir)
    os.mkdir(destdir)

    run("juju switch {}".format(model))
    model_uuid = get_model_uuid(controller, model)
    print(model_uuid)

    # Query first healthy unit to select as donor
    print("Attempting to find a healthy machine donor")
    donor = parse_donor(fetch_status())
    passwordhash = get_donor_password(controller, donor, model_uuid)
    print(passwordhash)
    get_agent_from_donor(donor, destdir)

    # The status is read a second time before the lost machines are listed.
    print("Getting list of machines to recover")
    machines, units = parse_machines_to_recover(fetch_status())
    disable_units(units)
    print("Machines to be recovered: {}".format(machines))

    juju_tar = "{}/juju.tar".format(destdir)
    systemd_tar = destdir + "/systemd.tar"
    for machine in machines:
        recover_machine(machine, juju_tar, donor, passwordhash, controller,
                        model_uuid, systemd_tar)
def test():
    """Manual check: recreate the work dir and print the primary mongo host.

    Reads the model name, a controller value (unused) and the working
    directory from ``sys.argv[1:4]``.
    """
    model_name = sys.argv[1]
    controller = sys.argv[2]  # read but not used below
    work_dir = sys.argv[3]

    # Start from an empty working directory.
    if os.path.exists(work_dir):
        shutil.rmtree(work_dir)
    os.mkdir(work_dir)

    run("juju switch {}".format(model_name))
    primary = determine_primary_mongo()
    print(primary)
if __name__ == "__main__":
    # main() reads both sys.argv[1] (model-name) and sys.argv[2] (dest-dir),
    # so print the usage text unless both are present, rather than crashing
    # with an IndexError when only one is given.
    if len(sys.argv) < 3:
        print(__doc__.format(sys.argv[0]))
        sys.exit(-1)
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment