- Ansible is run interactively by me under my own credentials.
- It cannot do anything beyond what my user can already do over SSH and sudo.
- It is used for:
  - Managing configuration files (for example `/etc/security/access.conf`)
  - Installing or updating packages
  - Restarting services
  - Other routine administrative tasks
- Playbooks are stored in my home directory, only accessible by me and root.
- Playbooks are also stored in Git, with pull requests for review.
- Change tickets are raised before work is done, then I run the tasks using my own access.
- There is no separate privileged automation account; actions are logged as my user in system logs.
- Only I have access to the Ansible directories in my home folder (`700` permissions).
- Playbooks are version controlled in Git.
- Consistency across environments comes from using the same playbooks and inventories.
- Logic (such as managing `/etc/security/access.conf`) is kept in a playbook under `roles/<role_name>/tasks/` (see the sketch after this list).
- Edits to playbooks go through Git pull requests for review.
- Change tickets are raised and tracked before execution.
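As an illustration, a minimal task of this kind might look like the following sketch (the role name and file names are placeholders, not the production playbook):

```yaml
# roles/<role_name>/tasks/main.yml -- illustrative sketch only
- name: Deploy /etc/security/access.conf from template
  ansible.builtin.template:
    src: access.conf.j2                 # rendered from roles/<role_name>/templates/
    dest: /etc/security/access.conf
    owner: root
    group: root
    mode: "0644"
    backup: true                        # keep a copy of the previous file
  become: true                          # privilege comes from my own sudo access
```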
Because Ansible runs as my user, normal system logs apply.
## SSH Logins
sudo grep sshd /var/log/secure
## Sudo Commands
These show timestamp, user, and commands run with sudo:
sudo ausearch -m USER_CMD -ts today
sudo grep COMMAND /var/log/secure
## List of Playbooks in Roles
List every task file under your roles:
find roles -type f -path "roles/*/tasks/*.yml"
find roles -maxdepth 1 -mindepth 1 -type d -printf '%f\n'
### Show template or task in playbooks:
cat roles/<role_name>/templates/access.conf.j2
grep -A3 -B3 access.conf roles/<role_name>/tasks/main.yml
## Versioning and Approval Records
- Git pull requests and commit history serve as version history and approval.
- Change management tickets exist in our ticketing system.
- There is no separate Ansible approval workflow beyond that.
## Key Talking Points
- Ansible does not bypass controls; it uses my user’s access.
- All actions are logged through normal SSH and sudo mechanisms.
- Playbooks live in a directory only I and root can access.
- Playbooks are also stored and tracked in Git, with pull request reviews.
- Configuration consistency is maintained by using the same playbooks across environments.
- Example configuration (`/etc/security/access.conf`) can be shown live.
---
#!/bin/bash
# MySQL backup to an NFS mount, only if the mount is writable
DB_NAME="your_database_name"
NFS_MOUNT="/mnt/nfs_backup"
BACKUP_DIR="$NFS_MOUNT/mysql_backups"
DATE=$(date +%F_%H-%M-%S)
BACKUP_FILE="$BACKUP_DIR/${DB_NAME}_$DATE.sql"
LOGFILE="/var/log/mysql_backup.log"
# Test if NFS mount is writable
TESTFILE="$NFS_MOUNT/.rw_test_$$"
if touch "$TESTFILE" 2>/dev/null; then
# Remove test file
rm -f "$TESTFILE"
echo "$(date) NFS is writable, starting backup to $BACKUP_FILE" >> "$LOGFILE"
# Ensure backup directory exists
mkdir -p "$BACKUP_DIR"
# Run mysqldump using credentials from /root/.my.cnf
mysqldump "$DB_NAME" > "$BACKUP_FILE"
if [ $? -eq 0 ]; then
echo "$(date) Backup successful: $BACKUP_FILE" >> "$LOGFILE"
# Optional: compress to save space
gzip "$BACKUP_FILE"
else
echo "$(date) Backup FAILED for $DB_NAME" >> "$LOGFILE"
rm -f "$BACKUP_FILE"
fi
# Optional: clean up old backups (older than 7 days)
find "$BACKUP_DIR" -type f -name "${DB_NAME}_*.sql.gz" -mtime +7 -exec rm -f {} \;
else
echo "$(date) NFS mount is read-only, backup skipped" >> "$LOGFILE"
fi
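If this is scheduled, a cron entry along these lines could be used; the script path and schedule below are assumptions, not the deployed values:

```bash
# /etc/cron.d/mysql_backup -- illustrative only
30 2 * * * root /usr/local/sbin/mysql_backup.sh
```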
---
# save as setup_test01_known_hosts.yml
- name: Create test01 and preload known_hosts with ed25519 keys
hosts: servers
gather_facts: false
become: true
vars:
user_name: test01
peers: "{{ groups['servers'] | default(ansible_play_hosts_all) }}"
tasks:
- name: Ensure user exists with a home directory
ansible.builtin.user:
name: "{{ user_name }}"
state: present
shell: /bin/bash
create_home: true
- name: Look up home directory for test01
ansible.builtin.getent:
database: passwd
key: "{{ user_name }}"
- name: Set home path fact
ansible.builtin.set_fact:
user_home: "{{ getent_passwd[user_name].dir | default('/home/' ~ user_name) }}"
- name: Ensure .ssh exists
ansible.builtin.file:
path: "{{ user_home }}/.ssh"
state: directory
owner: "{{ user_name }}"
group: "{{ user_name }}"
mode: "0700"
- name: Ensure known_hosts file exists with correct ownership
ansible.builtin.file:
path: "{{ user_home }}/.ssh/known_hosts"
state: touch
owner: "{{ user_name }}"
group: "{{ user_name }}"
mode: "0644"
- name: Add peers to known_hosts for test01 using ed25519 only
become_user: "{{ user_name }}"
vars:
peer_host: "{{ hostvars[item].ansible_host | default(item) }}"
ansible.builtin.known_hosts:
path: "{{ user_home }}/.ssh/known_hosts"
name: "{{ item }}"
key: "{{ lookup('pipe', 'ssh-keyscan -T 5 -t ed25519 ' ~ peer_host) }}"
state: present
hash_host: false
mode: "0644"
loop: "{{ peers }}"
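A minimal run, assuming an inventory file that defines the `servers` group used by the play:

```bash
ansible-playbook -i inventory setup_test01_known_hosts.yml
```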
PRE:
#!/usr/bin/env bash
set -euo pipefail
MYSQL_SVC="mysqld"
REQUIRED_COUNT=3
sql() { mysqlsh --sql -e "$1"; }
sql_last() { sql "$1" | tail -n1; }
echo "checking cluster health"
status=$(sql_last "SELECT CASE WHEN COUNT(*)=${REQUIRED_COUNT} AND SUM(MEMBER_STATE='ONLINE')=${REQUIRED_COUNT} THEN 'OK' ELSE 'NOT_OK' END FROM performance_schema.replication_group_members;")
count=$(sql_last "SELECT COUNT(*) FROM performance_schema.replication_group_members;")
bad_count=$(sql_last "SELECT COALESCE(SUM(MEMBER_STATE!='ONLINE'),0) FROM performance_schema.replication_group_members;")
if [[ "$status" != "OK" ]]; then echo "cluster status not OK, got $status" >&2; exit 2; fi
if [[ "$count" != "$REQUIRED_COUNT" ]]; then echo "cluster count not $REQUIRED_COUNT, got $count" >&2; exit 3; fi
if [[ "$bad_count" != "0" ]]; then echo "some members not ONLINE, bad_count=$bad_count" >&2; exit 4; fi
this_is_primary=$(sql_last "SELECT CASE WHEN EXISTS(SELECT 1 FROM performance_schema.replication_group_members WHERE MEMBER_ID = @@server_uuid AND MEMBER_ROLE='PRIMARY') THEN 1 ELSE 0 END;")
if [[ "$this_is_primary" == "1" ]]; then
echo "this node is PRIMARY, promoting a secondary"
target=$(sql "SELECT MEMBER_HOST FROM performance_schema.replication_group_members WHERE MEMBER_ROLE!='PRIMARY' ORDER BY MEMBER_HOST LIMIT 1;" | tail -n1)
if [[ -z "$target" ]]; then echo "no secondary found to promote" >&2; exit 5; fi
mysqlsh --js -e "var c = dba.getCluster(); c.setPrimaryInstance('${target}')"
newp=$(sql_last "SELECT MEMBER_HOST FROM performance_schema.replication_group_members WHERE MEMBER_ROLE='PRIMARY';")
if [[ "$newp" != "$target" ]]; then echo "primary promotion failed, expected $target, got $newp" >&2; exit 6; fi
echo "primary is now $newp"
fi
echo "stopping mysql service for patch and reboot"
sudo systemctl stop "$MYSQL_SVC"
exit 0
POST:
#!/usr/bin/env bash
set -euo pipefail
MYSQL_SVC="mysql"
REQUIRED_COUNT=3
MYSQL_TIMEOUT=900
SLEEP=5
sql() { mysqlsh --sql -e "$1"; }
sql_last() { sql "$1" | tail -n1; }
echo "ensuring mysql service is running"
if ! systemctl is-active --quiet "$MYSQL_SVC"; then
sudo systemctl start "$MYSQL_SVC"
fi
echo "waiting for this node to report ONLINE"
t=0
is_online="NO"
while (( t < MYSQL_TIMEOUT )); do
is_online=$(sql_last "SELECT CASE WHEN EXISTS(SELECT 1 FROM performance_schema.replication_group_members WHERE MEMBER_ID = @@server_uuid AND MEMBER_STATE='ONLINE') THEN 'YES' ELSE 'NO' END;") || true
if [[ "$is_online" == "YES" ]]; then break; fi
sleep "$SLEEP"; t=$((t+SLEEP))
done
if [[ "$is_online" != "YES" ]]; then echo "this node did not reach ONLINE" >&2; exit 10; fi
echo "waiting for full cluster health"
t=0
status="NOT_OK"
while (( t < MYSQL_TIMEOUT )); do
status=$(sql_last "SELECT CASE WHEN COUNT(*)=${REQUIRED_COUNT} AND SUM(MEMBER_STATE='ONLINE')=${REQUIRED_COUNT} THEN 'OK' ELSE 'NOT_OK' END FROM performance_schema.replication_group_members;")
count=$(sql_last "SELECT COUNT(*) FROM performance_schema.replication_group_members;")
bad_count=$(sql_last "SELECT COALESCE(SUM(MEMBER_STATE!='ONLINE'),0) FROM performance_schema.replication_group_members;")
if [[ "$status" == "OK" ]] && [[ "$count" == "$REQUIRED_COUNT" ]] && [[ "$bad_count" == "0" ]]; then
break
fi
sleep "$SLEEP"; t=$((t+SLEEP))
done
if [[ "$status" != "OK" ]]; then echo "cluster status not OK after reboot, got $status" >&2; exit 11; fi
if [[ "$count" != "$REQUIRED_COUNT" ]]; then echo "cluster member count not $REQUIRED_COUNT after reboot, got $count" >&2; exit 12; fi
if [[ "$bad_count" != "0" ]]; then echo "members not ONLINE after reboot, bad_count=$bad_count" >&2; exit 13; fi
echo "checking recovery queues on this node"
in_queue=$(sql_last "SELECT COALESCE(COUNT_TRANSACTIONS_IN_QUEUE,0) FROM performance_schema.replication_group_member_stats WHERE MEMBER_ID = @@server_uuid;")
if [[ "$in_queue" != "0" ]]; then echo "transactions still in queue, got $in_queue" >&2; exit 14; fi
echo "post checks passed, node healthy"
exit 0
Notes and small choices
- You can tune the timeouts (for example 15 minutes for rejoin) and the sleep interval.
- The service name might be `mysqld` on RHEL 9; adjust `MYSQL_SVC` accordingly.
- Promotion picks the first secondary by host sort; you can constrain it to a specific DC if needed (see the sketch after this list).
- If you want minimal console noise for BigFix parsing, remove the echo lines or prefix them with a simple tag.
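For example, to prefer a secondary in a particular DC, the selection query in the PRE script could be restricted by hostname pattern; the `ukdc1-%` pattern below is an assumption about the naming convention:

```bash
# Illustrative only: promote a secondary whose hostname matches the preferred DC
target=$(sql "SELECT MEMBER_HOST FROM performance_schema.replication_group_members WHERE MEMBER_ROLE!='PRIMARY' AND MEMBER_HOST LIKE 'ukdc1-%' ORDER BY MEMBER_HOST LIMIT 1;" | tail -n1)
```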
This pair of scripts coordinates safe OS patching of Morpheus application nodes across two data centers.
They support active site with RW NFS and passive site with RO or no NFS.
Both scripts are BigFix friendly: exit code 0 = continue, any non-zero = stop and investigate.
- PRE script: proves it is safe to patch, performs health checks, enforces the correct Morpheus UI state, acquires a lock on the active site, then stops the local UI when safe.
- POST script: after reboot, waits for Morpheus UI to come up on the active site with polling, validates the whole tier, then releases the lock.
Default log file: /data/morpheus_patching/logs/morpheus_app_patch.log
Every run begins with a separator line and ends with a summary line showing exit code and whether TESTMODE was used.
- DC detection: by hostname pattern (e.g. `ukdc1` or `ukdc2`), with subnet fallback (`10.10` = dc1, `10.11` = dc2).
- Site role detection:
  - active if the NFS path is mounted and writable.
  - passive if the NFS path is RO or not mounted.
- Morpheus UI backend probes:
  HTTPS with redirect follow, per-host logging; requires HTTP `200` or `302`, and the body must contain the marker text `Morpheus`.
  If only short names are used (e.g. `server1`), set `UI_DOMAIN=example.com` to probe `https://server1.example.com/` (a manual equivalent is sketched after this list).
- RabbitMQ cluster check:
  Parses `rabbitmqctl cluster_status --formatter json`. Verifies all expected nodes for this DC are running and partitions = 0.
  Configure `RBQ_NODE_PREFIX=rabbit` or `rabbitmq`, and `RBQ_NAME_MODE=short` or `fqdn`.
- Elasticsearch health:
  GET `/_cluster/health`. Accepts `green` or `yellow`.
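For reference, a manual probe equivalent to what the scripts do (hostname is a placeholder):

```bash
# Follow redirects, capture the final HTTP code, then check the body for the marker text
curl -k -s -L -o /tmp/ui_body -w '%{http_code}\n' https://server1.example.com/
grep -qi "Morpheus" /tmp/ui_body && echo "marker present"
```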
Goal: confirm safe to patch this node, then stop the UI when appropriate.
- Detect DC and build expected peer lists.
- Detect site role (active or passive).
- Active site path:
  - Add jitter (2–10s) to reduce simultaneous starts.
  - Probe all UI backends, require every peer OK.
  - Acquire lock directory on NFS (`.patch_active_lock`). Stop if the lock already exists.
  - Verify at least one other peer is reachable on TCP 443 or 80.
  - Check RabbitMQ and Elasticsearch.
  - Stop local `morpheus-ui` and verify it is down.
- Passive site path:
  - Enforce `morpheus-ui` is stopped.
  - Check RabbitMQ and Elasticsearch.
- Exit `0` if all checks pass.
Goal: after reboot, bring the node up cleanly and validate the tier, then release the lock.
- Detect DC and role again.
- Active site path:
  - Start `morpheus-ui` if not running.
  - Wait for systemd to report running (poll every 30s, timeout 15m).
  - Wait for local TCP on the Morpheus URL (poll every 30s, timeout 15m).
  - Wait for local HTTP readiness with redirects and marker (poll every 30s, timeout 15m). Logs code and marker each attempt.
  - Probe all UI backends, require every peer OK.
  - Check RabbitMQ and Elasticsearch.
  - Release the NFS lock (only if owned by this host).
- Passive site path:
  - Enforce `morpheus-ui` is stopped.
  - Check RabbitMQ and Elasticsearch.
- Exit `0` if all checks pass.
- Run PRE on the target node (the sequence is sketched below).
  - Exit `0`: continue with OS patch + reboot.
  - Exit non-zero: stop and investigate.
- Patch and reboot.
- Run POST on the same node.
  - Exit `0`: node healthy, move to the next node.
  - Exit non-zero: stop and investigate.
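As an illustration only (BigFix drives this in practice), the per-node sequence amounts to:

```bash
#!/usr/bin/env bash
# Illustrative per-node flow; the patch step itself is a placeholder handled by BigFix
set -euo pipefail
/usr/local/sbin/morph-app-pre.sh     # any non-zero exit stops the flow here
# ... BigFix applies OS patches and reboots the node ...
/usr/local/sbin/morph-app-post.sh    # node is considered healthy only if this exits 0
```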
- All-backends check before taking down a node.
- NFS lock ensures only one active node patches at a time.
- Jitter reduces race conditions.
- Passive site enforcement (UI must be stopped, cluster checks still run).
- Robust POST polling (up to 15m for service, TCP, HTTP readiness).
Common examples. Summary line always logs the exit code.
- `0`: success.
- `5`, `33`: UI backends not all healthy.
- `6`: could not acquire NFS lock.
- `7`: no peer UI reachable.
- `8`, `34`: RabbitMQ check failed.
- `9`, `35`: Elasticsearch check failed.
- `10`, `40`: Morpheus UI could not be stopped.
- `30`: service did not report running within timeout.
- `31`: local TCP not reachable within timeout.
- `32`: local HTTP not ready within timeout.
- `20`–`21`: DC detection error.
- `NFS_PATH`: shared path used for role detection and the lock directory.
- `LOCK_DIR_PATH`: default `NFS_PATH/.patch_active_lock`.
- `APP_NODES_DC1`, `APP_NODES_DC2`: host lists per DC.
- `RMQ_NODES_DC1_OVERRIDE`, `RMQ_NODES_DC2_OVERRIDE`: optional RabbitMQ node lists.
- `RBQ_NODE_PREFIX`: `rabbit` or `rabbitmq`.
- `RBQ_NAME_MODE`: `short` or `fqdn`.
- `UI_DOMAIN`: append domain for UI probes.
- `UI_PRECHECK_JITTER_MIN`, `UI_PRECHECK_JITTER_MAX`: jitter window.
- `UI_PROBE_TIMEOUT`: timeout per UI probe.
- `UI_PROBE_RETRIES`: retries per UI probe.
- `MORPHEUS_URL`: local URL for POST checks (default `https://127.0.0.1`).
- `UI_WAIT_TIMEOUT`: POST readiness timeout (default 900s).
- `UI_WAIT_INTERVAL`: POST poll interval (default 30s).
- `ES_URL`: Elasticsearch health URL.
- `ES_OK_STATUSES`: default `green,yellow`.
- `LOG_DIR`, `LOG_FILE`: default log paths.
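Overrides are passed as environment variables on the command line, for example (the domain and host list are placeholders):

```bash
sudo UI_DOMAIN=example.com APP_NODES_DC1="ukdc1-mph-01 ukdc1-mph-02 ukdc1-mph-03" \
  /usr/local/sbin/morph-app-pre.sh
```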
Set TESTMODE=1 to perform a dry run.
State-changing actions log what they would do but are skipped.
Health checks still run and log their results.
Examples:
sudo TESTMODE=1 /usr/local/sbin/morph-app-pre.sh
sudo TESTMODE=1 /usr/local/sbin/morph-app-post.sh
The log shows a TESTMODE banner and summary line like:
[SUMMARY] Script exiting with rc=0 TESTMODE=1
How to Run
Active site, normal run (as root):
sudo /usr/local/sbin/morph-app-pre.sh
sudo /usr/local/sbin/morph-app-post.sh
Tail the log:
tail -f /data/morpheus_patching/logs/morpheus_app_patch.log
- UI probes fail with code 302 + marker absent:
  Add `UI_DOMAIN` so the probe uses an FQDN; confirm `-L` is present to follow redirects.
- RabbitMQ CLI not found:
  Set `RABBITMQCTL_PATH` (e.g. `/opt/morpheus/embedded/bin/rabbitmqctl`) and source the RabbitMQ profile if required.
- Lock never releases:
  Run POST on the same node that took the lock. The script only releases it if the `owner` file matches.
- POST ends immediately:
  Confirm `UI_WAIT_TIMEOUT` / `UI_WAIT_INTERVAL`. Logs should show waiting lines.
- No log file updates:
  Check write permission to `/data/morpheus_patching/logs`. Ensure `tee -a` in `log()` works.
Q: Why both UI backend check and NFS lock on active site?
A: The UI check prevents bringing down unhealthy nodes, the lock ensures only one node patches at once.
Q: How are passive nodes serialized?
A: Scripts enforce UI stopped and validate cluster, but no RW NFS = no lock. BigFix should schedule passive nodes one at a time.
Q: Why accept Elasticsearch yellow?
A: Expected during rolling patch (all primaries allocated, some replicas missing). Green only would block unnecessarily.
Q: What if RabbitMQ nodes are rabbitmq@host?
A: Set RBQ_NODE_PREFIX=rabbitmq. Use RBQ_NAME_MODE=fqdn if cluster reports FQDNs.
Q: Can we dry run the flow?
A: Yes, set TESTMODE=1. All health checks run, state changes log as “would do”.
Q: How do we know UI is really back?
A: POST waits for systemd running, TCP open, and HTTP 200/302 + marker. Then probes all backends for full readiness.
This repository contains the pre and post patching scripts for a MySQL InnoDB Cluster.
They are designed to be run automatically by BigFix during rolling OS patching of the cluster nodes.
The scripts ensure that patching is performed safely, with only one node patched at a time.
Scripts are deployed under:
/usr/local/sbin/mysqldb-pre.sh
/usr/local/sbin/mysqldb-post.sh
They must be executable (chmod +x).
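A typical deployment step, assuming the source files carry the same names as the install targets:

```bash
sudo install -m 0755 mysqldb-pre.sh mysqldb-post.sh /usr/local/sbin/
```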
- PRE script runs before patching:
  - Confirms it is safe to take down the node.
  - Acquires a lock on the active DC to serialize patching.
  - If the node is the cluster PRIMARY, it promotes a secondary first.
  - Applies a small random jitter to reduce race conditions.
  - Stops the local MySQL service if all checks pass.
  - Exit code `0` = safe to continue, otherwise non-zero = stop patching.
- OS patching and reboot are applied by BigFix.
- POST script runs after reboot:
  - Starts MySQL if it is not already running.
  - Waits for local SQL connectivity.
  - Waits for this node to rejoin the cluster as `ONLINE`.
  - Waits for all cluster members to be `ONLINE`.
  - Ensures recovery queues are empty before proceeding.
  - Releases the NFS lock on the active DC.
  - Exit code `0` = node healthy, otherwise non-zero = stop patching.
BigFix moves to the next node in the cluster only if both PRE and POST exit successfully.
The scripts include several safeguards to protect cluster health:
- Cluster health checks
  - Confirm all 3 members are present.
  - Require all members to be `ONLINE`.
- Primary migration
  - If the current node is `PRIMARY`, the PRE script promotes a secondary before shutdown.
- NFS lock (active site only)
  - A lock directory is created on a shared NFS mount:
    `${NFS_PATH:-/var/opt/morpheus/morpheus-ui}/${LOCK_DIR_NAME:-.db_patch_active_lock}`
  - Ensures only one active node is patched at a time.
  - The lock includes an `owner` file with hostname and timestamp (see the sketch after this list).
  - The POST script releases the lock when finished.
  - Passive site nodes (NFS RO) skip lock logic.
- Random jitter
  - The PRE script adds a random 2–10s delay before stopping MySQL.
  - Reduces the chance of simultaneous stops if two PRE scripts start at once.
- Post-patch validation
  - POST waits up to 20 minutes for MySQL and cluster recovery.
  - Requires this node `ONLINE`, all members `ONLINE`, and empty replication queues.
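The lock itself is an atomic `mkdir` plus an `owner` file; a minimal sketch of the behaviour described above (the real scripts also handle TESTMODE and logging):

```bash
# Illustrative lock acquire (PRE) and release (POST)
LOCK="${NFS_PATH:-/var/opt/morpheus/morpheus-ui}/${LOCK_DIR_NAME:-.db_patch_active_lock}"
if mkdir "$LOCK" 2>/dev/null; then
  echo "host=$(hostname -s) time=$(date '+%F %T')" > "$LOCK/owner"   # record the holder
else
  echo "lock already held: $(cat "$LOCK/owner" 2>/dev/null || echo unknown)" >&2
  exit 95   # PRE exit code for failed lock acquisition
fi
# ... in POST, release only if this host owns the lock ...
grep -q "host=$(hostname -s)" "$LOCK/owner" 2>/dev/null && rm -rf "$LOCK"
```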
Environment variables (with defaults):
- `NFS_PATH`: NFS mount path used to detect role and hold the lock. Default: `/var/opt/morpheus/morpheus-ui`
- `LOCK_DIR_NAME`: Lock directory name. Default: `.db_patch_active_lock`
- `MYSQL_SVC`: MySQL systemd service name. Default: `mysql`
- `REQUIRED_COUNT`: Expected number of cluster members. Default: `3`
- `MYSQL_TIMEOUT`: Timeout in seconds for post-patch polling. Default: `1200` (20 minutes)
- `SLEEP`: Polling interval in seconds. Default: `5`
- `TESTMODE`: Set to `1` for dry run. Default: `0`
Set TESTMODE=1 to run the scripts in simulation mode.
- Health checks still run and output results.
- State-changing actions (lock acquire/release, primary migration, service stop/start) are logged as “would do” but skipped.
- Exit code reflects health checks.
Examples:
TESTMODE=1 NFS_PATH=/var/opt/morpheus/morpheus-ui ./mysqldb-pre.sh
TESTMODE=1 NFS_PATH=/var/opt/morpheus/morpheus-ui ./mysqldb-post.sh
Active site, normal run (as root):
$ ./mysqldb-pre.sh
$ ./mysqldb-post.sh
The scripts return specific non-zero exit codes to BigFix when checks fail.
This helps operators quickly identify the failure stage.
- `0` – Success, safe to proceed.
Pre script error codes
- `2` – Cluster status not OK.
- `3` – Cluster member count not equal to `REQUIRED_COUNT`.
- `4` – One or more members not `ONLINE`.
- `5` – No secondary found to promote when the current node is PRIMARY.
- `6` – Primary promotion failed (new primary did not match expected).
- `7` – MySQL already stopped, refusing to proceed.
- `20` – Cluster changed during the jitter window, aborting stop.
- `95` – Could not acquire NFS lock on the active site.
Post script error codes
- `9` – SQL did not become reachable after reboot within the timeout.
- `10` – This node did not reach `ONLINE`.
- `11` – Cluster not fully healthy (not all members `ONLINE`).
- `14` – Recovery queue not empty.
Use these codes in combination with logs to determine the root cause.
- PRE fails with lock present: another active node is already patching. Wait for that node's POST to release the lock.
- POST times out waiting for SQL: check mysqld logs for startup errors. Ensure `MYSQL_SVC` is correct.
- POST says node not ONLINE: run
  `mysqlsh --sql -e "SELECT MEMBER_HOST,MEMBER_STATE FROM performance_schema.replication_group_members;"`
  on another node to check sync status.
- Recovery queue not empty: the node may still be catching up. Wait and rerun POST.
- Passive nodes still try to stop MySQL: confirm `NFS_PATH` is correct and really RO on the passive site.
Q: Why both cluster checks and NFS lock? A: Cluster checks ensure the cluster is healthy before taking down a node, the lock ensures only one active node patches at a time.
Q: What happens on the passive site? A: The scripts detect RO or missing NFS, classify the node as passive, skip lock logic, and still enforce health checks.
Q: Can we dry run the flow? A: Yes, set TESTMODE=1. All health checks run, actions log as “would do”, MySQL and locks are untouched.
Q: How do we know the node is really back? A: POST waits for SQL connectivity, node ONLINE, full cluster ONLINE, and empty recovery queue before exiting 0.
#!/usr/bin/env bash
# POST script for Morpheus application nodes
# Purpose, after reboot on the active site ensure local UI is ready, verify whole tier health, then release NFS lock
# On the passive site enforce morpheus-ui is stopped and verify RabbitMQ and Elasticsearch
set -euo pipefail
# ---------- config ----------
MORPHEUS_SVC="morpheus-ui"
NFS_PATH="${NFS_PATH:-/var/opt/morpheus/morpheus-ui}"
LOCK_DIR_NAME=".patch_active_lock"
LOCK_DIR_PATH="${LOCK_DIR_PATH:-$NFS_PATH/$LOCK_DIR_NAME}"
# Logging
LOG_DIR="${LOG_DIR:-/data/morpheus_patching/logs}"
LOG_FILE="${LOG_FILE:-$LOG_DIR/morpheus_app_patch.log}"
mkdir -p "$LOG_DIR" 2>/dev/null || true
log(){
printf "%s %s\n" "$(date '+%F %T')" "$*" | tee -a "$LOG_FILE";
}
# Always log exit status for BigFix visibility
trap 'rc=$?; log "[SUMMARY] Script exiting with rc=$rc TESTMODE=${TESTMODE:-0}"; exit $rc' EXIT
printf "\n#-------------------------------#\n" | tee -a "$LOG_FILE"
log "Starting $(basename "$0")"
# Local UI readiness
MORPHEUS_URL="${MORPHEUS_URL:-https://127.0.0.1}"
UI_WAIT_TIMEOUT="${UI_WAIT_TIMEOUT:-900}" # 15 minutes
UI_WAIT_INTERVAL="${UI_WAIT_INTERVAL:-30}" # 30 seconds
# UI backend probes
UI_PROBE_TIMEOUT="${UI_PROBE_TIMEOUT:-5}" # seconds per host
UI_PROBE_RETRIES="${UI_PROBE_RETRIES:-2}" # retries per host
UI_DOMAIN="${UI_DOMAIN:-}" # if set and host has no dot, append this domain
# Elasticsearch
ES_URL="${ES_URL:-http://127.0.0.1:9200/_cluster/health}"
ES_OK_STATUSES="${ES_OK_STATUSES:-green,yellow}"
# Elasticsearch wait settings, used by wait_for_elastic_ready
ES_WAIT_TIMEOUT="${ES_WAIT_TIMEOUT:-180}" # total seconds to wait
ES_WAIT_INTERVAL="${ES_WAIT_INTERVAL:-5}" # seconds between attempts
# RabbitMQ
RABBITMQCTL_PATH="${RABBITMQCTL_PATH:-/opt/morpheus/embedded/bin/rabbitmqctl}"
RABBIT_PROFILE="${RABBIT_PROFILE:-/var/lib/rabbitmq/.profile}"
RBQ_NODE_PREFIX="${RBQ_NODE_PREFIX:-rabbit}" # set to rabbitmq if your cluster reports rabbitmq@host
RBQ_NAME_MODE="${RBQ_NAME_MODE:-short}" # short uses hostname short form, fqdn uses full
# App nodes per DC, set to your real hosts or FQDNs
APP_NODES_DC1="${APP_NODES_DC1:-ukdc1-mph-01.example.com ukdc1-mph-02.example.com ukdc1-mph-03.example.com}"
APP_NODES_DC2="${APP_NODES_DC2:-ukdc2-mph-01.example.com ukdc2-mph-02.example.com ukdc2-mph-03.example.com}"
# Optional overrides if RabbitMQ runs only on a subset
RMQ_NODES_DC1_OVERRIDE="${RMQ_NODES_DC1_OVERRIDE:-}"
RMQ_NODES_DC2_OVERRIDE="${RMQ_NODES_DC2_OVERRIDE:-}"
# RabbitMQ wait settings, minimal addition
RABBIT_WAIT_TIMEOUT="${RABBIT_WAIT_TIMEOUT:-180}" # seconds to wait for 3 DC nodes
RABBIT_WAIT_INTERVAL="${RABBIT_WAIT_INTERVAL:-5}" # seconds between checks
# Test mode
TESTMODE="${TESTMODE:-0}"
if [[ "$TESTMODE" == "1" ]]; then
echo "########################################################" | tee -a "$LOG_FILE"
echo "### RUNNING IN TESTMODE ###" | tee -a "$LOG_FILE"
echo "### No changes will be made by this script now ###" | tee -a "$LOG_FILE"
echo "########################################################" | tee -a "$LOG_FILE"
log "TESTMODE is ON"
fi
# ---------- helpers ----------
short_host(){ hostname -s 2>/dev/null || hostname; }
have(){ command -v "$1" >/dev/null 2>&1; }
detect_dc() {
local h; h="$(short_host)"
case "$h" in
ukdc1*|*dc1*|*DC1*) echo "dc1"; return 0 ;;
ukdc2*|*dc2*|*DC2*) echo "dc2"; return 0 ;;
esac
ip -o -4 addr show | grep -qE '10\.10\.' && { echo "dc1"; return 0; }
ip -o -4 addr show | grep -qE '10\.11\.' && { echo "dc2"; return 0; }
echo ""; return 1
}
detect_role() {
if ! mountpoint -q "$NFS_PATH"; then
echo "passive"; return 0;
fi
local tf="$NFS_PATH/.rw_probe_$$"
if : > "$tf" 2>/dev/null; then
rm -f "$tf" || true; echo "active";
else
echo "passive";
fi
}
make_rmq_nodes() {
local list="$1" out="" h
for h in $list; do
if [[ "$RBQ_NAME_MODE" = "fqdn" ]]; then
out+="${RBQ_NODE_PREFIX}@${h} "
else
out+="${RBQ_NODE_PREFIX}@${h%%.*} "
fi
done
echo "$out"
}
svc_running() {
if command -v morpheusctl >/dev/null 2>&1; then
morpheusctl status "$MORPHEUS_SVC" 2>/dev/null | grep -qi "running"
elif command -v morpheus-ctl >/dev/null 2>&1; then
morpheus-ctl status "$MORPHEUS_SVC" 2>/dev/null | grep -qi "run"
else
systemctl is-active --quiet "$MORPHEUS_SVC"
fi
}
svc_start() {
if [[ "$TESTMODE" == "1" ]]; then
log "TESTMODE, would start $MORPHEUS_SVC"; return 0;
fi
if command -v morpheusctl >/dev/null 2>&1; then
morpheusctl start "$MORPHEUS_SVC"
elif command -v morpheus-ctl >/dev/null 2>&1; then
morpheus-ctl start "$MORPHEUS_SVC"
else
systemctl start "$MORPHEUS_SVC"
fi
}
wait_for_service_running() {
local deadline=$(( $(date +%s) + UI_WAIT_TIMEOUT ))
while (( $(date +%s) < deadline )); do
if svc_running; then
return 0;
fi
log "Waiting for $MORPHEUS_SVC service, sleeping ${UI_WAIT_INTERVAL}s"
sleep "$UI_WAIT_INTERVAL"
done
return 1
}
wait_for_tcp() {
local host port deadline now
host="$(printf '%s' "$MORPHEUS_URL" | sed -E 's#^https?://##; s#/.*$##')"
port=443
[[ "$MORPHEUS_URL" =~ ^http:// ]] && port=80
deadline=$(( $(date +%s) + UI_WAIT_TIMEOUT ))
while :; do
if timeout 2 bash -lc ">/dev/tcp/$host/$port" 2>/dev/null; then
return 0;
fi
now=$(date +%s); (( now >= deadline )) && return 1
log "Waiting for TCP ${host}:${port}, sleeping ${UI_WAIT_INTERVAL}s"
sleep "$UI_WAIT_INTERVAL"
done
}
# New, poll HTTP until ready or timeout
wait_for_http_ready() {
if [[ "$TESTMODE" == "1" ]]; then
log "TESTMODE, would poll local UI at $MORPHEUS_URL for readiness"
return 0
fi
local deadline=$(( $(date +%s) + UI_WAIT_TIMEOUT ))
local code tmp body mark
while (( $(date +%s) < deadline )); do
tmp="/tmp/morph_ui.$$"
code="$(curl -k -s -L -o "$tmp" -w '%{http_code}' "$MORPHEUS_URL/")" || code="000"
body="$(tr -d '\r' <"$tmp" | head -c 2000 || true)"; rm -f "$tmp" || true
mark="absent"
if grep -qi "Morpheus" <<<"$body"; then mark="present"; fi
if { [[ "$code" == "200" || "$code" == "302" ]] && [[ "$mark" == "present" ]]; }; then
log "Local UI ready, http_code=${code} marker=${mark}"
return 0
fi
log "Local UI not ready yet, http_code=${code} marker=${mark}, sleeping ${UI_WAIT_INTERVAL}s"
sleep "$UI_WAIT_INTERVAL"
done
return 1
}
# Cluster UI probe with redirects and optional domain
probe_ui_host() {
local host="$1" fqdn url code tmp="/tmp/morph_ui.$$" body mark="absent"
fqdn="$host"
if [[ -n "$UI_DOMAIN" && "$host" != *.* ]]; then
fqdn="${host}.${UI_DOMAIN}"
fi
url="https://${fqdn}/"
code="$(curl -k -s -L -m "$UI_PROBE_TIMEOUT" -o "$tmp" -w '%{http_code}' "$url")" || {
rm -f "$tmp"
log "[UI] ${fqdn} http_code=none marker=absent result=FAIL, curl error"
return 1
}
body="$(tr -d '\r' <"$tmp" | head -c 4000 || true)"; rm -f "$tmp"
if grep -qi "Morpheus" <<<"$body"; then mark="present"; fi
if [[ "$code" == "200" || "$code" == "302" ]] && [[ "$mark" == "present" ]]; then
log "[UI] ${fqdn} http_code=${code} marker=${mark} result=OK"
return 0
fi
log "[UI] ${fqdn} http_code=${code} marker=${mark} result=FAIL"
log "[UI] ${fqdn} body(head)=$(printf '%s' "$body" | tr -d '\n' | head -c 120)"
return 1
}
check_all_ui_backends_up() {
local peers="$1" ok=0 total=0 bad_list="" h short i
for h in $peers; do
short="${h%%.*}"; total=$((total+1))
log "[UI] probing ${short}"
for (( i=0; i<=UI_PROBE_RETRIES; i++ )); do
if probe_ui_host "$short"; then ok=$((ok+1)); break; fi
sleep 1
done
if (( i > UI_PROBE_RETRIES )); then bad_list+="$short "; fi
done
log "[Active post] UI backends summary, ok=${ok} total=${total} bad=${bad_list:-none}"
(( ok == total ))
}
rmq_cluster_json() {
[[ -x "$RABBITMQCTL_PATH" ]] || { log "rabbitmqctl not executable at $RABBITMQCTL_PATH"; return 1; }
bash -lc "[[ -f '$RABBIT_PROFILE' ]] && source '$RABBIT_PROFILE' || true; '$RABBITMQCTL_PATH' -q cluster_status --formatter json"
}
check_rabbitmq_dc() {
local expect_nodes="$1" json running partitions missing=""
json="$(rmq_cluster_json)" || { log "rabbitmq cluster_status failed"; return 1; }
if have jq; then
running="$(printf '%s' "$json" | jq -r '.running_nodes[]? // empty' | xargs echo)"
partitions="$(printf '%s' "$json" | jq -r '.partitions | length')"
else
running="$(printf '%s' "$json" | sed -n 's/.*"running_nodes":[[]\([^]]]*\)[]].*/\1/p' | tr -d '"' | tr ',' ' ')"
partitions="$(printf '%s' "$json" | sed -n 's/.*"partitions":[[]\([^]]]*\)[]].*/\1/p' | wc -w | tr -d ' ')"
fi
log "RabbitMQ running nodes, $running"
[[ "$partitions" =~ ^[0-9]+$ ]] || partitions=0
if [[ "$partitions" -gt 0 ]]; then
log "RabbitMQ partitions detected, $partitions"; return 1;
fi
for n in $expect_nodes; do
grep -qw -- "$n" <<<"$running" || missing+="$n ";
done
[[ -z "$missing" ]] || { log "Missing expected RMQ nodes, $missing"; return 1; }
return 0
}
# Minimal addition, poll until at least 3 expected DC nodes are running and no partitions
wait_for_rabbit_three_in_dc() {
local expect_nodes="$1" deadline now json running partitions present=0 missing=""
deadline=$(( $(date +%s) + RABBIT_WAIT_TIMEOUT ))
while :; do
json="$(rmq_cluster_json)" || { log "rabbitmq cluster_status failed, will retry"; json=""; }
if [[ -n "$json" ]]; then
if have jq; then
running="$(printf '%s' "$json" | jq -r '.running_nodes[]? // empty' | xargs echo)"
partitions="$(printf '%s' "$json" | jq -r '.partitions | length')"
else
running="$(printf '%s' "$json" | sed -n 's/.*"running_nodes":[[]\([^]]]*\)[]].*/\1/p' | tr -d '"' | tr ',' ' ')"
partitions="$(printf '%s' "$json" | sed -n 's/.*"partitions":[[]\([^]]]*\)[]].*/\1/p' | wc -w | tr -d ' ')"
fi
[[ "$partitions" =~ ^[0-9]+$ ]] || partitions=0
present=0
missing=""
for n in $expect_nodes; do
if grep -qw -- "$n" <<<"$running"; then
present=$((present+1))
else
missing+="$n "
fi
done
log "RMQ wait, present=$present needed=3 partitions=$partitions missing=${missing:-none}"
if (( present >= 3 )) && [[ "$partitions" -eq 0 ]]; then
log "RMQ cluster has at least 3 expected DC nodes and no partitions"
return 0
fi
fi
now=$(date +%s)
if (( now >= deadline )); then
log "RMQ cluster did not reach 3 DC nodes within ${RABBIT_WAIT_TIMEOUT}s, present=$present missing=${missing:-none} partitions=${partitions:-unknown}"
return 1
fi
sleep "$RABBIT_WAIT_INTERVAL"
done
}
check_elastic() {
local status
status="$(curl -fsS "$ES_URL" 2>/dev/null | sed -n 's/.*\"status\":\"\([^\"]*\)\".*/\1/p')" || true
[[ -n "$status" ]] || { log "Elasticsearch query failed at $ES_URL"; return 1; }
log "Elasticsearch status, $status"
[[ ",$ES_OK_STATUSES," == *",$status,"* ]]
}
wait_for_elastic_ready() {
if [[ "$TESTMODE" == "1" ]]; then
log "TESTMODE, would wait for Elasticsearch readiness"
return 0
fi
local deadline=$(( $(date +%s) + ES_WAIT_TIMEOUT ))
while (( $(date +%s) < deadline )); do
if check_elastic; then
return 0
fi
log "Elasticsearch not ready yet, retrying in ${ES_WAIT_INTERVAL}s"
sleep "$ES_WAIT_INTERVAL"
done
log "Elasticsearch did not reach an OK status within ${ES_WAIT_TIMEOUT}s"
return 1
}
lock_owned_by_me(){
[[ -f "$LOCK_DIR_PATH/owner" ]] && grep -q "host=$(short_host)" "$LOCK_DIR_PATH/owner";
}
lock_release() {
if [[ "$TESTMODE" == "1" ]]; then
log "TESTMODE, would release active patch lock at $LOCK_DIR_PATH"
return 0
fi
if lock_owned_by_me; then
rm -rf "$LOCK_DIR_PATH" || true
log "Released active patch lock"
else
log "Not releasing lock, not owned by this host"
fi
}
# ---------- main ----------
DC="$(detect_dc || true)" || { log "Cannot detect DC"; exit 20; }
case "$DC" in
dc1)
EXPECT_RMQ_NODES="${RMQ_NODES_DC1_OVERRIDE:-$(make_rmq_nodes "$APP_NODES_DC1")}"
APP_PEERS="$APP_NODES_DC1"
;;
dc2)
EXPECT_RMQ_NODES="${RMQ_NODES_DC2_OVERRIDE:-$(make_rmq_nodes "$APP_NODES_DC2")}"
APP_PEERS="$APP_NODES_DC2"
;;
*)
log "Unknown DC"; exit 21 ;;
esac
log "Detected DC, $DC"
log "Expected RMQ nodes, $EXPECT_RMQ_NODES"
log "APP_PEERS, $APP_PEERS"
ROLE="$(detect_role)"
log "Detected site role, $ROLE"
if [[ "$ROLE" == "active" ]]; then
if ! svc_running; then
log "Starting Morpheus UI on active node"
svc_start
else
log "Morpheus UI already running"
fi
if [[ "$TESTMODE" != "1" ]]; then
log "Waiting for $MORPHEUS_SVC to report running, timeout ${UI_WAIT_TIMEOUT}s"
if ! wait_for_service_running; then
log "Morpheus service did not report running within ${UI_WAIT_TIMEOUT}s"
exit 30
fi
log "Waiting for local UI TCP on $MORPHEUS_URL, timeout ${UI_WAIT_TIMEOUT}s"
if ! wait_for_tcp; then
log "Local UI TCP not reachable within ${UI_WAIT_TIMEOUT}s"
exit 31
fi
log "Waiting for local UI HTTP readiness on $MORPHEUS_URL, timeout ${UI_WAIT_TIMEOUT}s"
if ! wait_for_http_ready; then
log "Local UI did not reach HTTP readiness within ${UI_WAIT_TIMEOUT}s"
exit 32
fi
else
log "TESTMODE, would wait for service, TCP and HTTP readiness"
fi
log "[Active post] Checking that all UI backends are healthy"
check_all_ui_backends_up "$APP_PEERS" || { log "Not all UI backends are healthy after reboot"; exit 33; }
log "[Active post] RabbitMQ, waiting for 3 DC nodes"
wait_for_rabbit_three_in_dc "$EXPECT_RMQ_NODES" || exit 34
log "[Active post] RabbitMQ final check"
check_rabbitmq_dc "$EXPECT_RMQ_NODES" || exit 34
log "[Active post] Elasticsearch, waiting for readiness"
wait_for_elastic_ready || exit 35
lock_release
log "Active POST checks passed, node healthy and lock released"
exit 0
else
if svc_running; then
log "Stopping Morpheus UI on passive node"
if [[ "$TESTMODE" == "1" ]]; then
log "TESTMODE, would stop $MORPHEUS_SVC"
else
if command -v morpheusctl >/dev/null 2>&1; then
morpheusctl stop "$MORPHEUS_SVC"
else
systemctl stop "$MORPHEUS_SVC"
fi
sleep 2
fi
fi
if [[ "$TESTMODE" != "1" ]] && svc_running; then
log "Morpheus UI still running on passive after stop"
exit 40
fi
log "[Passive post] RabbitMQ, waiting for 3 DC nodes"
wait_for_rabbit_three_in_dc "$EXPECT_RMQ_NODES" || exit 41
log "[Passive post] RabbitMQ final check"
check_rabbitmq_dc "$EXPECT_RMQ_NODES" || exit 41
log "[Passive post] Elasticsearch, waiting for readiness"
wait_for_elastic_ready || exit 42
log "Passive POST checks passed, node healthy"
exit 0
fi
#!/usr/bin/env bash
# POST script for Morpheus application nodes
# Purpose, after reboot on the active site ensure local UI is ready, verify whole tier health, then release NFS lock
# On the passive site enforce morpheus-ui is stopped and verify RabbitMQ and Elasticsearch
set -euo pipefail
# ---------- config ----------
MORPHEUS_SVC="morpheus-ui"
NFS_PATH="${NFS_PATH:-/var/opt/morpheus/morpheus-ui}"
LOCK_DIR_NAME=".patch_active_lock"
LOCK_DIR_PATH="${LOCK_DIR_PATH:-$NFS_PATH/$LOCK_DIR_NAME}"
# Logging
LOG_DIR="${LOG_DIR:-/data/morpheus_patching/logs}"
LOG_FILE="${LOG_FILE:-$LOG_DIR/morpheus_app_patch.log}"
mkdir -p "$LOG_DIR" 2>/dev/null || true
log(){
printf "%s %s\n" "$(date '+%F %T')" "$*" | tee -a "$LOG_FILE";
}
# Always log exit status for BigFix visibility
trap 'rc=$?; log "[SUMMARY] Script exiting with rc=$rc TESTMODE=${TESTMODE:-0}"; exit $rc' EXIT
printf "\n#-------------------------------#\n" | tee -a "$LOG_FILE"
log "Starting $(basename "$0")"
# Local UI readiness
MORPHEUS_URL="${MORPHEUS_URL:-https://127.0.0.1}"
UI_WAIT_TIMEOUT="${UI_WAIT_TIMEOUT:-900}" # 15 minutes
UI_WAIT_INTERVAL="${UI_WAIT_INTERVAL:-30}" # 30 seconds
# UI backend probes
UI_PROBE_TIMEOUT="${UI_PROBE_TIMEOUT:-5}" # seconds per host
UI_PROBE_RETRIES="${UI_PROBE_RETRIES:-2}" # retries per host
UI_DOMAIN="${UI_DOMAIN:-}" # if set and host has no dot, append this domain
# Elasticsearch
ES_URL="${ES_URL:-http://127.0.0.1:9200/_cluster/health}"
ES_OK_STATUSES="${ES_OK_STATUSES:-green,yellow}"
# Elasticsearch wait settings
ES_WAIT_TIMEOUT="${ES_WAIT_TIMEOUT:-180}" # total seconds to wait
ES_WAIT_INTERVAL="${ES_WAIT_INTERVAL:-5}" # seconds between attempts
# RabbitMQ
RABBITMQCTL_PATH="${RABBITMQCTL_PATH:-/opt/morpheus/embedded/bin/rabbitmqctl}"
RABBIT_PROFILE="${RABBIT_PROFILE:-/var/lib/rabbitmq/.profile}"
RBQ_NODE_PREFIX="${RBQ_NODE_PREFIX:-rabbit}" # set to rabbitmq if your cluster reports rabbitmq@host
RBQ_NAME_MODE="${RBQ_NAME_MODE:-short}" # short uses hostname short form, fqdn uses full
# RabbitMQ wait settings
RABBIT_WAIT_TIMEOUT="${RABBIT_WAIT_TIMEOUT:-180}" # seconds to wait for 3 DC nodes
RABBIT_WAIT_INTERVAL="${RABBIT_WAIT_INTERVAL:-5}" # seconds between checks
# App nodes per DC, set to your real hosts or FQDNs
APP_NODES_DC1="${APP_NODES_DC1:-ukdc1-mph-01.example.com ukdc1-mph-02.example.com ukdc1-mph-03.example.com}"
APP_NODES_DC2="${APP_NODES_DC2:-ukdc2-mph-01.example.com ukdc2-mph-02.example.com ukdc2-mph-03.example.com}"
# Optional overrides if RabbitMQ runs only on a subset
RMQ_NODES_DC1_OVERRIDE="${RMQ_NODES_DC1_OVERRIDE:-}"
RMQ_NODES_DC2_OVERRIDE="${RMQ_NODES_DC2_OVERRIDE:-}"
# Service stop waits
SVC_STOP_TIMEOUT="${SVC_STOP_TIMEOUT:-60}"
SVC_STOP_INTERVAL="${SVC_STOP_INTERVAL:-3}"
# Test mode
TESTMODE="${TESTMODE:-0}"
if [[ "$TESTMODE" == "1" ]]; then
echo "########################################################" | tee -a "$LOG_FILE"
echo "### RUNNING IN TESTMODE ###" | tee -a "$LOG_FILE"
echo "### No changes will be made by this script now ###" | tee -a "$LOG_FILE"
echo "########################################################" | tee -a "$LOG_FILE"
log "TESTMODE is ON"
fi
# ---------- helpers ----------
short_host(){ hostname -s 2>/dev/null || hostname; }
have(){ command -v "$1" >/dev/null 2>&1; }
detect_dc() {
local h; h="$(short_host)"
case "$h" in
ukdc1*|*dc1*|*DC1*) echo "dc1"; return 0 ;;
ukdc2*|*dc2*|*DC2*) echo "dc2"; return 0 ;;
esac
ip -o -4 addr show | grep -qE '10\.10\.' && { echo "dc1"; return 0; }
ip -o -4 addr show | grep -qE '10\.11\.' && { echo "dc2"; return 0; }
echo ""; return 1
}
detect_role() {
if ! mountpoint -q "$NFS_PATH"; then
echo "passive"; return 0;
fi
local tf="$NFS_PATH/.rw_probe_$$"
if : > "$tf" 2>/dev/null; then
rm -f "$tf" || true; echo "active";
else
echo "passive";
fi
}
make_rmq_nodes() {
local list="$1" out="" h
for h in $list; do
if [[ "$RBQ_NAME_MODE" = "fqdn" ]]; then
out+="${RBQ_NODE_PREFIX}@${h} "
else
out+="${RBQ_NODE_PREFIX}@${h%%.*} "
fi
done
echo "$out"
}
# Get a short, human friendly service status line for logging
svc_status_head() {
if command -v morpheus-ctl >/dev/null 2>&1; then
morpheus-ctl status "$MORPHEUS_SVC" 2>&1 | head -n1
elif command -v systemctl >/dev/null 2>&1; then
systemctl status "$MORPHEUS_SVC" --no-pager 2>&1 | sed -n '1,1p'
else
echo "status tool not found"
fi
}
# Return 0 if service is running, 1 otherwise, quiet
svc_running() {
if command -v morpheus-ctl >/dev/null 2>&1; then
# Morpheus runit style output often includes either running or run:
morpheus-ctl status "$MORPHEUS_SVC" 2>/dev/null | grep -qiE '\brunning\b|^run:'
elif command -v systemctl >/dev/null 2>&1; then
systemctl is-active --quiet "$MORPHEUS_SVC"
else
return 1
fi
}
svc_start() {
if [[ "$TESTMODE" == "1" ]]; then
log "TESTMODE, would start $MORPHEUS_SVC"
return 0
fi
if command -v morpheus-ctl >/dev/null 2>&1; then
log "Starting service with morpheus-ctl"
morpheus-ctl start "$MORPHEUS_SVC"
elif command -v systemctl >/dev/null 2>&1; then
log "Starting service with systemctl"
systemctl start "$MORPHEUS_SVC"
else
log "No service manager found to start $MORPHEUS_SVC"
return 1
fi
}
svc_stop() {
if [[ "$TESTMODE" == "1" ]]; then
log "TESTMODE, would stop $MORPHEUS_SVC"
return 0
fi
if command -v morpheus-ctl >/dev/null 2>&1; then
log "Stopping service with morpheus-ctl"
morpheus-ctl stop "$MORPHEUS_SVC"
elif command -v systemctl >/dev/null 2>&1; then
log "Stopping service with systemctl"
systemctl stop "$MORPHEUS_SVC"
else
log "No service manager found to stop $MORPHEUS_SVC"
return 1
fi
}
wait_for_service_running() {
local deadline=$(( $(date +%s) + UI_WAIT_TIMEOUT ))
while (( $(date +%s) < deadline )); do
if svc_running; then
return 0
fi
log "Waiting for $MORPHEUS_SVC service, sleeping ${UI_WAIT_INTERVAL}s"
sleep "$UI_WAIT_INTERVAL"
done
return 1
}
wait_for_service_stopped() {
local deadline=$(( $(date +%s) + SVC_STOP_TIMEOUT ))
while (( $(date +%s) < deadline )); do
if ! svc_running; then
return 0
fi
sleep "$SVC_STOP_INTERVAL"
done
return 1
}
wait_for_tcp() {
local host port deadline now
host="$(printf '%s' "$MORPHEUS_URL" | sed -E 's#^https?://##; s#/.*$##')"
port=443
[[ "$MORPHEUS_URL" =~ ^http:// ]] && port=80
deadline=$(( $(date +%s) + UI_WAIT_TIMEOUT ))
while :; do
if timeout 2 bash -lc ">/dev/tcp/$host/$port" 2>/dev/null; then
return 0
fi
now=$(date +%s); (( now >= deadline )) && return 1
log "Waiting for TCP ${host}:${port}, sleeping ${UI_WAIT_INTERVAL}s"
sleep "$UI_WAIT_INTERVAL"
done
}
# Poll HTTP until ready or timeout
wait_for_http_ready() {
if [[ "$TESTMODE" == "1" ]]; then
log "TESTMODE, would poll local UI at $MORPHEUS_URL for readiness"
return 0
fi
local deadline=$(( $(date +%s) + UI_WAIT_TIMEOUT ))
local code tmp body mark
while (( $(date +%s) < deadline )); do
tmp="/tmp/morph_ui.$$"
code="$(curl -k -s -L -o "$tmp" -w '%{http_code}' "$MORPHEUS_URL/")" || code="000"
body="$(tr -d '\r' <"$tmp" | head -c 2000 || true)"; rm -f "$tmp" || true
mark="absent"
if grep -qi "Morpheus" <<<"$body"; then mark="present"; fi
if { [[ "$code" == "200" || "$code" == "302" ]] && [[ "$mark" == "present" ]]; }; then
log "Local UI ready, http_code=${code} marker=${mark}"
return 0
fi
log "Local UI not ready yet, http_code=${code} marker=${mark}, sleeping ${UI_WAIT_INTERVAL}s"
sleep "$UI_WAIT_INTERVAL"
done
return 1
}
# Cluster UI probe with redirects and optional domain
probe_ui_host() {
local host="$1" fqdn url code tmp="/tmp/morph_ui.$$" body mark="absent"
fqdn="$host"
if [[ -n "$UI_DOMAIN" && "$host" != *.* ]]; then
fqdn="${host}.${UI_DOMAIN}"
fi
url="https://${fqdn}/"
code="$(curl -k -s -L -m "$UI_PROBE_TIMEOUT" -o "$tmp" -w '%{http_code}' "$url")" || {
rm -f "$tmp"
log "[UI] ${fqdn} http_code=none marker=absent result=FAIL, curl error"
return 1
}
body="$(tr -d '\r' <"$tmp" | head -c 4000 || true)"; rm -f "$tmp"
if grep -qi "Morpheus" <<<"$body"; then mark="present"; fi
if [[ "$code" == "200" || "$code" == "302" ]] && [[ "$mark" == "present" ]]; then
log "[UI] ${fqdn} http_code=${code} marker=${mark} result=OK"
return 0
fi
log "[UI] ${fqdn} http_code=${code} marker=${mark} result=FAIL"
log "[UI] ${fqdn} body(head)=$(printf '%s' "$body" | tr -d '\n' | head -c 120)"
return 1
}
check_all_ui_backends_up() {
local peers="$1" ok=0 total=0 bad_list="" h short i
for h in $peers; do
short="${h%%.*}"; total=$((total+1))
log "[UI] probing ${short}"
for (( i=0; i<=UI_PROBE_RETRIES; i++ )); do
if probe_ui_host "$short"; then ok=$((ok+1)); break; fi
sleep 1
done
if (( i > UI_PROBE_RETRIES )); then bad_list+="$short "; fi
done
log "[Active post] UI backends summary, ok=${ok} total=${total} bad=${bad_list:-none}"
(( ok == total ))
}
rmq_cluster_json() {
[[ -x "$RABBITMQCTL_PATH" ]] || { log "rabbitmqctl not executable at $RABBITMQCTL_PATH"; return 1; }
bash -lc "[[ -f '$RABBIT_PROFILE' ]] && source '$RABBIT_PROFILE' || true; \
'$RABBITMQCTL_PATH' -q cluster_status --formatter json 2>/dev/null"
}
check_rabbitmq_dc() {
local expect_nodes="$1" json running partitions missing=""
json="$(rmq_cluster_json)" || { log "rabbitmq cluster_status failed"; return 1; }
if have jq; then
running="$(printf '%s' "$json" | jq -r '.running_nodes[]? // empty' 2>/dev/null | xargs echo)"
partitions="$(printf '%s' "$json" | jq -r '.partitions | length' 2>/dev/null)"
else
running="$(printf '%s' "$json" | sed -n 's/.*"running_nodes":[[]\([^]]]*\)[]].*/\1/p' | tr -d '"' | tr ',' ' ')"
partitions="$(printf '%s' "$json" | sed -n 's/.*"partitions":[[]\([^]]]*\)[]].*/\1/p' | wc -w | tr -d ' ')"
fi
log "RabbitMQ running nodes, $running"
[[ "$partitions" =~ ^[0-9]+$ ]] || partitions=0
if [[ "$partitions" -gt 0 ]]; then
log "RabbitMQ partitions detected, $partitions"; return 1;
fi
for n in $expect_nodes; do
grep -qw -- "$n" <<<"$running" || missing+="$n ";
done
[[ -z "$missing" ]] || { log "Missing expected RMQ nodes, $missing"; return 1; }
return 0
}
# Minimal addition, poll until at least 3 expected DC nodes are running and no partitions
wait_for_rabbit_three_in_dc() {
local expect_nodes="$1" deadline now json running partitions present=0 missing=""
deadline=$(( $(date +%s) + RABBIT_WAIT_TIMEOUT ))
while :; do
json="$(rmq_cluster_json)" || { log "rabbitmq cluster_status failed, will retry"; json=""; }
if [[ -n "$json" ]]; then
if have jq; then
running="$(printf '%s' "$json" | jq -r '.running_nodes[]? // empty' 2>/dev/null | xargs echo)"
partitions="$(printf '%s' "$json" | jq -r '.partitions | length' 2>/dev/null)"
else
running="$(printf '%s' "$json" | sed -n 's/.*"running_nodes":[[]\([^]]]*\)[]].*/\1/p' | tr -d '"' | tr ',' ' ')"
partitions="$(printf '%s' "$json" | sed -n 's/.*"partitions":[[]\([^]]]*\)[]].*/\1/p' | wc -w | tr -d ' ')"
fi
[[ "$partitions" =~ ^[0-9]+$ ]] || partitions=0
present=0
missing=""
for n in $expect_nodes; do
if grep -qw -- "$n" <<<"$running"; then
present=$((present+1))
else
missing+="$n "
fi
done
log "RMQ wait, present=$present needed=3 partitions=$partitions missing=${missing:-none}"
if (( present >= 3 )) && [[ "$partitions" -eq 0 ]]; then
log "RMQ cluster has at least 3 expected DC nodes and no partitions"
return 0
fi
fi
now=$(date +%s)
if (( now >= deadline )); then
log "RMQ cluster did not reach 3 DC nodes within ${RABBIT_WAIT_TIMEOUT}s, present=$present missing=${missing:-none} partitions=${partitions:-unknown}"
return 1
fi
sleep "$RABBIT_WAIT_INTERVAL"
done
}
check_elastic() {
local status
status="$(curl -fsS "$ES_URL" 2>/dev/null | sed -n 's/.*\"status\":\"\([^\"]*\)\".*/\1/p')" || true
[[ -n "$status" ]] || { log "Elasticsearch query failed at $ES_URL"; return 1; }
log "Elasticsearch status, $status"
[[ ",$ES_OK_STATUSES," == *",$status,"* ]]
}
wait_for_elastic_ready() {
if [[ "$TESTMODE" == "1" ]]; then
log "TESTMODE, would wait for Elasticsearch readiness"
return 0
fi
local deadline=$(( $(date +%s) + ES_WAIT_TIMEOUT ))
while (( $(date +%s) < deadline )); do
if check_elastic; then
return 0
fi
log "Elasticsearch not ready yet, retrying in ${ES_WAIT_INTERVAL}s"
sleep "$ES_WAIT_INTERVAL"
done
log "Elasticsearch did not reach an OK status within ${ES_WAIT_TIMEOUT}s"
return 1
}
lock_owned_by_me(){
[[ -f "$LOCK_DIR_PATH/owner" ]] && grep -q "host=$(short_host)" "$LOCK_DIR_PATH/owner";
}
lock_release() {
if [[ "$TESTMODE" == "1" ]]; then
log "TESTMODE, would release active patch lock at $LOCK_DIR_PATH"
return 0
fi
if lock_owned_by_me; then
rm -rf "$LOCK_DIR_PATH" || true
log "Released active patch lock"
else
log "Not releasing lock, not owned by this host"
fi
}
# ---------- main ----------
DC="$(detect_dc || true)" || { log "Cannot detect DC"; exit 20; }
case "$DC" in
dc1)
EXPECT_RMQ_NODES="${RMQ_NODES_DC1_OVERRIDE:-$(make_rmq_nodes "$APP_NODES_DC1")}"
APP_PEERS="$APP_NODES_DC1"
;;
dc2)
EXPECT_RMQ_NODES="${RMQ_NODES_DC2_OVERRIDE:-$(make_rmq_nodes "$APP_NODES_DC2")}"
APP_PEERS="$APP_NODES_DC2"
;;
*)
log "Unknown DC"; exit 21 ;;
esac
log "Detected DC, $DC"
log "Expected RMQ nodes, $EXPECT_RMQ_NODES"
log "APP_PEERS, $APP_PEERS"
ROLE="$(detect_role)"
log "Detected site role, $ROLE"
log "Initial $MORPHEUS_SVC status, $(svc_status_head)"
if [[ "$ROLE" == "active" ]]; then
if ! svc_running; then
log "Starting Morpheus UI on active node"
svc_start
else
log "Morpheus UI already running"
fi
if [[ "$TESTMODE" != "1" ]]; then
log "Waiting for $MORPHEUS_SVC to report running, timeout ${UI_WAIT_TIMEOUT}s"
if ! wait_for_service_running; then
log "Morpheus service did not report running within ${UI_WAIT_TIMEOUT}s"
exit 30
fi
log "Waiting for local UI TCP on $MORPHEUS_URL, timeout ${UI_WAIT_TIMEOUT}s"
if ! wait_for_tcp; then
log "Local UI TCP not reachable within ${UI_WAIT_TIMEOUT}s"
exit 31
fi
log "Waiting for local UI HTTP readiness on $MORPHEUS_URL, timeout ${UI_WAIT_TIMEOUT}s"
if ! wait_for_http_ready; then
log "Local UI did not reach HTTP readiness within ${UI_WAIT_TIMEOUT}s"
exit 32
fi
else
log "TESTMODE, would wait for service, TCP and HTTP readiness"
fi
log "[Active post] Checking that all UI backends are healthy"
check_all_ui_backends_up "$APP_PEERS" || { log "Not all UI backends are healthy after reboot"; exit 33; }
log "[Active post] RabbitMQ, waiting for 3 DC nodes"
wait_for_rabbit_three_in_dc "$EXPECT_RMQ_NODES" || exit 34
log "[Active post] RabbitMQ final check"
check_rabbitmq_dc "$EXPECT_RMQ_NODES" || exit 34
log "[Active post] Elasticsearch, waiting for readiness"
wait_for_elastic_ready || exit 35
lock_release
log "Active POST checks passed, node healthy and lock released"
exit 0
else
if svc_running; then
log "Stopping Morpheus UI on passive node"
svc_stop
if ! wait_for_service_stopped; then
log "Morpheus UI did not stop within ${SVC_STOP_TIMEOUT}s on passive node"
log "Current $MORPHEUS_SVC status, $(svc_status_head)"
exit 40
fi
log "Confirmed Morpheus UI is stopped on passive node"
else
log "Morpheus UI already stopped on passive node"
fi
log "Passive $MORPHEUS_SVC status, $(svc_status_head)"
log "[Passive post] RabbitMQ, waiting for 3 DC nodes"
wait_for_rabbit_three_in_dc "$EXPECT_RMQ_NODES" || exit 41
log "[Passive post] RabbitMQ final check"
check_rabbitmq_dc "$EXPECT_RMQ_NODES" || exit 41
log "[Passive post] Elasticsearch, waiting for readiness"
wait_for_elastic_ready || exit 42
log "Passive POST checks passed, node healthy"
exit 0
fi
#!/usr/bin/env bash
# PRE script for Morpheus application nodes
# Purpose, prove it is safe to patch this node, then stop morpheus-ui when appropriate
# Active site, jitter, require all UI backends up, acquire NFS lock, peer check, RMQ and ES checks, stop morpheus-ui
# Passive site, enforce morpheus-ui stopped, RMQ and ES checks
set -euo pipefail
# ---------- config ----------
MORPHEUS_SVC="morpheus-ui"
NFS_PATH="${NFS_PATH:-/var/opt/morpheus/morpheus-ui}" # RW on active, RO or not mounted on passive
LOCK_DIR_NAME=".patch_active_lock"
LOCK_DIR_PATH="${LOCK_DIR_PATH:-$NFS_PATH/$LOCK_DIR_NAME}"
# Logging
LOG_DIR="${LOG_DIR:-/data/morpheus_patching/logs}"
LOG_FILE="${LOG_FILE:-$LOG_DIR/morpheus_app_patch.log}"
# UI health probes
UI_PRECHECK_JITTER_MIN="${UI_PRECHECK_JITTER_MIN:-2}" # seconds
UI_PRECHECK_JITTER_MAX="${UI_PRECHECK_JITTER_MAX:-10}" # seconds
UI_PROBE_TIMEOUT="${UI_PROBE_TIMEOUT:-5}" # seconds per host
UI_PROBE_RETRIES="${UI_PROBE_RETRIES:-2}" # retries per host
UI_DOMAIN="${UI_DOMAIN:-}" # if set, append to short host, example company.local
# Elasticsearch
ES_URL="${ES_URL:-http://127.0.0.1:9200/_cluster/health}"
ES_OK_STATUSES="${ES_OK_STATUSES:-green,yellow}"
# RabbitMQ
RABBITMQCTL_PATH="${RABBITMQCTL_PATH:-/opt/morpheus/embedded/bin/rabbitmqctl}"
RABBIT_PROFILE="${RABBIT_PROFILE:-/var/lib/rabbitmq/.profile}"
RBQ_NODE_PREFIX="${RBQ_NODE_PREFIX:-rabbit}" # set to rabbitmq if your cluster reports rabbitmq@host
RBQ_NAME_MODE="${RBQ_NAME_MODE:-short}" # short uses hostname -s, fqdn uses full name
# App nodes per DC, update to your real hosts or FQDNs
APP_NODES_DC1="${APP_NODES_DC1:-ukdc1-mph-01.example.com ukdc1-mph-02.example.com ukdc1-mph-03.example.com}"
APP_NODES_DC2="${APP_NODES_DC2:-ukdc2-mph-01.example.com ukdc2-mph-02.example.com ukdc2-mph-03.example.com}"
# Optional overrides if RabbitMQ runs only on a subset
RMQ_NODES_DC1_OVERRIDE="${RMQ_NODES_DC1_OVERRIDE:-}"
RMQ_NODES_DC2_OVERRIDE="${RMQ_NODES_DC2_OVERRIDE:-}"
# Optional override for site role, values, active, passive, auto
PATCH_ROLE="${PATCH_ROLE:-auto}"
# Test mode
TESTMODE="${TESTMODE:-0}"
# ---------- logging helpers ----------
mkdir -p "$LOG_DIR" 2>/dev/null || true
log(){ printf "%s %s\n" "$(date '+%F %T')" "$*" | tee -a "$LOG_FILE"; }
printf "\n#-------------------------------#\n" | tee -a "$LOG_FILE"
log "Starting $(basename "$0")"
if [[ "$TESTMODE" == "1" ]]; then
echo "########################################################" | tee -a "$LOG_FILE"
echo "### RUNNING IN TESTMODE ###" | tee -a "$LOG_FILE"
echo "### No changes will be made by this script now ###" | tee -a "$LOG_FILE"
echo "########################################################" | tee -a "$LOG_FILE"
log "TESTMODE is ON"
fi
# ---------- helpers ----------
short_host(){ hostname -s 2>/dev/null || hostname; }
have(){ command -v "$1" >/dev/null 2>&1; }
detect_dc() {
local h; h="$(short_host)"
case "$h" in
ukdc1*|*dc1*|*DC1*) echo "dc1"; return 0 ;;
ukdc2*|*dc2*|*DC2*) echo "dc2"; return 0 ;;
esac
ip -o -4 addr show | grep -qE '10\.10\.' && { echo "dc1"; return 0; }
ip -o -4 addr show | grep -qE '10\.11\.' && { echo "dc2"; return 0; }
echo ""; return 1
}
detect_role() {
# active if NFS is writable
if ! mountpoint -q "$NFS_PATH"; then echo "passive"; return 0; fi
local tf="$NFS_PATH/.rw_probe_$$"
if : > "$tf" 2>/dev/null; then rm -f "$tf" || true; echo "active"; else echo "passive"; fi
}
make_rmq_nodes() {
local list="$1" out="" h
for h in $list; do
if [[ "$RBQ_NAME_MODE" = "fqdn" ]]; then out+="${RBQ_NODE_PREFIX}@${h} "
else out+="${RBQ_NODE_PREFIX}@${h%%.*} "
fi
done
echo "$out"
}
svc_running() {
if command -v morpheusctl >/dev/null 2>&1; then
morpheusctl status "$MORPHEUS_SVC" 2>/dev/null | grep -qi "running"
elif command -v morpheus-ctl >/dev/null 2>&1; then
morpheus-ctl status "$MORPHEUS_SVC" 2>/dev/null | grep -qi "run"
else
systemctl is-active --quiet "$MORPHEUS_SVC"
fi
}
svc_stop() {
if [[ "$TESTMODE" == "1" ]]; then log "TESTMODE, would stop $MORPHEUS_SVC"; return 0; fi
if command -v morpheusctl >/dev/null 2>&1; then
morpheusctl stop "$MORPHEUS_SVC"
elif command -v morpheus-ctl >/dev/null 2>&1; then
morpheus-ctl stop "$MORPHEUS_SVC"
else
systemctl stop "$MORPHEUS_SVC"
fi
}
peer_tcp_reachable() {
local peers="$1" me="$(short_host)" p short
for p in $peers; do
short="${p%%.*}"; [[ "$short" == "$me" ]] && continue
if timeout 2 bash -lc ">/dev/tcp/$short/443" 2>/dev/null || timeout 2 bash -lc ">/dev/tcp/$short/80" 2>/dev/null; then
return 0
fi
done
return 1
}
rand_sleep_between() {
local min="$1" max="$2" span secs
(( max <= min )) && { sleep "$min"; return; }
span=$(( max - min + 1 ))
secs=$(( min + RANDOM % span ))
log "Jitter sleep ${secs}s before UI all backends precheck"
sleep "$secs"
}
# -------------- updated UI probe with redirects and optional domain --------------
probe_ui_host() {
# Probe one UI host. Follow redirects, accept 200 or 302, require marker present in final body.
local host="$1" fqdn url code tmp="/tmp/morph_ui.$$" body mark="absent"
fqdn="$host"
if [[ -n "$UI_DOMAIN" && "$host" != *.* ]]; then
fqdn="${host}.${UI_DOMAIN}"
fi
url="https://${fqdn}/"
code="$(curl -k -s -L -m "$UI_PROBE_TIMEOUT" -o "$tmp" -w '%{http_code}' "$url")" || {
rm -f "$tmp"
log "[UI] ${fqdn} http_code=none marker=absent result=FAIL, curl error"
return 1
}
body="$(tr -d '\r' <"$tmp" | head -c 4000 || true)"
rm -f "$tmp"
if grep -qi "Morpheus" <<<"$body"; then mark="present"; fi
if [[ "$code" == "200" || "$code" == "302" ]] && [[ "$mark" == "present" ]]; then
log "[UI] ${fqdn} http_code=${code} marker=${mark} result=OK"
return 0
fi
log "[UI] ${fqdn} http_code=${code} marker=${mark} result=FAIL"
log "[UI] ${fqdn} body(head)=$(printf '%s' "$body" | tr -d '\n' | head -c 120)"
return 1
}
check_all_ui_backends_up() {
# Require all peers in APP_PEERS to answer OK
local peers="$1" ok=0 total=0 bad_list="" h short i
for h in $peers; do
short="${h%%.*}"; total=$((total+1))
log "[UI] probing ${short}"
for (( i=0; i<=UI_PROBE_RETRIES; i++ )); do
if probe_ui_host "$short"; then ok=$((ok+1)); break; fi
sleep 1
done
if (( i > UI_PROBE_RETRIES )); then bad_list+="$short "; fi
done
log "[Active pre] UI backends summary, ok=${ok} total=${total} bad=${bad_list:-none}"
(( ok == total ))
}
rmq_cluster_json() {
[[ -x "$RABBITMQCTL_PATH" ]] || { log "rabbitmqctl not executable at $RABBITMQCTL_PATH"; return 1; }
bash -lc "[[ -f '$RABBIT_PROFILE' ]] && source '$RABBIT_PROFILE' || true; '$RABBITMQCTL_PATH' -q cluster_status --formatter json"
}
check_rabbitmq_dc() {
local expect_nodes="$1" json running partitions missing=""
json="$(rmq_cluster_json)" || { log "rabbitmq cluster_status failed"; return 1; }
if have jq; then
running="$(printf '%s' "$json" | jq -r '.running_nodes[]? // empty' | xargs echo)"
partitions="$(printf '%s' "$json" | jq -r '.partitions | length')"
else
running="$(printf '%s' "$json" | sed -n 's/.*"running_nodes":[[]\([^]]*\)[]].*/\1/p' | tr -d '"' | tr ',' ' ')"
partitions="$(printf '%s' "$json" | sed -n 's/.*"partitions":[[]\([^]]*\)[]].*/\1/p' | wc -w | tr -d ' ')"
fi
log "RabbitMQ running nodes, $running"
[[ "$partitions" =~ ^[0-9]+$ ]] || partitions=0
if [[ "$partitions" -gt 0 ]]; then log "RabbitMQ partitions detected, $partitions"; return 1; fi
for n in $expect_nodes; do grep -qw -- "$n" <<<"$running" || missing+="$n "; done
[[ -z "$missing" ]] || { log "Missing expected RMQ nodes, $missing"; return 1; }
return 0
}
check_elastic() {
local status
status="$(curl -fsS "$ES_URL" | sed -n 's/.*"status":"\([^"]*\)".*/\1/p')" || true
[[ -n "$status" ]] || { log "Elasticsearch query failed at $ES_URL"; return 1; }
log "Elasticsearch status, $status"
[[ ",$ES_OK_STATUSES," == *",$status,"* ]]
}
lock_acquire() {
if [[ "$TESTMODE" == "1" ]]; then log "TESTMODE, would acquire active patch lock at $LOCK_DIR_PATH"; return 0; fi
if mkdir "$LOCK_DIR_PATH" 2>/dev/null; then
echo "host=$(short_host) time=$(date '+%F %T')" > "$LOCK_DIR_PATH/owner"
sync || true
log "Acquired active patch lock at $LOCK_DIR_PATH"
return 0
fi
log "Active patch lock already held, details, $(cat "$LOCK_DIR_PATH/owner" 2>/dev/null || echo unknown)"
return 1
}
# ---------- main ----------
DC="$(detect_dc || true)" || { log "Cannot detect DC"; exit 20; }
case "$DC" in
dc1) EXPECT_RMQ_NODES="${RMQ_NODES_DC1_OVERRIDE:-$(make_rmq_nodes "$APP_NODES_DC1")}" ; APP_PEERS="$APP_NODES_DC1" ;;
dc2) EXPECT_RMQ_NODES="${RMQ_NODES_DC2_OVERRIDE:-$(make_rmq_nodes "$APP_NODES_DC2")}" ; APP_PEERS="$APP_NODES_DC2" ;;
*) log "Unknown DC"; exit 21 ;;
esac
log "Detected DC, $DC"
log "Expected RMQ nodes, $EXPECT_RMQ_NODES"
log "APP_PEERS, $APP_PEERS"
ROLE_DETECTED="$(detect_role)"
ROLE="$ROLE_DETECTED"
if [[ "${PATCH_ROLE}" != "auto" ]]; then ROLE="${PATCH_ROLE}"; fi
log "Detected site role, $ROLE_DETECTED, using $ROLE"
if [[ "$ROLE" == "active" ]]; then
rand_sleep_between "$UI_PRECHECK_JITTER_MIN" "$UI_PRECHECK_JITTER_MAX"
log "[Active pre] Checking that all UI backends are healthy"
check_all_ui_backends_up "$APP_PEERS" || { log "Not all UI backends are healthy"; exit 5; }
log "[Active pre] Attempting to acquire shared lock at $LOCK_DIR_PATH"
lock_acquire || { log "Could not acquire active patch lock"; exit 6; }
log "[Active pre] Checking peer availability before stopping UI"
peer_tcp_reachable "$APP_PEERS" || { log "No peer UI reachable"; exit 7; }
log "[Active pre] RabbitMQ"
check_rabbitmq_dc "$EXPECT_RMQ_NODES" || exit 8
log "[Active pre] Elasticsearch"
check_elastic || exit 9
log "[Active pre] Stopping Morpheus UI on active node"
if svc_running; then svc_stop; sleep 2; fi
if [[ "$TESTMODE" != "1" ]] && svc_running; then log "Morpheus UI still running after stop"; exit 10; fi
log "Active PRE checks passed, node is safe to patch"
exit 0
else
if svc_running; then
log "Morpheus UI is running on passive, stopping it"
svc_stop; sleep 2
fi
if [[ "$TESTMODE" != "1" ]] && svc_running; then log "Morpheus UI still running on passive after stop"; exit 40; fi
log "[Passive pre] RabbitMQ"
check_rabbitmq_dc "$EXPECT_RMQ_NODES" || exit 41
log "[Passive pre] Elasticsearch"
check_elastic || exit 42
log "Passive PRE checks passed, node is safe to patch"
exit 0
fi
Debug run with a clean root environment:
sudo env -i PATH=/usr/sbin:/usr/bin:/sbin:/bin \
HOME=/root TERM=xterm LANG=C \
/bin/bash -x /usr/local/sbin/morph-app-pre.sh 2>&1 | tee /root/morph-pre.debug.log
echo "rc=$?"
Run under BigFix:
wait /bin/bash -lc "/usr/local/sbin/morph-app-pre.sh 2>&1 | tee -a /var/log/morph-pre.bigfix.log ; exit ${PIPESTATUS[0]}"
Confirm the NFS mount used for role detection:
mount | grep -E "/var/opt/morpheus/morpheus-ui"
ls -ld /var/opt/morpheus/morpheus-ui
# consistent PATH for non-login shells
export PATH="/usr/sbin:/usr/bin:/sbin:/bin"
Harden the script so it behaves under BigFix.
Add these lines at the very top, right after set -euo pipefail:
# optional debug
DEBUG="${DEBUG:-0}"
if [[ "$DEBUG" == "1" ]]; then
set -x
exec > >(tee -a "$LOG_FILE") 2>&1
fi
Use absolute paths for tools that are often missing from PATH in a non-login root shell:
HOSTNAME_BIN="/usr/bin/hostname"
MOUNTPOINT_BIN="/usr/bin/mountpoint"
IP_BIN="/usr/sbin/ip"
CURL_BIN="/usr/bin/curl"
SYSTEMCTL_BIN="/usr/bin/systemctl"
GREP_BIN="/usr/bin/grep"
SED_BIN="/usr/bin/sed"
TEE_BIN="/usr/bin/tee"
DATE_BIN="/usr/bin/date"
Then update helpers to reference them. For example:
short_host(){ "$HOSTNAME_BIN" -s 2>/dev/null || "$HOSTNAME_BIN"; }
detect_role() {
if ! "$MOUNTPOINT_BIN" -q "$NFS_PATH"; then echo "passive"; return 0; fi
local tf="$NFS_PATH/.rw_probe_$$"
if : > "$tf" 2>/dev/null; then rm -f "$tf" || true; echo "active"; else echo "passive"; fi
}
Drop this at the top of the script to make it resilient without touching every call.
set -euo pipefail
export PATH="/usr/sbin:/usr/bin:/sbin:/bin"
DEBUG="${DEBUG:-0}"
if [[ "$DEBUG" == "1" ]]; then set -x; fi
which_or_fail() { command -v "$1" >/dev/null 2>&1 || { echo "missing binary, $1" >&2; exit 1; }; }
for bin in hostname mountpoint ip curl systemctl grep sed tee date; do which_or_fail "$bin"; done
Then run under BigFix with:
wait /bin/bash -lc "DEBUG=1 /usr/local/sbin/morph-app-pre.sh 2>&1 | /usr/bin/tee -a /var/log/morph-pre.bigfix.log ; exit ${PIPESTATUS[0]}"
PRE-DB Script:
#!/usr/bin/env bash
# PRE script for Morpheus DB nodes
# Safely prove it is OK to patch this node, then stop MySQL
# Features, strict mode, fixed PATH, binary checks, detailed logging, TESTMODE, NFS lock on active site, cluster health checks, primary migration if needed, randomized backoff and final recheck
set -euo pipefail
export PATH="/usr/sbin:/usr/bin:/sbin:/bin"
# ---------- required binaries ----------
which_or_fail(){ command -v "$1" >/dev/null 2>&1 || { echo "missing binary, $1" >&2; exit 1; }; }
for bin in hostname mountpoint ip systemctl grep sed tee date awk tail mysqlsh id whoami tty ps; do which_or_fail "$bin"; done
# ---------- config ----------
MYSQL_SVC="${MYSQL_SVC:-mysql}"
# NFS lock on active site only
NFS_PATH="${NFS_PATH:-/var/opt/morpheus/morpheus-ui}"
LOCK_DIR_NAME="${LOCK_DIR_NAME:-.db_patch_active_lock}"
LOCK_DIR_PATH="${LOCK_DIR_PATH:-$NFS_PATH/$LOCK_DIR_NAME}"
# cluster expectations and timings
REQUIRED_COUNT="${REQUIRED_COUNT:-3}"
JITTER_MIN="${JITTER_MIN:-2}"
JITTER_MAX="${JITTER_MAX:-10}"
# optional override, values active, passive, auto
PATCH_ROLE="${PATCH_ROLE:-auto}"
# dry run and debug
TESTMODE="${TESTMODE:-0}"
DEBUG="${DEBUG:-0}"
# logging
LOG_DIR="${LOG_DIR:-/data/morpheus_patching/logs}"
LOG_FILE="${LOG_FILE:-$LOG_DIR/mysqldb_patch.log}"
mkdir -p "$LOG_DIR" 2>/dev/null || true
# enable xtrace if requested, with timestamped PS4
if [[ "$DEBUG" == "1" ]]; then
export PS4='+ $(date "+%F %T") ${BASH_SOURCE##*/}:${LINENO}: '
set -x
fi
# send all output to console and log
exec > >(tee -a "$LOG_FILE") 2>&1
log(){ printf "%s %s\n" "$(date '+%F %T')" "$*"; }
trap 'rc=$?; log "[SUMMARY] PRE exiting rc=$rc TESTMODE=$TESTMODE"; exit $rc' EXIT
# identity capture with absolute paths and fallbacks
UID_NUM="$(/usr/bin/id -u 2>/dev/null || echo NA)"
RUN_AS="$([ -x /usr/bin/whoami ] && /usr/bin/whoami 2>/dev/null || echo NA)"
TTY_DEV="$(/usr/bin/tty 2>/dev/null || echo NA)"
PPID_NUM="${PPID:-NA}"
PPROC_NAME="$(/usr/bin/ps -o comm= -p "${PPID_NUM}" 2>/dev/null || echo NA)"
log "Starting $(basename "$0") uid=${UID_NUM} user=${RUN_AS} tty=${TTY_DEV} ppid=${PPID_NUM} pproc=${PPROC_NAME}"
log "PATH=$PATH"
log "LOG_FILE=$LOG_FILE"
# ---------- helpers ----------
short_host(){ hostname -s 2>/dev/null || hostname; }
# helper, run a SQL statement with mysqlsh and return the last line of output
sql_last() {
local q="$1"
mysqlsh --sql -e "$q" | tail -n1
}
role_reason="unknown"
detect_role() {
if ! mountpoint -q "$NFS_PATH"; then
role_reason="nfs_not_mounted"
echo "passive"
return 0
fi
local tf="$NFS_PATH/.rw_probe_$$"
if : > "$tf" 2>/dev/null; then
rm -f "$tf" || true
role_reason="nfs_rw_ok"
echo "active"
else
role_reason="nfs_ro_or_perm_denied"
echo "passive"
fi
}
lock_acquire() {
if [[ "$TESTMODE" == "1" ]]; then
log "TESTMODE, would acquire DB patch lock at $LOCK_DIR_PATH"
return 0
fi
if mkdir "$LOCK_DIR_PATH" 2>/dev/null; then
printf "host=%s time=%s pid=%s\n" "$(short_host)" "$(date '+%F %T')" "$$" > "$LOCK_DIR_PATH/owner" || true
sync || true
log "Acquired DB patch lock at $LOCK_DIR_PATH"
return 0
fi
local owner="unknown"
[[ -f "$LOCK_DIR_PATH/owner" ]] && owner="$(cat "$LOCK_DIR_PATH/owner" 2>/dev/null || true)"
log "DB patch lock already held at $LOCK_DIR_PATH owner $owner"
return 1
}
rand_sleep_between() {
local min="$1" max="$2" span secs
if (( max <= min )); then
secs="$min"
else
span=$(( max - min + 1 ))
secs=$(( min + RANDOM % span ))
fi
log "Randomized backoff before stop, sleeping ${secs}s"
sleep "$secs"
}
require_cluster_ok(){
sql_last "SELECT CASE WHEN COUNT(*)=${REQUIRED_COUNT} AND SUM(MEMBER_STATE='ONLINE')=${REQUIRED_COUNT} THEN 'OK' ELSE 'NOT_OK' END FROM performance_schema.replication_group_members;"
}
# ---------- main flow ----------
# 1, classify site role and acquire lock if active
ROLE_DETECTED="$(detect_role)"
ROLE="$ROLE_DETECTED"
if [[ "$PATCH_ROLE" != "auto" ]]; then ROLE="$PATCH_ROLE"; fi
log "Detected site role ${ROLE_DETECTED} reason ${role_reason} using ${ROLE}"
if [[ "$ROLE" == "active" ]]; then
log "Attempting to acquire active site lock"
lock_acquire || { log "Failed to acquire active site lock"; exit 95; }
else
log "Passive site, skipping lock"
fi
# 2, initial cluster health gates
log "Checking cluster health gates"
status="$(require_cluster_ok || true)"
count="$(sql_last "SELECT COUNT(*) FROM performance_schema.replication_group_members;" || echo 0)"
bad_count="$(sql_last "SELECT COALESCE(SUM(MEMBER_STATE!='ONLINE'),0) FROM performance_schema.replication_group_members;" || echo 1)"
log "Cluster snapshot status=${status} count=${count} bad_count=${bad_count}"
if [[ "$status" != "OK" ]]; then log "Cluster not healthy status=${status}"; exit 2; fi
if [[ "$count" != "$REQUIRED_COUNT" ]]; then log "Unexpected member count ${count} expected ${REQUIRED_COUNT}"; exit 3; fi
if [[ "$bad_count" != "0" ]]; then log "Some members not ONLINE bad_count=${bad_count}"; exit 4; fi
# 3, if this node is primary, promote a secondary
this_is_primary="$(sql_last "SELECT CASE WHEN EXISTS(SELECT 1 FROM performance_schema.replication_group_members WHERE MEMBER_ID = @@server_uuid AND MEMBER_ROLE='PRIMARY') THEN 1 ELSE 0 END;" || echo 0)"
if [[ "$this_is_primary" == "1" ]]; then
# pick a secondary target
target="$(mysqlsh --sql -e "SELECT MEMBER_HOST FROM performance_schema.replication_group_members WHERE MEMBER_ROLE!='PRIMARY' ORDER BY MEMBER_HOST LIMIT 1;" | tail -n1 || true)"
if [[ -z "$target" ]]; then
log "No secondary found to promote"
exit 5
fi
if [[ "$TESTMODE" == "1" ]]; then
log "TESTMODE, would promote ${target} to PRIMARY"
else
log "Promoting ${target} to PRIMARY"
# run getCluster and setPrimaryInstance in one -e so the cluster handle stays in scope
mysqlsh --js -e "var c = dba.getCluster(); c.setPrimaryInstance('${target}');" \
|| { log "setPrimaryInstance call failed"; exit 6; }
fi
newp="$(sql_last "SELECT MEMBER_HOST FROM performance_schema.replication_group_members WHERE MEMBER_ROLE='PRIMARY';" || echo "")"
if [[ "$newp" != "$target" ]]; then
log "Primary promotion verification failed expected ${target} got ${newp:-empty}"
exit 6
fi
log "Primary is now ${newp}"
else
log "This node is not PRIMARY, continuing"
fi
# 4, refuse to proceed if MySQL already stopped
if ! systemctl is-active --quiet "$MYSQL_SVC"; then
log "MySQL service ${MYSQL_SVC} is not active on this node, refusing to proceed"
exit 7
fi
# 5, final race protection, jitter then recheck health
rand_sleep_between "$JITTER_MIN" "$JITTER_MAX"
final_status="$(require_cluster_ok || true)"
if [[ "$final_status" != "OK" ]]; then
log "Cluster changed during pre, final_status=${final_status}, aborting to avoid overlap"
exit 20
fi
log "Final health check OK, proceeding to stop MySQL"
# 6, stop MySQL or simulate
if [[ "$TESTMODE" == "1" ]]; then
log "TESTMODE, would stop service ${MYSQL_SVC}"
else
log "Stopping service ${MYSQL_SVC}"
systemctl stop "$MYSQL_SVC"
fi
log "PRE checks passed, node is safe to patch"
exit 0
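For a quick manual look at the same gate the script evaluates, the underlying query can be run directly, a sketch assuming mysqlsh connects via ~/.my.cnf as above:
mysqlsh --sql -e "SELECT MEMBER_HOST, MEMBER_PORT, MEMBER_STATE, MEMBER_ROLE FROM performance_schema.replication_group_members;"
All members (REQUIRED_COUNT, default 3) should show ONLINE and exactly one should be PRIMARY before patching.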
PRE-DB script v2:
#!/usr/bin/env bash
# PRE script for Morpheus DB nodes
# Purpose, prove it is safe to patch this node, then stop MySQL safely
# Features, strict mode, fixed PATH, binary checks, detailed logging, TESTMODE,
# active site NFS lock, cluster health checks, primary migration if needed,
# randomized backoff and final recheck
set -euo pipefail
export PATH="/usr/sbin:/usr/bin:/sbin:/bin"
# ---------- required binaries ----------
which_or_fail(){ command -v "$1" >/dev/null 2>&1 || { echo "missing binary, $1" >&2; exit 1; }; }
for bin in hostname mountpoint ip systemctl grep sed tee date awk tail mysqlsh id whoami tty ps; do which_or_fail "$bin"; done
# ---------- config ----------
MYSQL_SVC="${MYSQL_SVC:-mysql}"
# NFS lock on active site only
NFS_PATH="${NFS_PATH:-/var/opt/morpheus/morpheus-ui}"
LOCK_DIR_NAME="${LOCK_DIR_NAME:-.db_patch_active_lock}"
LOCK_DIR_PATH="${LOCK_DIR_PATH:-$NFS_PATH/$LOCK_DIR_NAME}"
# cluster expectations and timings
REQUIRED_COUNT="${REQUIRED_COUNT:-3}"
JITTER_MIN="${JITTER_MIN:-2}"
JITTER_MAX="${JITTER_MAX:-10}"
# optional override, values active, passive, auto
PATCH_ROLE="${PATCH_ROLE:-auto}"
# dry run and debug
TESTMODE="${TESTMODE:-0}"
DEBUG="${DEBUG:-0}"
# logging
LOG_DIR="${LOG_DIR:-/data/morpheus_patching/logs}"
LOG_FILE="${LOG_FILE:-$LOG_DIR/mysqldb_patch.log}"
mkdir -p "$LOG_DIR" 2>/dev/null || true
# enable xtrace if requested, with timestamped PS4
if [[ "$DEBUG" == "1" ]]; then
export PS4='+ $(date "+%F %T") ${BASH_SOURCE##*/}:${LINENO}: '
set -x
fi
# send all output to console and log
exec > >(tee -a "$LOG_FILE") 2>&1
log(){ printf "%s %s\n" "$(date '+%F %T')" "$*"; }
trap 'rc=$?; log "[SUMMARY] PRE exiting rc=$rc TESTMODE=$TESTMODE"; exit $rc' EXIT
# identity capture with absolute paths and fallbacks
UID_NUM="$(/usr/bin/id -u 2>/dev/null || echo NA)"
RUN_AS="$([ -x /usr/bin/whoami ] && /usr/bin/whoami 2>/dev/null || echo NA)"
TTY_DEV="$(/usr/bin/tty 2>/dev/null || echo NA)"
PPID_NUM="${PPID:-NA}"
PPROC_NAME="$(/usr/bin/ps -o comm= -p "${PPID_NUM}" 2>/dev/null || echo NA)"
log "Starting $(basename "$0") uid=${UID_NUM} user=${RUN_AS} tty=${TTY_DEV} ppid=${PPID_NUM} pproc=${PPROC_NAME}"
log "PATH=$PATH"
log "LOG_FILE=$LOG_FILE"
# ---------- HOME and mysqlsh connection bootstrap ----------
# ensure HOME so mysqlsh can read ~/.my.cnf
if [[ -z "${HOME:-}" || "$HOME" = "/" ]]; then
export HOME=/root
fi
log "HOME=$HOME"
if [[ -r "$HOME/.my.cnf" ]]; then
size="$(stat -c %s "$HOME/.my.cnf" 2>/dev/null || echo '?')"
log "~/.my.cnf found, size=${size} bytes"
else
log "WARNING, $HOME/.my.cnf not found or not readable, mysqlsh may not auto connect"
fi
MYSQL_PORT="${MYSQL_PORT:-3306}"
mysqlsh_probe() {
# try default option file connection
if mysqlsh --sql -e "SELECT 1;" >/dev/null 2>&1; then return 0; fi
# try explicit URI if provided
if [[ -n "${MYSQL_URI:-}" ]]; then
log "mysqlsh default connect failed, trying MYSQL_URI"
if mysqlsh --sql --uri "$MYSQL_URI" -e "SELECT 1;" >/dev/null 2>&1; then return 0; fi
fi
# try explicit localhost as last resort
log "mysqlsh still not connected, trying explicit localhost ${MYSQL_PORT}"
if mysqlsh --sql --host=127.0.0.1 --port="$MYSQL_PORT" -e "SELECT 1;" >/dev/null 2>&1; then return 0; fi
return 1
}
if ! mysqlsh_probe; then
log "ERROR, mysqlsh could not connect, check HOME and ~/.my.cnf or set MYSQL_URI"
exit 91
fi
# ---------- helpers ----------
short_host(){ hostname -s 2>/dev/null || hostname; }
sql_last() {
local q="$1"
mysqlsh --sql -e "$q" | tail -n1
}
role_reason="unknown"
detect_role() {
if ! mountpoint -q "$NFS_PATH"; then
role_reason="nfs_not_mounted"
echo "passive"
return 0
fi
local tf="$NFS_PATH/.rw_probe_$$"
if : > "$tf" 2>/dev/null; then
rm -f "$tf" || true
role_reason="nfs_rw_ok"
echo "active"
else
role_reason="nfs_ro_or_perm_denied"
echo "passive"
fi
}
lock_acquire() {
if [[ "$TESTMODE" == "1" ]]; then
log "TESTMODE, would acquire DB patch lock at $LOCK_DIR_PATH"
return 0
fi
if mkdir "$LOCK_DIR_PATH" 2>/dev/null; then
printf "host=%s time=%s pid=%s\n" "$(short_host)" "$(date '+%F %T')" "$$" > "$LOCK_DIR_PATH/owner" || true
sync || true
log "Acquired DB patch lock at $LOCK_DIR_PATH"
return 0
fi
local owner="unknown"
[[ -f "$LOCK_DIR_PATH/owner" ]] && owner="$(cat "$LOCK_DIR_PATH/owner" 2>/dev/null || true)"
log "DB patch lock already held at $LOCK_DIR_PATH owner $owner"
return 1
}
rand_sleep_between() {
local min="$1" max="$2" span secs
if (( max <= min )); then
secs="$min"
else
span=$(( max - min + 1 ))
secs=$(( min + RANDOM % span ))
fi
log "Randomized backoff before stop, sleeping ${secs}s"
sleep "$secs"
}
require_cluster_ok(){
sql_last "SELECT CASE WHEN COUNT(*)=${REQUIRED_COUNT} AND SUM(MEMBER_STATE='ONLINE')=${REQUIRED_COUNT} THEN 'OK' ELSE 'NOT_OK' END FROM performance_schema.replication_group_members;"
}
# ---------- main flow ----------
# 1, classify site role and acquire lock if active
ROLE_DETECTED="$(detect_role)"
ROLE="$ROLE_DETECTED"
if [[ "$PATCH_ROLE" != "auto" ]]; then ROLE="$PATCH_ROLE"; fi
log "Detected site role ${ROLE_DETECTED} reason ${role_reason} using ${ROLE}"
if [[ "$ROLE" == "active" ]]; then
log "Attempting to acquire active site lock"
lock_acquire || { log "Failed to acquire active site lock"; exit 95; }
else
log "Passive site, skipping lock"
fi
# 2, initial cluster health gates
log "Checking cluster health gates"
status="$(require_cluster_ok || true)"
count="$(sql_last "SELECT COUNT(*) FROM performance_schema.replication_group_members;" || echo 0)"
bad_count="$(sql_last "SELECT COALESCE(SUM(MEMBER_STATE!='ONLINE'),0) FROM performance_schema.replication_group_members;" || echo 1)"
log "Cluster snapshot status=${status} count=${count} bad_count=${bad_count}"
if [[ "$status" != "OK" ]]; then log "Cluster not healthy status=${status}"; exit 2; fi
if [[ "$count" != "$REQUIRED_COUNT" ]]; then log "Unexpected member count ${count} expected ${REQUIRED_COUNT}"; exit 3; fi
if [[ "$bad_count" != "0" ]]; then log "Some members not ONLINE bad_count=${bad_count}"; exit 4; fi
# 3, if this node is primary, promote a secondary
this_is_primary="$(sql_last "SELECT CASE WHEN EXISTS(SELECT 1 FROM performance_schema.replication_group_members WHERE MEMBER_ID = @@server_uuid AND MEMBER_ROLE='PRIMARY') THEN 1 ELSE 0 END;" || echo 0)"
if [[ "$this_is_primary" == "1" ]]; then
target="$(mysqlsh --sql -e "SELECT MEMBER_HOST FROM performance_schema.replication_group_members WHERE MEMBER_ROLE!='PRIMARY' ORDER BY MEMBER_HOST LIMIT 1;" | tail -n1 || true)"
if [[ -z "$target" ]]; then
log "No secondary found to promote"
exit 5
fi
if [[ "$TESTMODE" == "1" ]]; then
log "TESTMODE, would promote ${target} to PRIMARY"
else
log "Promoting ${target} to PRIMARY"
# run getCluster and setPrimaryInstance in one -e so the cluster handle stays in scope
mysqlsh --js -e "var c = dba.getCluster(); c.setPrimaryInstance('${target}');" \
|| { log "setPrimaryInstance call failed"; exit 6; }
fi
newp="$(sql_last "SELECT MEMBER_HOST FROM performance_schema.replication_group_members WHERE MEMBER_ROLE='PRIMARY';" || echo "")"
if [[ "$newp" != "$target" ]]; then
log "Primary promotion verification failed expected ${target} got ${newp:-empty}"
exit 6
fi
log "Primary is now ${newp}"
else
log "This node is not PRIMARY, continuing"
fi
# 4, refuse to proceed if MySQL already stopped
if ! systemctl is-active --quiet "$MYSQL_SVC"; then
log "MySQL service ${MYSQL_SVC} is not active on this node, refusing to proceed"
exit 7
fi
# 5, final race protection, jitter then recheck health
rand_sleep_between "$JITTER_MIN" "$JITTER_MAX"
final_status="$(require_cluster_ok || true)"
if [[ "$final_status" != "OK" ]]; then
log "Cluster changed during pre, final_status=${final_status}, aborting to avoid overlap"
exit 20
fi
log "Final health check OK, proceeding to stop MySQL"
# 6, stop MySQL or simulate
if [[ "$TESTMODE" == "1" ]]; then
log "TESTMODE, would stop service ${MYSQL_SVC}"
else
log "Stopping service ${MYSQL_SVC}"
systemctl stop "$MYSQL_SVC"
fi
log "PRE checks passed, node is safe to patch"
exit 0
Separator banner for the shared log, used when redirecting all output:
exec > >(tee -a "$LOG_FILE") 2>&1; printf '############################## %s ##############################\n' "$(date '+%F %T')" | tee -a "$LOG_FILE"
POST:
#!/usr/bin/env bash
# POST script for Morpheus DB nodes
# Purpose, after reboot ensure MySQL is up, this node rejoins ONLINE, whole cluster is healthy,
# recovery is complete, then release the NFS lock on the active site if this host owns it.
# Features, strict mode, fixed PATH, binary checks, detailed logging, TESTMODE, DEBUG,
# HOME bootstrap for mysqlsh, MYSQL_URI optional, safe lock release.
set -euo pipefail
export PATH="/usr/sbin:/usr/bin:/sbin:/bin"
# ---------- required binaries ----------
which_or_fail(){ command -v "$1" >/dev/null 2>&1 || { echo "missing binary, $1" >&2; exit 1; }; }
for bin in hostname mountpoint ip systemctl grep sed tee date awk tail mysqlsh id whoami tty ps stat sleep; do which_or_fail "$bin"; done
# ---------- config ----------
MYSQL_SVC="${MYSQL_SVC:-mysql}"
# NFS lock on active site only, must match PRE
NFS_PATH="${NFS_PATH:-/var/opt/morpheus/morpheus-ui}"
LOCK_DIR_NAME="${LOCK_DIR_NAME:-.db_patch_active_lock}"
LOCK_DIR_PATH="${LOCK_DIR_PATH:-$NFS_PATH/$LOCK_DIR_NAME}"
# cluster expectations and timings
REQUIRED_COUNT="${REQUIRED_COUNT:-3}"
WAIT_TIMEOUT="${WAIT_TIMEOUT:-900}" # total seconds to wait for each phase
WAIT_INTERVAL="${WAIT_INTERVAL:-10}" # poll interval seconds
# optional override, values active, passive, auto
PATCH_ROLE="${PATCH_ROLE:-auto}"
# dry run and debug
TESTMODE="${TESTMODE:-0}"
DEBUG="${DEBUG:-0}"
# logging
LOG_DIR="${LOG_DIR:-/data/morpheus_patching/logs}"
LOG_FILE="${LOG_FILE:-$LOG_DIR/mysqldb_patch.log}"
mkdir -p "$LOG_DIR" 2>/dev/null || true
if [[ "$DEBUG" == "1" ]]; then
export PS4='+ $(date "+%F %T") ${BASH_SOURCE##*/}:${LINENO}: '
set -x
fi
# send all output to console and log, then print a separator banner
exec > >(tee -a "$LOG_FILE") 2>&1; printf '############################## %s ##############################\n' "$(date '+%F %T')" | tee -a "$LOG_FILE"
log(){ printf "%s %s\n" "$(date '+%F %T')" "$*"; }
trap 'rc=$?; log "[SUMMARY] POST exiting rc=$rc TESTMODE=$TESTMODE"; exit $rc' EXIT
# identity capture with absolute paths and fallbacks
UID_NUM="$(/usr/bin/id -u 2>/dev/null || echo NA)"
RUN_AS="$([ -x /usr/bin/whoami ] && /usr/bin/whoami 2>/dev/null || echo NA)"
TTY_DEV="$(/usr/bin/tty 2>/dev/null || echo NA)"
PPID_NUM="${PPID:-NA}"
PPROC_NAME="$(/usr/bin/ps -o comm= -p "${PPID_NUM}" 2>/dev/null || echo NA)"
log "Starting $(basename "$0") uid=${UID_NUM} user=${RUN_AS} tty=${TTY_DEV} ppid=${PPID_NUM} pproc=${PPROC_NAME}"
log "PATH=$PATH"
log "LOG_FILE=$LOG_FILE"
# ---------- HOME and mysqlsh connection bootstrap ----------
if [[ -z "${HOME:-}" || "$HOME" = "/" ]]; then
export HOME=/root
fi
log "HOME=$HOME"
if [[ -r "$HOME/.my.cnf" ]]; then
size="$(stat -c %s "$HOME/.my.cnf" 2>/dev/null || echo '?')"
log "~/.my.cnf found, size=${size} bytes"
else
log "WARNING, $HOME/.my.cnf not found or not readable, mysqlsh may not auto connect"
fi
log "mysqlsh version: $(mysqlsh --version 2>&1)"
log "mysqlsh path: $(command -v mysqlsh 2>/dev/null || echo 'not found')"
MYSQL_PORT="${MYSQL_PORT:-3306}"
# If MYSQL_URI provided, prefer it
mysqlsh_try(){
if [[ -n "${MYSQL_URI:-}" ]]; then
mysqlsh --sql --uri "$MYSQL_URI" -e "SELECT 1;" >/dev/null 2>&1 && return 0
fi
mysqlsh --sql -e "SELECT 1;" >/dev/null 2>&1 && return 0
mysqlsh --sql --host=127.0.0.1 --port="$MYSQL_PORT" -e "SELECT 1;" >/dev/null 2>&1 && return 0
return 1
}
# ---------- helpers ----------
short_host(){ hostname -s 2>/dev/null || hostname; }
sql_last() {
local q="$1"
if [[ -n "${MYSQL_URI:-}" ]]; then
mysqlsh --sql --uri "$MYSQL_URI" -e "$q" | tail -n1
else
mysqlsh --sql -e "$q" | tail -n1
fi
}
role_reason="unknown"
detect_role() {
if ! mountpoint -q "$NFS_PATH"; then
role_reason="nfs_not_mounted"
echo "passive"
return 0
fi
local tf="$NFS_PATH/.rw_probe_$$"
if : > "$tf" 2>/dev/null; then
rm -f "$tf" || true
role_reason="nfs_rw_ok"
echo "active"
else
role_reason="nfs_ro_or_perm_denied"
echo "passive"
fi
}
lock_owned_by_me(){ [[ -f "$LOCK_DIR_PATH/owner" ]] && grep -q "host=$(short_host)" "$LOCK_DIR_PATH/owner"; }
lock_release() {
if [[ "$TESTMODE" == "1" ]]; then
log "TESTMODE, would release DB patch lock at $LOCK_DIR_PATH"
return 0
fi
if lock_owned_by_me; then
rm -rf "$LOCK_DIR_PATH" || true
log "Released DB patch lock"
else
log "No lock owned by this host to release, skipping"
fi
}
wait_until() {
# wait_until "label" timeout_seconds interval_seconds cmd...
local label="$1" timeout="$2" interval="$3"; shift 3
local deadline=$(( $(date +%s) + timeout ))
local now
while :; do
if "$@"; then return 0; fi
now=$(date +%s); if (( now >= deadline )); then return 1; fi
log "Waiting for ${label}, sleeping ${interval}s"
sleep "$interval"
done
}
# ---------- main flow ----------
# 0, ensure MySQL service is running
if ! systemctl is-active --quiet "$MYSQL_SVC"; then
if [[ "$TESTMODE" == "1" ]]; then
log "TESTMODE, would start $MYSQL_SVC"
else
log "Starting $MYSQL_SVC"
systemctl start "$MYSQL_SVC"
fi
else
log "Service $MYSQL_SVC already active"
fi
# 1, wait for SQL connectivity
if wait_until "SQL connectivity" "$WAIT_TIMEOUT" "$WAIT_INTERVAL" mysqlsh_try; then
log "SQL connectivity OK"
else
log "SQL did not become reachable within ${WAIT_TIMEOUT}s"
exit 30
fi
# 2, classify site role for later lock release
ROLE_DETECTED="$(detect_role)"
ROLE="$ROLE_DETECTED"
if [[ "$PATCH_ROLE" != "auto" ]]; then ROLE="$PATCH_ROLE"; fi
log "Detected site role ${ROLE_DETECTED} reason ${role_reason} using ${ROLE}"
# 3, wait for this node to report ONLINE
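# build the probe as a self-contained bash -lc command so wait_until can re-run it on each poll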
local_state_cmd=()
local_state_cmd+=( bash -lc )
local_state_cmd+=( "state=\$( $(command -v mysqlsh) --sql ${MYSQL_URI:+--uri \"$MYSQL_URI\"} -e \"SELECT COALESCE(MAX(CASE WHEN MEMBER_ID=@@server_uuid THEN MEMBER_STATE END),'MISSING') FROM performance_schema.replication_group_members;\" | tail -n1 ); [[ \"\$state\" == \"ONLINE\" ]]" )
if wait_until "this node ONLINE" "$WAIT_TIMEOUT" "$WAIT_INTERVAL" "${local_state_cmd[@]}"; then
log "Local member is ONLINE"
else
last_state="$(sql_last "SELECT COALESCE(MAX(CASE WHEN MEMBER_ID=@@server_uuid THEN MEMBER_STATE END),'MISSING') FROM performance_schema.replication_group_members;" || echo MISSING)"
log "Local member did not reach ONLINE within timeout, last_state=${last_state}"
exit 31
fi
# 4, wait for full cluster health, count == REQUIRED_COUNT and all ONLINE
cluster_ok_cmd=()
cluster_ok_cmd+=( bash -lc )
cluster_ok_cmd+=( "cnt=\$( $(command -v mysqlsh) --sql ${MYSQL_URI:+--uri \"$MYSQL_URI\"} -e \"SELECT COUNT(*) FROM performance_schema.replication_group_members;\" | tail -n1 ); onl=\$( $(command -v mysqlsh) --sql ${MYSQL_URI:+--uri \"$MYSQL_URI\"} -e \"SELECT COALESCE(SUM(MEMBER_STATE='ONLINE'),0) FROM performance_schema.replication_group_members;\" | tail -n1 ); [[ \"\$cnt\" == \"${REQUIRED_COUNT}\" && \"\$onl\" == \"${REQUIRED_COUNT}\" ]]" )
if wait_until "full cluster ONLINE" "$WAIT_TIMEOUT" "$WAIT_INTERVAL" "${cluster_ok_cmd[@]}"; then
log "Cluster is fully ONLINE, count=${REQUIRED_COUNT}"
else
cnt="$(sql_last "SELECT COUNT(*) FROM performance_schema.replication_group_members;" || echo 0)"
onl="$(sql_last "SELECT COALESCE(SUM(MEMBER_STATE='ONLINE'),0) FROM performance_schema.replication_group_members;" || echo 0)"
log "Cluster not fully ONLINE within timeout, count=${cnt} online=${onl}"
exit 32
fi
# 5, ensure recovery is finished, zero queued transactions and recovery channel off
recovery_ok(){
# queued txns for local member should be zero
q="$(sql_last "SELECT COALESCE(MAX(CASE WHEN MEMBER_ID=@@server_uuid THEN COUNT_TRANSACTIONS_IN_QUEUE END),0) FROM performance_schema.replication_group_member_stats;")"
# recovery channel should be OFF or not present
svc="$(sql_last "SELECT COALESCE(MAX(SERVICE_STATE),'OFF') FROM performance_schema.replication_connection_status WHERE CHANNEL_NAME='group_replication_recovery';")"
log "Recovery snapshot, queue=${q} recovery_channel_state=${svc}"
[[ "${q}" == "0" && "${svc}" == "OFF" ]]
}
if wait_until "recovery complete" "$WAIT_TIMEOUT" "$WAIT_INTERVAL" recovery_ok; then
log "Recovery complete"
else
log "Recovery not complete within timeout"
exit 33
fi
# 6, release NFS lock if on active site
if [[ "$ROLE" == "active" ]]; then
log "Active site, attempting lock release at $LOCK_DIR_PATH"
lock_release
else
log "Passive site, no lock to release"
fi
log "POST checks passed, node healthy"
exit 0
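As with PRE, the POST script honours TESTMODE and the wait knobs, so a shortened dry run can exercise the whole flow, a sketch where the install path is an assumption:
TESTMODE=1 WAIT_TIMEOUT=120 WAIT_INTERVAL=5 /usr/local/sbin/morph-db-post.sh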
Python checks:
import os
import sys
print(f"User: {os.getlogin()}")
print(f"Python executable: {sys.executable}")
print(f"Python version: {sys.version}")
print(f"sys.path:\n{sys.path}")
try:
import requests
print(f"✅ 'requests' is available, version: {requests.__version__}")
except ModuleNotFoundError:
print("❌ 'requests' module NOT found in this Python environment.")
V2:
import sys, getpass
print(f"User: {getpass.getuser()}")
print(f"Python executable: {sys.executable}")
print("Python version:", sys.version)
print("sys.path:")
for p in sys.path:
print(" ", p)
try:
import requests
print(f"✅ requests available, version: {requests.__version__}")
except Exception as e:
print("❌ requests not found")
print("Error:", e)
Bash script to install requests:
#!/bin/bash
echo "🔍 Detecting Python environment..."
PYTHON=$(which python3)
echo "Using Python: $PYTHON"
# Confirm version
"$PYTHON" --version || { echo "❌ Python not found"; exit 1; }
# Confirm pip exists
if ! "$PYTHON" -m pip --version >/dev/null 2>&1; then
echo "⚠️ pip not found for $PYTHON, attempting to install it..."
curl -sS https://bootstrap.pypa.io/get-pip.py -o get-pip.py
"$PYTHON" get-pip.py || { echo "❌ Failed to install pip"; exit 1; }
fi
echo "📦 Installing 'requests' module for $PYTHON ..."
"$PYTHON" -m pip install requests --upgrade --user || {
echo "❌ Failed to install 'requests'"
exit 1
}
echo "✅ requests installed successfully for Python: $PYTHON"
A simpler bash install:
#!/bin/bash
echo "Python executable: $(which python3)"
echo "Python version: $(python3 --version)"
echo "Checking if 'requests' is already installed..."
if python3 -c "import requests" &> /dev/null; then
echo "'requests' module is already installed."
else
echo "'requests' not found. Installing..."
pip3 install requests
if [ $? -ne 0 ]; then
echo "❌ Failed to install requests."
exit 1
fi
fi
echo "Verifying 'requests' installation..."
python3 -c "import requests; print('✅ requests version:', requests.__version__)"
Python connectivity checks for ServiceNow:
#!/usr/bin/env python3
import os, sys, json, socket, ssl, time, requests
from urllib.parse import urlparse
from requests.utils import get_environ_proxies
def j(x): print(json.dumps(x, indent=2))
def env_bool(name, default=True):
"""Return a boolean from common true/false strings."""
v = os.getenv(name)
if v is None:
return default
return str(v).strip().lower() in {"1", "true", "yes", "on"}
def parse_proxy(u):
if not u:
return None, None
p = urlparse(u)
return p.hostname, p.port or (443 if p.scheme == "https" else 80)
def try_get(label, sess, verify, url, user=None, pw=None):
t0 = time.monotonic()
try:
r = sess.get(
url,
auth=(user, pw) if user or pw else None,
headers={"Accept": "application/json"},
timeout=20,
verify=verify,
)
dt = round(time.monotonic() - t0, 3)
out = {"step": "https", "label": label, "status": r.status_code, "elapsed_sec": dt}
try:
js = r.json()
out["json_keys"] = list(js.keys())
except Exception:
out["body_snippet"] = r.text[:160]
j(out)
return r.status_code
except Exception as e:
dt = round(time.monotonic() - t0, 3)
j({"step": "https", "label": label, "error": str(e), "elapsed_sec": dt})
return None
# ------------------------------
# Environment
# ------------------------------
SN_HOST = os.getenv("SN_HOST", "").strip()
SN_USERNAME = os.getenv("SN_USERNAME", "").strip()
SN_PASSWORD = os.getenv("SN_PASSWORD", "").strip()
SN_VERIFY_SSL = env_bool("SN_VERIFY_SSL", True)
SN_PROXY_URL = os.getenv("SN_PROXY_URL", "").strip()
if not SN_HOST:
print("SN_HOST is missing", file=sys.stderr)
sys.exit(1)
base = f"https://{SN_HOST}"
url = f"{base}/api/now/table/sys_user?sysparm_limit=1"
# ------------------------------
# Diagnostics
# ------------------------------
j({
"python_executable": sys.executable,
"python_version": sys.version,
"sn_host": SN_HOST,
"sn_verify_ssl": SN_VERIFY_SSL,
"sn_proxy_url": SN_PROXY_URL or None,
"env_proxy": {k: os.getenv(k) for k in ["HTTP_PROXY", "HTTPS_PROXY", "NO_PROXY", "http_proxy", "https_proxy", "no_proxy"]}
})
# DNS
try:
ip = socket.gethostbyname(SN_HOST)
j({"step": "dns", "resolved_ip": ip})
except Exception as e:
j({"step": "dns", "error": str(e)})
sys.exit(1)
# TCP proxy
phost, pport = parse_proxy(SN_PROXY_URL)
if phost:
try:
with socket.create_connection((phost, pport), timeout=8):
j({"step": "tcp_proxy", "host": phost, "port": pport, "ok": True})
except Exception as e:
j({"step": "tcp_proxy", "host": phost, "port": pport, "ok": False, "error": str(e)})
# TLS direct
try:
ctx = ssl.create_default_context()
if not SN_VERIFY_SSL:
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
with socket.create_connection((SN_HOST, 443), timeout=6) as raw:
with ctx.wrap_socket(raw, server_hostname=SN_HOST) as ss:
cert = ss.getpeercert()
j({"step": "tls_direct", "ok": True, "subject": cert.get("subject", [])})
except Exception as e:
j({"step": "tls_direct", "ok": False, "error": str(e)})
# Requests sessions
s_env = requests.Session(); s_env.trust_env = True
s_forced = requests.Session()
if SN_PROXY_URL:
s_forced.trust_env = False
s_forced.proxies = {"http": SN_PROXY_URL, "https": SN_PROXY_URL}
else:
s_forced.trust_env = True
j({"step": "requests_auto_proxy", "resolved": get_environ_proxies(url)})
# HTTP attempts
try_get("env_proxy_verify_on", s_env, SN_VERIFY_SSL, url, SN_USERNAME, SN_PASSWORD)
try_get("env_proxy_verify_off", s_env, False, url, SN_USERNAME, SN_PASSWORD)
try_get("forced_proxy_verify_on", s_forced, SN_VERIFY_SSL, url, SN_USERNAME, SN_PASSWORD)
try_get("forced_proxy_verify_off", s_forced, False, url, SN_USERNAME, SN_PASSWORD)
print("probe_done")
Latest DB PRE script:
#!/usr/bin/env bash
# PRE script for Morpheus DB nodes
# Purpose, prove it is safe to patch this node, then stop MySQL safely
# Features, strict mode, fixed PATH, binary checks, detailed logging, TESTMODE,
# active site NFS lock, cluster health checks, primary migration if needed,
# randomized backoff and final recheck. BigFix friendly.
set -euo pipefail
export PATH="/usr/sbin:/usr/bin:/sbin:/bin"
# ---------- required binaries ----------
which_or_fail(){ command -v "$1" >/dev/null 2>&1 || { echo "missing binary, $1" >&2; exit 1; }; }
for bin in hostname mountpoint ip systemctl grep sed tee date awk tail mysqlsh id whoami tty ps stat; do which_or_fail "$bin"; done
# ---------- config ----------
MYSQL_SVC="${MYSQL_SVC:-mysql}"
# NFS lock on active site only
NFS_PATH="${NFS_PATH:-/var/opt/morpheus/morpheus-ui}"
LOCK_DIR_NAME="${LOCK_DIR_NAME:-.db_patch_active_lock}"
LOCK_DIR_PATH="${LOCK_DIR_PATH:-$NFS_PATH/$LOCK_DIR_NAME}"
# cluster expectations and timings
REQUIRED_COUNT="${REQUIRED_COUNT:-3}"
JITTER_MIN="${JITTER_MIN:-2}"
JITTER_MAX="${JITTER_MAX:-10}"
# optional override, values, active, passive, auto
PATCH_ROLE="${PATCH_ROLE:-auto}"
# connection settings
MYSQL_PORT="${MYSQL_PORT:-3306}" # used when building host:port
# optional explicit connection, example, MYSQL_URI="dbuser:<password>@<db_host>:3306"
# dry run and debug
TESTMODE="${TESTMODE:-0}"
DEBUG="${DEBUG:-0}"
# logging
LOG_DIR="${LOG_DIR:-/data/morpheus_patching/logs}"
LOG_FILE="${LOG_FILE:-$LOG_DIR/mysqldb_patch.log}"
mkdir -p "$LOG_DIR" 2>/dev/null || true
if [[ "$DEBUG" == "1" ]]; then
export PS4='+ $(date "+%F %T") ${BASH_SOURCE##*/}:${LINENO}: '
set -x
fi
# send all output to console and log, print a separator banner
exec > >(tee -a "$LOG_FILE") 2>&1; printf '############################## %s ##############################\n' "$(date '+%F %T')" | tee -a "$LOG_FILE"
log(){ printf "%s %s\n" "$(date '+%F %T')" "$*"; }
trap 'rc=$?; log "[SUMMARY] PRE exiting rc=$rc TESTMODE=$TESTMODE"; exit $rc' EXIT
# identity capture with absolute paths and fallbacks
UID_NUM="$(/usr/bin/id -u 2>/dev/null || echo NA)"
RUN_AS="$([ -x /usr/bin/whoami ] && /usr/bin/whoami 2>/dev/null || echo NA)"
TTY_DEV="$(/usr/bin/tty 2>/dev/null || echo NA)"
PPID_NUM="${PPID:-NA}"
PPROC_NAME="$(/usr/bin/ps -o comm= -p "${PPID_NUM}" 2>/dev/null || echo NA)"
log "Starting $(basename "$0") uid=${UID_NUM} user=${RUN_AS} tty=${TTY_DEV} ppid=${PPID_NUM} pproc=${PPROC_NAME}"
log "PATH=$PATH"
log "LOG_FILE=$LOG_FILE"
# ---------- HOME and mysqlsh connection bootstrap ----------
if [[ -z "${HOME:-}" || "$HOME" = "/" ]]; then export HOME=/root; fi
log "HOME=$HOME"
if [[ -r "$HOME/.my.cnf" ]]; then
size="$(stat -c %s "$HOME/.my.cnf" 2>/dev/null || echo '?')"
log "~/.my.cnf found, size=${size} bytes"
else
log "WARNING, $HOME/.my.cnf not found or not readable, mysqlsh may not auto connect"
fi
log "mysqlsh version, $(mysqlsh --version 2>&1)"
log "mysqlsh path, $(command -v mysqlsh 2>/dev/null || echo not_found)"
# prefer MYSQL_URI if provided
mysqlsh_try_once(){
if [[ -n "${MYSQL_URI:-}" ]]; then
mysqlsh --sql --uri "$MYSQL_URI" -e "SELECT 1;" >/dev/null 2>&1 && return 0
fi
mysqlsh --sql -e "SELECT 1;" >/dev/null 2>&1 && return 0
mysqlsh --sql --host=127.0.0.1 --port="$MYSQL_PORT" -e "SELECT 1;" >/dev/null 2>&1 && return 0
return 1
}
if ! mysqlsh_try_once; then
log "ERROR, mysqlsh could not connect, set HOME correctly or provide MYSQL_URI"
exit 91
fi
# ---------- helpers ----------
short_host(){ hostname -s 2>/dev/null || hostname; }
sql_last(){
local q="$1"
if [[ -n "${MYSQL_URI:-}" ]]; then
mysqlsh --sql --uri "$MYSQL_URI" -e "$q" | tail -n1
else
mysqlsh --sql -e "$q" | tail -n1
fi
}
role_reason="unknown"
detect_role(){
if ! mountpoint -q "$NFS_PATH"; then role_reason="nfs_not_mounted"; echo "passive"; return 0; fi
local tf="$NFS_PATH/.rw_probe_$$"
if : > "$tf" 2>/dev/null; then rm -f "$tf" || true; role_reason="nfs_rw_ok"; echo "active"
else role_reason="nfs_ro_or_perm_denied"; echo "passive"; fi
}
lock_acquire(){
if [[ "$TESTMODE" == "1" ]]; then log "TESTMODE, would acquire DB patch lock at $LOCK_DIR_PATH"; return 0; fi
if mkdir "$LOCK_DIR_PATH" 2>/dev/null; then
printf "host=%s time=%s pid=%s\n" "$(short_host)" "$(date '+%F %T')" "$$" > "$LOCK_DIR_PATH/owner" || true
sync || true
log "Acquired DB patch lock at $LOCK_DIR_PATH"
return 0
fi
local owner="unknown"
[[ -f "$LOCK_DIR_PATH/owner" ]] && owner="$(cat "$LOCK_DIR_PATH/owner" 2>/dev/null || true)"
log "DB patch lock already held at $LOCK_DIR_PATH owner $owner"
return 1
}
rand_sleep_between(){
local min="$1" max="$2" span secs
if (( max <= min )); then secs="$min"; else span=$(( max - min + 1 )); secs=$(( min + RANDOM % span )); fi
log "Randomized backoff before stop, sleeping ${secs}s"
sleep "$secs"
}
require_cluster_ok(){
sql_last "SELECT CASE WHEN COUNT(*)=${REQUIRED_COUNT} AND SUM(MEMBER_STATE='ONLINE')=${REQUIRED_COUNT} THEN 'OK' ELSE 'NOT_OK' END FROM performance_schema.replication_group_members;"
}
# ---------- main flow ----------
# 1, classify site role and acquire lock if active
ROLE_DETECTED="$(detect_role)"
ROLE="$ROLE_DETECTED"
if [[ "$PATCH_ROLE" != "auto" ]]; then ROLE="$PATCH_ROLE"; fi
log "Detected site role ${ROLE_DETECTED} reason ${role_reason} using ${ROLE}"
if [[ "$ROLE" == "active" ]]; then
log "Attempting to acquire active site lock"
lock_acquire || { log "Failed to acquire active site lock"; exit 95; }
else
log "Passive site, skipping lock"
fi
# 2, initial cluster health gates
log "Checking cluster health gates"
status="$(require_cluster_ok || true)"
count="$(sql_last "SELECT COUNT(*) FROM performance_schema.replication_group_members;" || echo 0)"
bad_count="$(sql_last "SELECT COALESCE(SUM(MEMBER_STATE!='ONLINE'),0) FROM performance_schema.replication_group_members;" || echo 1)"
log "Cluster snapshot status=${status} count=${count} bad_count=${bad_count}"
if [[ "$status" != "OK" ]]; then log "Cluster not healthy status=${status}"; exit 2; fi
if [[ "$count" != "$REQUIRED_COUNT" ]]; then log "Unexpected member count ${count} expected ${REQUIRED_COUNT}"; exit 3; fi
if [[ "$bad_count" != "0" ]]; then log "Some members not ONLINE bad_count=${bad_count}"; exit 4; fi
# 3, if this node is primary, promote a secondary
this_is_primary="$(sql_last "SELECT CASE WHEN EXISTS(SELECT 1 FROM performance_schema.replication_group_members WHERE MEMBER_ID = @@server_uuid AND MEMBER_ROLE='PRIMARY') THEN 1 ELSE 0 END;" || echo 0)"
if [[ "$this_is_primary" == "1" ]]; then
target="$(sql_last "SELECT MEMBER_HOST FROM performance_schema.replication_group_members WHERE MEMBER_ROLE!='PRIMARY' ORDER BY MEMBER_HOST LIMIT 1;" || echo "")"
if [[ -z "$target" ]]; then log "No secondary found to promote"; exit 5; fi
if [[ "$TESTMODE" == "1" ]]; then
log "TESTMODE, would promote ${target}:${MYSQL_PORT} to PRIMARY"
else
log "Promoting ${target}:${MYSQL_PORT} to PRIMARY"
/usr/bin/mysqlsh --js -e "var c = dba.getCluster(); c.setPrimaryInstance('${target}:${MYSQL_PORT}');" \
|| { log "setPrimaryInstance call failed"; exit 6; }
fi
newp="$(sql_last "SELECT MEMBER_HOST FROM performance_schema.replication_group_members WHERE MEMBER_ROLE='PRIMARY';" || echo "")"
if [[ "$newp" != "$target" ]]; then log "Primary promotion verification failed, expected ${target}, got ${newp:-empty}"; exit 6; fi
log "Primary is now ${newp}"
else
log "This node is not PRIMARY, continuing"
fi
# 4, refuse to proceed if MySQL already stopped
if ! systemctl is-active --quiet "$MYSQL_SVC"; then
log "MySQL service ${MYSQL_SVC} is not active on this node, refusing to proceed"
exit 7
fi
# 5, final race protection, jitter then recheck health
rand_sleep_between "$JITTER_MIN" "$JITTER_MAX"
final_status="$(require_cluster_ok || true)"
if [[ "$final_status" != "OK" ]]; then
log "Cluster changed during pre, final_status=${final_status}, aborting to avoid overlap"
exit 20
fi
log "Final health check OK, proceeding to stop MySQL"
# 6, stop MySQL or simulate
if [[ "$TESTMODE" == "1" ]]; then
log "TESTMODE, would stop service ${MYSQL_SVC}"
else
log "Stopping service ${MYSQL_SVC}"
systemctl stop "$MYSQL_SVC"
fi
log "PRE checks passed, node is safe to patch"
exit 0
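A dry run of the latest PRE script under BigFix can reuse the earlier pattern, a sketch where the install path is an assumption:
wait /bin/bash -lc "TESTMODE=1 DEBUG=1 /usr/local/sbin/morph-db-pre.sh"
MYSQL_URI can be prepended in the same way when root's non-login environment cannot read ~/.my.cnf.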