-
-
Save mrjk/2947e908980e04ab89b1 to your computer and use it in GitHub Desktop.
Wrapper for pgpools' pcp tools to help you manage your pgpool setup and postgresql cluster. See usage in the comment below.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
# | |
# pgpool-II replication manager | |
# | |
# Interfaces with pgpool's pcp command-line tools to provide access to common functions for managing | |
# load-balancing and failover. | |
# | |
# mrjk.78 at the famous google mail | |
# 2014-10-13 | |
# [email protected] | |
# 2011-08-28 | |
# | |
# PCP configuration | |
pcp_host="127.0.0.1" | |
pcp_port="9898" | |
pcp_username="admin" | |
pcp_password="admin" | |
pcp_timeout="10" | |
# Health check uses psql to connect to each backend server. Specify options required to connect here | |
export PGPASSWORD=repmgr | |
psql_healthcheck_opts="-U repmgr" | |
# Default options to send to pcp commands | |
pcp_cmd_preamble="$pcp_timeout $pcp_host $pcp_port $pcp_username $pcp_password" | |
# Postgresql system user to connect to query repmgr | |
remote_user="postgres" | |
# NEW | |
repmgr_pg_root="root" | |
repmgr_pg_user="postgres" | |
repmgr_pg_data_dir="/var/lib/pgsql/9.4/data" | |
repmgr_bin_dir="/usr/pgsql-9.4/bin" | |
repmgr_conf="/etc/repmgr/9.4/repmgr.conf" | |
# | |
# Runs abitrary pcp command with the preamble | |
# | |
_pcp() | |
{ | |
cmd=$1 | |
shift | |
$cmd $pcp_cmd_preamble $* | |
} | |
# | |
# Prints the total number of pgpool nodes. | |
# | |
_get_node_count() | |
{ | |
pcp_node_count $pcp_cmd_preamble | |
} | |
# | |
# Prints out node information for the specified pgpool node | |
# | |
_get_node_info() | |
{ | |
if [ -z $1 ]; then | |
echo "Usage: $0 _get_node_info <node id>" >&2 | |
return 99 | |
fi | |
pcp_node_info $pcp_cmd_preamble $1 | |
} | |
# | |
# Get node IP | |
# | |
_get_node_ip() | |
{ | |
if [ -z $1 ]; then | |
echo "Usage: $0 _get_node_ip <node id>" >&2 | |
return 99 | |
fi | |
node_info=($( pcp_node_info $pcp_cmd_preamble $1 )) | |
echo ${node_info[0]} | |
} | |
# | |
# Outputs the replication lag (in bytes) between the specified master and slave | |
# | |
_get_replication_lag() | |
{ | |
if [ -z $2 ]; then | |
echo "Usage: $0 _get_replication_lag <master node id> <slave node id>" >&2 | |
return 99 | |
fi | |
# Get connection information for nodes | |
master_node_info=($(_get_node_info $1)) | |
if [ $? -gt 0 ]; then | |
echo "ERROR: Failed getting node info for node $1" >&2 | |
return 1 | |
fi | |
slave_node_info=($(_get_node_info $2)) | |
if [ $? -gt 0 ]; then | |
echo "ERROR: Failed getting node info for node $2" >&2 | |
return 1 | |
fi | |
for n in $1 $2; do | |
if [ $(_is_node_alive $n) -eq 0 ]; then | |
echo "ERROR: Node $n is not available. Unable to calculate lag." >&2 | |
return 1 | |
fi | |
done | |
export PGCONNECT_TIMEOUT=2 | |
# Get xlog locations for master and slaves | |
master_xlog_loc=$(psql $psql_healthcheck_opts -h "${master_node_info[0]}" -p "${master_node_info[1]}" -Atc "SELECT pg_current_xlog_location();") | |
if [ $? -gt 0 ]; then | |
echo "ERROR: Failed getting xlog location from node $1" >&2 | |
return 1 | |
fi | |
slave_xlog_loc=$(psql $psql_healthcheck_opts -h "${slave_node_info[0]}" -p "${slave_node_info[1]}" -Atc "SELECT pg_last_xlog_replay_location();") | |
if [ $? -gt 0 ]; then | |
echo "ERROR: Failed getting xlog location from node $2" >&2 | |
return 1 | |
fi | |
# Number of bytes behind | |
echo $(($(_xlog_to_bytes $master_xlog_loc) - $(_xlog_to_bytes $slave_xlog_loc))) | |
} | |
# | |
# Converts specified xlog number to byte location | |
# | |
_xlog_to_bytes() | |
{ | |
if [ -z $1 ]; then | |
echo "Usage: $0 _xlog_to_bytes <xlog loc>" >&2 | |
echo " Eg.: $0 _xlog_to_bytes 0/2BC825C0" >&2 | |
return 99 | |
fi | |
logid="${1%%/*}" | |
offset="${1##*/}" | |
echo $((0xFFFFFF * 0x$logid + 0x$offset)) | |
} | |
# | |
# Prints whether the postgresql service on the specified node is responding. | |
# | |
_is_node_alive() | |
{ | |
if [ -z $1 ]; then | |
echo "Usage: $0 _is_node_alive <node id>" >&2 | |
return 99 | |
fi | |
# Get node connection information | |
node_info=($(_get_node_info $1)) | |
if [ $? -gt 0 ]; then | |
echo "ERROR: Failed getting node info for node $1" >&2 | |
return 1 | |
fi | |
export PGCONNECT_TIMEOUT=2 | |
result=$(psql $psql_healthcheck_opts -h "${node_info[0]}" -p "${node_info[1]}" -Atc "SELECT 1;" 2>/dev/null) | |
if [ "$result" == "1" ]; then | |
echo 1 | |
return 1 | |
else | |
echo 0 | |
return 0 | |
fi | |
} | |
# | |
# Get the node ID of the first master | |
# | |
_get_master_node() | |
{ | |
# Get total number of nodes | |
nodes=$(_get_node_count) | |
if [ $? -gt 0 ]; then | |
echo "ERROR: Failed getting node count: $nodes" >&2 | |
exit 1 | |
fi | |
c=0 | |
# Loop through each node to check if it's the master | |
while [ $c -lt $nodes ]; do | |
if [ "$(_is_standby $c)" == "0" ]; then | |
echo $c | |
return 0 | |
fi | |
let c=c+1 | |
done | |
echo "-1" | |
return 1 | |
} | |
# | |
# Checks if the node is in postgresql recovery mode (ie. if it is a slave) | |
# | |
_is_standby() | |
{ | |
if [ -z $1 ]; then | |
echo "Usage: $0 _is_standby <node id>" >&2 | |
return 99 | |
fi | |
# Get node connection information | |
node_info=($(_get_node_info $1)) | |
if [ $? -gt 0 ]; then | |
echo "ERROR: Failed getting node info for node $1" >&2 | |
return 1 | |
fi | |
export PGCONNECT_TIMEOUT=2 | |
result=$(psql $psql_healthcheck_opts -h "${node_info[0]}" -p "${node_info[1]}" -Atc "SELECT pg_is_in_recovery();" 2>/dev/null) | |
if [ "$result" == "t" ]; then | |
echo 1 | |
return 1 | |
else | |
echo 0 | |
return 0 | |
fi | |
} | |
# | |
# Attaches the specified node to the pool | |
# | |
pool_attach() | |
{ | |
if [ -z $1 ]; then | |
echo "Usage: $0 attach <node id>" >&2 | |
return 99 | |
fi | |
pcp_attach_node $pcp_cmd_preamble $1 | |
} | |
# | |
# Detaches the specified node from the pool | |
# | |
pool_detach() | |
{ | |
if [ -z $1 ]; then | |
echo "Usage: $0 detach <node id>" >&2 | |
return 99 | |
fi | |
pcp_detach_node $pcp_cmd_preamble $1 | |
} | |
# | |
# Recovers the specified node (restores it from current master and re-attaches) | |
# | |
pool_recover() | |
{ | |
if [ -z $1 ]; then | |
echo "Usage: $0 recover <node id>" >&2 | |
return 99 | |
fi | |
pcp_recovery_node $pcp_cmd_preamble $1 | |
} | |
# | |
# Prints the status of the specified node in human readable format. | |
# | |
_get_node_status() | |
{ | |
if [ -z $1 ]; then | |
echo "Usage: $0 _get_node_status <node id>" >&2 | |
return 99 | |
fi | |
node_info=($(_get_node_info $1)) | |
if [ $? -gt 0 ]; then | |
echo "ERROR: Failed getting node info for node $1" >&2 | |
else | |
node_role="" | |
node_replication_lag="" | |
node_alive="" | |
case "$(_is_node_alive $1)" in | |
1) | |
node_alive="Up" | |
;; | |
*) | |
node_alive="Down" | |
;; | |
esac | |
if [ "$node_alive" == "Up" ]; then | |
# Find out what role this node has | |
if [ "$(_is_standby $1)" == "1" ]; then | |
node_role="Slave" | |
# Calculation replication lag | |
master_node=$(_get_master_node) | |
node_replication_lag=$(_get_replication_lag $master_node $1) | |
if [ $? -eq 0 ]; then | |
node_replication_lag="$node_replication_lag bytes" | |
else | |
node_replication_log="Unknown" | |
fi | |
else | |
node_role="Master" | |
fi | |
fi | |
case "${node_info[2]}" in | |
3) | |
node_status="detached from pool" | |
;; | |
2) | |
node_status="in pool and connected" | |
;; | |
1) | |
node_status="in pool" | |
;; | |
*) | |
node_status="Unknown" | |
;; | |
esac | |
# Print status information about this node | |
echo "Node: $1" | |
echo "Host: ${node_info[0]}" | |
echo "Port: ${node_info[1]}" | |
echo "Weight: ${node_info[3]}" | |
echo "Status: $node_alive, $node_status (${node_info[2]})" | |
[ -n "$node_role" ] && echo "Role: $node_role" | |
[ -n "$node_replication_lag" ] && echo "Replication lag: $node_replication_lag" | |
echo "" | |
fi | |
} | |
# | |
# Prints out the status of all pgpool nodes in human readable form. | |
# | |
pool_show() | |
{ | |
# Get total number of nodes | |
nodes=$(_get_node_count) | |
if [ $? -gt 0 ]; then | |
echo "ERROR: Failed getting node count: $nodes" >&2 | |
exit 1 | |
fi | |
c=0 | |
# Loop through each node to retrieve info | |
while [ $c -lt $nodes ]; do | |
_get_node_status $c | |
let c=c+1 | |
done | |
} | |
# | |
# Prints out the status of all node, easier to analyse in a script | |
# | |
pool_status() | |
{ | |
# Get the second argument | |
display="${1:-all}" | |
shift | |
if [ "${display}" == "master" ]; then | |
display_master=1 | |
display_slave=0 | |
elif [ "${display}" == "slave" ]; then | |
display_master=0 | |
display_slave=1 | |
elif [ "${display}" == "all" ]; then | |
display_master=0 | |
display_slave=0 | |
else | |
echo "ERROR: Unknown argument: ${display}" >&2 | |
exit 1 | |
fi | |
# Get total number of nodes | |
nodes=$(_get_node_count) | |
if [ $? -gt 0 ]; then | |
echo "ERROR: Failed getting node count: $nodes" >&2 | |
exit 1 | |
fi | |
# Check what to display | |
echo -ne "Node\tHost\t\tPort\tAlive\tStandBy\tStatus\n" | |
# Loop through each node to retrieve info | |
c=0 | |
while [ $c -lt $nodes ]; do | |
# Get infos | |
node_info=($(_get_node_info $c)) | |
# Display info | |
if [[ "${display_master}" == 1 && | |
"$(_is_node_alive $c)" == "1" && | |
"$(_is_standby $c)" != "1" && | |
"${node_info[2]}" != "3" ]]; then | |
# Display active master | |
printf "%s\t%s\t%s\t%s\t%s\t%s\n" "$c" "${node_info[0]}" "${node_info[1]}" "$(_is_node_alive $c)" "$(_is_standby $c)" "${node_info[2]}" | |
elif [[ "${display_slave}" == 1 && | |
"$(_is_node_alive $c)" == "1" && | |
"$(_is_standby $c)" == "1" && | |
"${node_info[2]}" != "3" ]]; then | |
# Display active slaves | |
printf "%s\t%s\t%s\t%s\t%s\t%s\n" "$c" "${node_info[0]}" "${node_info[1]}" "$(_is_node_alive $c)" "$(_is_standby $c)" "${node_info[2]}" | |
elif [[ "${display_master}" == 0 && "${display_slave}" == 0 ]]; then | |
# Display all (even inactive) | |
printf "%s\t%s\t%s\t%s\t%s\t%s\n" "$c" "${node_info[0]}" "${node_info[1]}" "$(_is_node_alive $c)" "$(_is_standby $c)" "${node_info[2]}" | |
fi | |
# Node: $c | |
# Host: ${node_info[0]} | |
# Port: ${node_info[1]} | |
# Alive: $(_is_node_alive $c) | |
# Standby: $(_is_standby $c) | |
# Status: ${node_info[2]} | |
# Status: | |
# 0 - This state is only used during the initialization. PCP will never display it. | |
# 1 - Node is up. No connections yet. | |
# 2 - Node is up. Connections are pooled. | |
# 3 - Node is down. | |
let c=c+1 | |
done | |
} | |
# | |
# Prints out the status of all node, easier to analyse in a script | |
# | |
rep_status() | |
{ | |
# Get total number of nodes | |
nodes=$(_get_node_count) | |
if [ $? -gt 0 ]; then | |
echo "ERROR: Failed getting node count: $nodes" >&2 | |
exit 1 | |
fi | |
# Loop over | |
c=0 | |
while [ $c -lt $nodes ]; do | |
# Get infos | |
node_info=($(_get_node_info $c)) | |
# Launch the command | |
echo "Host: ${node_info[0]}" | |
ssh ${repmgr_pg_user}@${node_info[0]} "${repmgr_bin_dir}/repmgr -f ${repmgr_conf} cluster show" | |
echo "" | |
# increment | |
let c=c+1 | |
done | |
} | |
# | |
# Stop postgreSQL instance | |
# | |
pg_start() | |
{ | |
# Check node id arguments | |
local promote=$( _get_node_ip ${1} ) | |
# Must be an IP | |
# Test if argument is an IP | |
if [[ ! "${promote}" =~ ^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$ ]]; then | |
echo "ERROR: argument is not a valid IP: ${promote}" >&2 | |
fi | |
# Test ssh connection | |
ssh -q ${repmgr_pg_root}@${promote} exit | |
if [ $? -gt 0 ]; then | |
echo "ERROR: Failed ssh connexion: ${promote}" >&2 | |
exit 1 | |
fi | |
# Promote the slave | |
#ssh ${repmgr_pg_root}@${promote} "su - ${repmgr_pg_user} -c ${repmgr_bin_dir}/repmgr -f ${repmgr_conf} standby promote" | |
ssh ${repmgr_pg_root}@${promote} "/usr/bin/systemctl start postgresql-9.4 || echo FAIL" | |
# Show the logs | |
#echo "Status:" | |
#ssh ${repmgr_pg_root}@${promote} "/usr/bin/systemctl status postgresql-9.4" | |
#ssh ${repmgr_pg_root}@${promote} "/usr/bin/netstat -lntp | grep postgres" | |
} | |
# | |
# Stop postgreSQL instance | |
# | |
pg_stop() | |
{ | |
# Check node id arguments | |
local promote=$( _get_node_ip ${1} ) | |
# Must be an IP | |
# Test if argument is an IP | |
if [[ ! "${promote}" =~ ^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$ ]]; then | |
echo "ERROR: argument is not a valid IP: ${promote}" >&2 | |
exit 1 | |
fi | |
# Test ssh connection | |
ssh -q ${repmgr_pg_root}@${promote} exit | |
if [ $? -gt 0 ]; then | |
echo "ERROR: Failed ssh connexion: ${promote}" >&2 | |
exit 1 | |
fi | |
# Promote the slave | |
#ssh ${repmgr_pg_root}@${promote} "su - ${repmgr_pg_user} -c ${repmgr_bin_dir}/repmgr -f ${repmgr_conf} standby promote" | |
ssh ${repmgr_pg_root}@${promote} "/usr/bin/systemctl stop postgresql-9.4 || echo FAIL" | |
} | |
# | |
# Promote a slave | |
# | |
rep_standby_clone() | |
{ | |
# Check node id arguments | |
local standby=$( _get_node_ip ${1} ) | |
local master=$( _get_node_ip ${2} ) | |
# Must be an IP | |
# Test if argument is an IP | |
if [[ ! ( "${standby}" =~ ^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$ && "${master}" =~ ^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$ ) ]]; then | |
echo "ERROR: argument is not a valid IP: ${standby}" >&2 | |
exit 1 | |
fi | |
# Test ssh connection | |
ssh -q ${repmgr_pg_root}@${standby} exit && ssh -q ${repmgr_pg_user}@${standby} exit | |
if [ $? -gt 0 ]; then | |
echo "ERROR: Failed ssh connexion: ${standby}" >&2 | |
exit 1 | |
fi | |
# Stop the postgreSQL daemon | |
pg_stop ${1} | |
# Lanch the repmgr recovery | |
#ssh ${repmgr_pg_root}@${standby} "su - ${repmgr_pg_user} -c ${repmgr_bin_dir}/repmgr -f ${repmgr_conf} standby standby" | |
ssh ${repmgr_pg_user}@${standby} "/usr/bin/rm -r ${repmgr_pg_data_dir}/* 2>/dev/null " | |
ssh ${repmgr_pg_user}@${standby} "${repmgr_bin_dir}/repmgr -D ${repmgr_pg_data_dir} -d repmgr -U repmgr --verbose standby clone ${master} 2>> /tmp/test || echo 'FAIL'" | |
#rm -rf /var/lib/pgsql/9.4/data/* | |
#repmgr -D /var/lib/pgsql/9.4/data/ -d repmgr -U repmgr --verbose standby clone 192.168.121.144 | |
# Show the logs | |
#echo "Repmgr log file: " | |
#ssh ${repmgr_pg_root}@${standby} "/usr/bin/tail /var/log/repmgr/repmgr-9.4.log" | |
# Start the postgreSQL daemon | |
pg_start ${1} | |
} | |
# | |
# Register a slave | |
# | |
rep_standby_register() | |
{ | |
# Check node id arguments | |
local register=$( _get_node_ip ${1} ) | |
# Test if argument is an IP | |
if [[ ! "${register}" =~ ^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$ ]]; then | |
echo "ERROR: argument is not a valid IP: ${register}" >&2 | |
exit 1 | |
fi | |
# Test ssh connection | |
ssh -q ${repmgr_pg_root}@${register} exit && ssh -q ${repmgr_pg_user}@${register} exit | |
if [ $? -gt 0 ]; then | |
echo "ERROR: Failed ssh connexion: ${register}" >&2 | |
exit 1 | |
fi | |
# Lanch the repmgr recovery | |
ssh ${repmgr_pg_user}@${register} "${repmgr_bin_dir}/repmgr -f ${repmgr_conf} standby register || echo 'FAIL'" | |
# Show the logs | |
echo "Repmgr log file: " | |
ssh ${repmgr_pg_root}@${register} "/usr/bin/tail /var/log/repmgr/repmgr-9.4.log" | |
} | |
# | |
# Promote a slave | |
# | |
rep_standby_promote() | |
{ | |
# Check node id arguments | |
local promote=$( _get_node_ip ${1} ) | |
# Test if argument is an IP | |
if [[ ! "${promote}" =~ ^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$ ]]; then | |
echo "ERROR: argument is not a valid IP: ${promote}" >&2 | |
exit 1 | |
fi | |
# Test ssh connection | |
ssh -q ${repmgr_pg_root}@${promote} exit && ssh -q ${repmgr_pg_user}@${promote} exit | |
if [ $? -gt 0 ]; then | |
echo "ERROR: Failed ssh connexion: ${promote}" >&2 | |
exit 1 | |
fi | |
# Lanch the repmgr recovery | |
ssh ${repmgr_pg_user}@${promote} "${repmgr_bin_dir}/repmgr -f ${repmgr_conf} standby promote || echo 'FAIL' " | |
# Show the logs | |
#echo "Repmgr log file: " | |
#ssh ${repmgr_pg_root}@${promote} "/usr/bin/tail /var/log/repmgr/repmgr-9.4.log" | |
} | |
# | |
# Promote a master | |
# | |
rep_master_register() | |
{ | |
# Check node id arguments | |
local master=${1} | |
# Test if argument is an IP | |
if [[ ! "${master}" =~ ^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$ ]]; then | |
echo "ERROR: argument is not a valid IP: ${master}" >&2 | |
exit 1 | |
fi | |
# Test ssh connection | |
ssh -q ${repmgr_pg_root}@${master} exit && ssh -q ${repmgr_pg_user}@${master} exit | |
if [ $? -gt 0 ]; then | |
echo "ERROR: Failed ssh connexion: ${master}" >&2 | |
exit 1 | |
fi | |
# Lanch the repmgr recovery | |
#ssh ${repmgr_pg_root}@${master} "su - ${repmgr_pg_user} -c ${repmgr_bin_dir}/repmgr -f ${repmgr_conf} standby promote" | |
ssh ${repmgr_pg_user}@${master} "${repmgr_bin_dir}/repmgr -f ${repmgr_conf} master register" | |
# Show the logs | |
echo "Repmgr log file: " | |
ssh ${repmgr_pg_root}@${master} "/usr/bin/tail /var/log/repmgr/repmgr-9.4.log" | |
} | |
# | |
# Prints out the human overall status | |
# | |
show() | |
{ | |
echo "PgPool-II Status:" | |
echo "=====================" | |
#pool_show | |
#echo "" | |
pool_status | |
echo "" | |
echo "Repmgr Status:" | |
echo "=====================" | |
rep_status | |
} | |
# Run function | |
if [ ! "$(type -t $1)" ]; then | |
echo "Usage $0 <option>" >&2 | |
echo "" >&2 | |
echo "Available options:" >&2 | |
echo "$(compgen -A function |grep -v '^_')" >&2 | |
exit 99 | |
else | |
cmd=$1 | |
shift | |
$cmd $* | |
exit $? | |
fi |
Implemented repmgr in the command line. It offers a nice tool to control (very) easily your PostgreSQL replication tool.
I'll push that in production, just for fun ^^ But I do not yet recommend you to do the same ;-)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I added master and slave commands, in way to have a simple call to get only active nodes