-
-
Save akorn/644855ddaa8065f564be to your computer and use it in GitHub Desktop.
This script can transfer large directory structures with parallel rsync workers. Example command line: `rsync_parallel . -- -aHSAX --exclude '*13' . /tmp/2/.`
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/zsh
#
# Copyright (c) 2014, 2020 by Dr. András Korn. Implements the basic idea of a similar script by Robert Coup (2013).
# License: GPLv3
# Print the command-line help text to stdout.
# Takes no arguments and does not exit; the caller chooses the exit status.
function usage() {
  # printf '%s\n' emits the text verbatim, independent of how the shell's
  # echo treats backslashes or leading dashes. The literal ends with a
  # newline, so the output finishes with one blank line, as before.
  printf '%s\n' 'Usage:
rsync_parallel [--parallel=N] <args to find(1) to generate list of stuff to transfer> -- <args to rsync>
Options:
--parallel=N Use N parallel processes for transfer. Defaults to $(nproc) if nproc is available; otherwise to 10.
Notes:
* Should properly handle filenames with embedded newlines.
* Use with key based SSH authentication to avoid repeated password prompts.
* Unfortunately, the only way to handle funny filenames involves
resorting to find(1), so rsync_parallel is not a drop-in replacement
for rsync(1). It will call rsync(1) with -0 --files-from=-, and feed it
the list of files found by find based on the find(1) arguments you gave
on the command line. You need to make sure the paths output by find will
be valid relative to the source directory you pass to rsync.
* Depends on find -printf, so probably GNU find(1).
* Exit status is the highest of all child rsync exit statuses, or 111 if
invoked incorrectly, or 127 if at least one of the workers aborted with
an unknown exit status.
Example:
rsync_parallel --parallel=42 . -- -avHPSAX . user@remote:/some/path/.
'
}
# ---------------------------------------------------------------------------
# Global state shared by the functions and the main script below.
# ---------------------------------------------------------------------------
typeset -a RSYNCBYTES       # per worker: number of bytes it has been asked to transfer so far
typeset -a RSYNCFD          # per worker: file descriptor connected to that worker's stdin
typeset -a findargs         # find(1) arguments parsed from the command line
typeset -a rsyncargs        # rsync(1) arguments parsed from the command line
typeset -A STATUS_REPORTED  # hash: workers whose exit status has already been printed
typeset -A inode_worker     # hash: inode -> worker it was assigned to; needed to allow rsync -H to work
typeset -a WORKER_STATUS    # per worker: collected exit status
nr_children=0               # number of workers believed to be still running
GLOBAL_EXIT_STATUS=0        # highest worker exit status seen so far
hardlinks=0                 # set to 1 if rsync args apparently include -H or --hard-links

# Workers hand their exit statuses back through files in this directory.
TMPDIR=$(mktemp -d) || { echo "FATAL: unable to create temporary directory." >&2; exit 111; }
# Single quotes defer the expansion until the trap fires, where the path is
# quoted; a double-quoted trap string would embed the raw path and re-parse
# it as shell code at exit, breaking on whitespace or glob characters.
trap 'rm -rf -- "$TMPDIR"' EXIT
# Body of a single rsync worker.
#
# Feeds the NUL-separated file list arriving on stdin to rsync, together
# with the rsync arguments from the command line. The rsync exit status is
# handed back to the main script through a temp file, since that is the
# only way to obtain it:
#   $TMPDIR/worker<i>.pid     exists while the worker is running
#   $TMPDIR/worker<i>.status  holds rsync's exit status once it has finished
#
# Globals read: TMPDIR, rsyncargs, i (the worker's number, as set by the
# loop that spawns the workers).
function worker() {
  local marker=$TMPDIR/worker${i}.pid
  local result=$TMPDIR/worker${i}.status
  # Remove the "running" marker on the way out. The trap text deliberately
  # names the globals rather than the locals above.
  trap 'rm $TMPDIR/worker${i}.pid' EXIT
  echo $$ >$marker
  rsync -0 --files-from=- $rsyncargs
  # $? still refers to rsync here; the status file is written before the
  # marker disappears.
  echo $? >$result
}
# Load balancer: reads the file list on stdin and decides which rsync
# worker each file goes to, always choosing the worker with the fewest
# bytes allocated to it so far.
#
# Input:   NUL-terminated records of three fields each: inode number,
#          size in bytes, path.
# Output:  each path, NUL-terminated, is written to the chosen worker's fd.
# Globals: RSYNCBYTES (read/written), RSYNCFD (read), hardlinks (read),
#          inode_worker (read/written when hardlinks is set).
function balance() {
  trap - EXIT
  local min minworker
  local inum size name
  local IFS=""
  # All three fields must be read successfully; a truncated trailing record
  # ends the loop instead of being dispatched with an empty name and size.
  while read -rd '' inum && read -rd '' size && read -rd '' name; do
    # (n) sorts the byte counts numerically, so element 1 is the smallest;
    # (I) then yields the last index holding that value.
    min=${${(n)RSYNCBYTES}[1]}
    minworker=${RSYNCBYTES[(I)$min]}
    if ((hardlinks)); then
      # All names of one inode must go to the same worker, otherwise
      # rsync -H cannot see that they are hard links of each other.
      if [[ -n "$inode_worker[$inum]" ]]; then
        minworker=$inode_worker[$inum]
      else
        inode_worker[$inum]=$minworker
      fi
    fi
    # "--" keeps a path that starts with "-" from being parsed as an option
    # to print.
    print -rN -u $RSYNCFD[$minworker] -- "$name"
    ((RSYNCBYTES[$minworker]+=$size))
  done
}
# Emit the list of files to transfer on stdout as NUL-terminated records of
# three fields each: inode number, size in bytes, path ("%i\0%s\0%p\0").
#
# It would be tempting to use rsync itself for this, with --no-v --dry-run
# and an out-format of "%l %n", but rsync will escape some characters in
# filenames and not recognize the same escapes in --files-from; so we need
# to use find(1). This has the drawback of also printing filenames that
# will be excluded from the transfer using --exclude.
#
# Globals read: findargs (arguments passed to find ahead of -printf).
# Environment:  RSYNC_PARALLEL_FIND, optional; name or path of a find
#               binary that supports -printf, e.g. /opt/local/bin/gfind.
#               It is used as a single command word. Defaults to "find".
function generate_file_list() {
  trap - EXIT
  "${RSYNC_PARALLEL_FIND:-find}" "${findargs[@]}" -printf '%i\0%s\0%p\0'
}
# SIGCHLD handler: works out which workers have exited and records their
# exit statuses.
#
# A worker counts as exited when its $TMPDIR/worker<i>.pid marker is gone.
# Its status is then read from $TMPDIR/worker<i>.status; if that file is
# missing, status 127 is assumed.
#
# Globals: nr_children (decremented once per invocation), PARALLEL and
#          TMPDIR (read), STATUS_REPORTED and WORKER_STATUS (written),
#          GLOBAL_EXIT_STATUS (raised to the highest status seen).
function sigchld_handler() {
  trap - EXIT
  ((nr_children--))
  echo "INFO: a worker exited; $nr_children still running." >&2
  # i is local so that a signal arriving in the middle of one of the main
  # script's "for i in ..." loops cannot change that loop's counter.
  local i found=0
  for ((i = 1; i <= PARALLEL; i++)); do
    ((STATUS_REPORTED[$i])) && continue
    # Marker still present: this worker is still running.
    [[ -e $TMPDIR/worker${i}.pid ]] && continue
    found=1
    if [[ -r $TMPDIR/worker${i}.status ]]; then
      WORKER_STATUS[$i]=$(<"$TMPDIR/worker${i}.status")
      ((WORKER_STATUS[$i])) && echo "ERROR: worker $i exited with error ${WORKER_STATUS[$i]}." >&2
    else
      WORKER_STATUS[$i]=127
      echo "ERROR: worker $i exited unexpectedly/abnormally; assuming exit status 127." >&2
    fi
    ((WORKER_STATUS[$i] > GLOBAL_EXIT_STATUS)) && GLOBAL_EXIT_STATUS=${WORKER_STATUS[$i]}
    STATUS_REPORTED[$i]=1
  done
  if ! ((found)); then
    echo "WARNING: stray SIGCHLD; apparently a worker exited but I don't know which. Global exit status could be wrong. $(echo $TMPDIR/*)" >&2
  fi
}
if [[ "$1" == --parallel=* ]]; then | |
PARALLEL="${1##*=}"; shift | |
elif [[ -x /usr/bin/nproc ]]; then | |
PARALLEL=$(nproc) | |
else | |
PARALLEL=10 | |
fi | |
# get findargs | |
while [[ -n $1 ]] && ! [[ $1 = -- ]]; do | |
findargs=($findargs $1) | |
shift | |
done | |
[[ $1 = -- ]] && shift | |
# anything left over is args for rsync | |
while [[ -n $1 ]]; do | |
{ [[ $1 == -H ]] || [[ $1 == -[^-]*H* ]] || [[ $1 == --hard-links ]] } && hardlinks=1 | |
# This is imperfect because "-*H*" can occur in a path specification, | |
# but it fails safely. I don't want to reimplement much of the rsync | |
# option parser just to catch this corner case. False positive | |
# detection of --hard-links results in higher memory consumption for | |
# the script, and possibly reduced parallelism if the same | |
# inode number occurs on different files (on different filesystems) | |
# being transferred. | |
[[ $1 == --no-hard-links ]] && hardlinks=0 | |
# Again, this is imperfect because if we're already specifying paths, | |
# a request to transfer a directory called --no-hard-links would | |
# cause the hardlink logic to be disabled. If you have such | |
# pathological filenames, change the script. | |
rsyncargs=($rsyncargs $1) | |
shift | |
done | |
# You didn't specify any args for rsync? Probably not what you meant. | |
[[ -z $rsyncargs ]] && usage && exit 111 | |
echo "INFO: Using up to $PARALLEL processes for transfer." >&2 | |
# spawn rsync children, each reading the list of files it should transfer from stdin. | |
for i in {1..$PARALLEL}; do | |
exec {myfd}>>(worker) | |
((nr_children++)) | |
RSYNCFD[$i]=$myfd | |
RSYNCBYTES[$i]=0 | |
done | |
generate_file_list | balance | |
trap "sigchld_handler" CHLD | |
for i in {1..$PARALLEL}; do | |
myfd=$RSYNCFD[$i] | |
exec {myfd}>&- | |
done | |
zmodload zsh/zselect | |
echo "Waiting for workers to exit." >&2 | |
# TODO: properly test whether the main script can exit prematurely and leave workers running | |
while ((nr_children)) && [[ -n "$(echo $TMPDIR/*.pid(N))" ]]; do | |
zselect -t 100 | |
done | |
exit $GLOBAL_EXIT_STATUS |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
As previously mentioned, but put more directly and with the aim of allowing unmodified use: on macOS we need to specify the location of the GNU variety of `find`, as installed via MacPorts or another package manager. Could you please add a path setting so that a `find` in a non-standard location, or one with a different name, can be used? For example, I'd like to see an added argument along the lines of `--find_path=/opt/local/bin/gfind`.