-
-
Save akorn/644855ddaa8065f564be to your computer and use it in GitHub Desktop.
#!/bin/zsh | |
# | |
# Copyright (c) 2014, 2020 by Dr. András Korn. Implements the basic idea of a similar script by Robert Coup (2013). | |
# License: GPLv3 | |
function usage() { | |
echo 'Usage: | |
rsync_parallel [--parallel=N] <args to find(1) to generate list of stuff to transfer> -- <args to rsync> | |
Options: | |
--parallel=N Use N parallel processes for transfer. Defaults to $(nproc) if nproc is available; otherwise to 10. | |
Notes: | |
* Should properly handle filenames with embedded newlines. | |
* Use with key based SSH authentication to avoid repeated password prompts. | |
* Unfortunately, the only way to handle funny filenames involves | |
resorting to find(1), so rsync_parallel is not a drop-in replacement | |
for rsync(1). It will call rsync(1) with -0 --files-from=-, and feed it | |
the list of files found by find based on the find(1) arguments you gave | |
on the command line. You need to make sure the paths output by find will | |
be valid relative to the source directory you pass to rsync. | |
* Depends on find -printf, so probably GNU find(1). | |
* Exit status is the highest of all child rsync exit statuses, or 111 if | |
invoked incorrectly, or 127 if at least one of the workers aborted with | |
an unkown exit status. | |
Example: | |
rsync_parallel --parallel=42 . -- -avHPSAX . user@remote:/some/path/. | |
' | |
} | |
typeset -a RSYNCBYTES # an array to count the number of bytes each rsync child has been requested to transfer | |
typeset -a RSYNCFD # an array whose members are file descriptors connected to workers' stdins | |
typeset -a findargs # we'll parse find(1) arguments into this array | |
typeset -a rsyncargs # and rsync(1) arguments into this one | |
typeset -A STATUS_REPORTED # a hash to keep track of which workers' status we already printed | |
typeset -A inode_worker # a hash that keeps track of which worker we assigned which inode to; needed to allow rsync -H to work | |
typeset -a WORKER_STATUS | |
nr_children=0 | |
GLOBAL_EXIT_STATUS=0 | |
hardlinks=0 # set to 1 if rsync args apparently include -H or --hardlinks | |
TMPDIR=$(mktemp -d) || { echo "FATAL: unable to create temporary directory." >&2; exit 111 } | |
trap "rm -rf $TMPDIR" EXIT | |
# The only way to obtain the exit statuses from the rsync processes is to write them into tempfiles :( | |
function worker() { | |
local ret | |
trap 'rm $TMPDIR/worker${i}.pid' EXIT | |
echo $$ >$TMPDIR/worker${i}.pid | |
rsync -0 --files-from=- $rsyncargs | |
ret=$? | |
echo $ret >$TMPDIR/worker${i}.status | |
} | |
# The file list we'll obtain below will be piped into this load-balancing | |
# function that chooses which rsync child to pass the incoming filename to. | |
# It chooses the one with the fewest bytes allocated to it so far. | |
function balance() { | |
trap - EXIT | |
local min minworker | |
local IFS="" | |
while read -rd '' inum; do | |
read -rd '' size | |
read -rd '' name | |
min=${${(n)RSYNCBYTES}[1]} | |
minworker=${RSYNCBYTES[(I)$min]} | |
if ((hardlinks)); then | |
if [[ -n "$inode_worker[$inum]" ]]; then | |
minworker=$inode_worker[$inum] | |
else | |
inode_worker[$inum]=$minworker | |
fi | |
fi | |
print -rN -u $RSYNCFD[$minworker] "$name" | |
((RSYNCBYTES[$minworker]+=$size)) | |
done | |
} | |
# Obtain file list ("length filename" tuples, one per line). | |
# It would be tempting to use rsync itself for this, with --no-v --dry-run and | |
# an out-format of "%l %n", but rsync will escape some characters in filenames | |
# and not recognize the same escapes in --files-from; so we need to use | |
# find(1). This has the drawback of also printing filenames that will be | |
# excluded from the transfer using --exclude. | |
function generate_file_list() { | |
trap - EXIT | |
find $findargs -printf "%i\0%s\0%p\0" | |
} | |
function sigchld_handler() { | |
trap - EXIT | |
((nr_children--)) | |
echo "INFO: a worker exited; $nr_children still running." >&2 | |
local found=0 | |
for i in {1..$PARALLEL}; do | |
((STATUS_REPORTED[$i])) && continue | |
if ! [[ -e $TMPDIR/worker${i}.pid ]]; then | |
found=1 | |
if [[ -r $TMPDIR/worker{$i}.status ]]; then | |
WORKER_STATUS[$i]=$(<$TMPDIR/worker${i}.status) | |
((WORKER_STATUS[$i])) && echo "ERROR: worker $i exited with error $WORKER_STATUS[$i]." >&2 | |
else | |
WORKER_STATUS[$i]=127 | |
echo "ERROR: worker $i exited unexpectedly/abnormally; assuming exit status 127." >&2 | |
fi | |
[[ $WORKER_STATUS[$i] -gt $GLOBAL_EXIT_STATUS ]] && GLOBAL_EXIT_STATUS=$WORKER_STATUS[$i] | |
STATUS_REPORTED[$i]=1 | |
continue | |
fi | |
done | |
if ! ((found)); then | |
echo "WARNING: stray SIGCHLD; apparently a worker exited but I don't know which. Global exit status could be wrong. $(echo $TMPDIR/*)" >&2 | |
fi | |
} | |
if [[ "$1" == --parallel=* ]]; then | |
PARALLEL="${1##*=}"; shift | |
elif [[ -x /usr/bin/nproc ]]; then | |
PARALLEL=$(nproc) | |
else | |
PARALLEL=10 | |
fi | |
# get findargs | |
while [[ -n $1 ]] && ! [[ $1 = -- ]]; do | |
findargs=($findargs $1) | |
shift | |
done | |
[[ $1 = -- ]] && shift | |
# anything left over is args for rsync | |
while [[ -n $1 ]]; do | |
{ [[ $1 == -H ]] || [[ $1 == -[^-]*H* ]] || [[ $1 == --hard-links ]] } && hardlinks=1 | |
# This is imperfect because "-*H*" can occur in a path specification, | |
# but it fails safely. I don't want to reimplement much of the rsync | |
# option parser just to catch this corner case. False positive | |
# detection of --hard-links results in higher memory consumption for | |
# the script, and possibly reduced parallelism if the same | |
# inode number occurs on different files (on different filesystems) | |
# being transferred. | |
[[ $1 == --no-hard-links ]] && hardlinks=0 | |
# Again, this is imperfect because if we're already specifying paths, | |
# a request to transfer a directory called --no-hard-links would | |
# cause the hardlink logic to be disabled. If you have such | |
# pathological filenames, change the script. | |
rsyncargs=($rsyncargs $1) | |
shift | |
done | |
# You didn't specify any args for rsync? Probably not what you meant. | |
[[ -z $rsyncargs ]] && usage && exit 111 | |
echo "INFO: Using up to $PARALLEL processes for transfer." >&2 | |
# spawn rsync children, each reading the list of files it should transfer from stdin. | |
for i in {1..$PARALLEL}; do | |
exec {myfd}>>(worker) | |
((nr_children++)) | |
RSYNCFD[$i]=$myfd | |
RSYNCBYTES[$i]=0 | |
done | |
generate_file_list | balance | |
trap "sigchld_handler" CHLD | |
for i in {1..$PARALLEL}; do | |
myfd=$RSYNCFD[$i] | |
exec {myfd}>&- | |
done | |
zmodload zsh/zselect | |
echo "Waiting for workers to exit." >&2 | |
# TODO: properly test whether the main script can exit prematurely and leave workers running | |
while ((nr_children)) && [[ -n "$(echo $TMPDIR/*.pid(N))" ]]; do | |
zselect -t 100 | |
done | |
exit $GLOBAL_EXIT_STATUS |
Typo:
typeset -a findards
Change to:
typeset -a findargs
Quite. Thanks, fixed.
is there a way to randomize the file list? i have files spread across different drives alphabetically and in order to saturate a 10gb link, I need to transfer in a non alphabetic way.
is there a way to randomize the file list? i have files spread across different drives alphabetically and in order to saturate a 10gb link, I need to transfer in a non alphabetic way.
Not easily, no. Look at generate_file_list()
: it outputs "inode size path" tuples separated by NUL characters, and successive tuples are also separated by NULs (the only character that can't occur in a pathname). You'd need to rewrite this to randomize the order somehow but preserve the tuples.
If you can guarantee your filenames don't contain junk like spaces or newlines, it's easier.
I'm getting a syntax error on line 6.
./rsync_parallel.sh: 6: Syntax error: "(" unexpected
I'm getting a syntax error on line 6.
./rsync_parallel.sh: 6: Syntax error: "(" unexpected
You're probably trying to run it with a shell other than zsh.
As per previously mentioned, but more direct and facilitating non-modified use: On macOS we need to define the location of find
for the GNU variety via MacPorts or other package managers. Could you please insert a path definition so we can use non-standard path or named find
? for example, I'd like to see an added arg of something like --find_path=/opt/local/bin/gfind
.
Please take a look at: https://github.com/nathanhaigh/parallel-rsync/blob/main/prsync
Typo:
typeset -a findards
Change to:
typeset -a findargs