-
-
Save akorn/644855ddaa8065f564be to your computer and use it in GitHub Desktop.
#!/bin/zsh | |
# | |
# Copyright (c) 2014, 2020 by Dr. András Korn. Implements the basic idea of a similar script by Robert Coup (2013). | |
# License: GPLv3 | |
function usage() { | |
echo 'Usage: | |
rsync_parallel [--parallel=N] <args to find(1) to generate list of stuff to transfer> -- <args to rsync> | |
Options: | |
--parallel=N Use N parallel processes for transfer. Defaults to $(nproc) if nproc is available; otherwise to 10. | |
Notes: | |
* Should properly handle filenames with embedded newlines. | |
* Use with key based SSH authentication to avoid repeated password prompts. | |
* Unfortunately, the only way to handle funny filenames involves | |
resorting to find(1), so rsync_parallel is not a drop-in replacement | |
for rsync(1). It will call rsync(1) with -0 --files-from=-, and feed it | |
the list of files found by find based on the find(1) arguments you gave | |
on the command line. You need to make sure the paths output by find will | |
be valid relative to the source directory you pass to rsync. | |
* Depends on find -printf, so probably GNU find(1). | |
* Exit status is the highest of all child rsync exit statuses, or 111 if | |
invoked incorrectly, or 127 if at least one of the workers aborted with | |
an unkown exit status. | |
Example: | |
rsync_parallel --parallel=42 . -- -avHPSAX . user@remote:/some/path/. | |
' | |
} | |
typeset -a RSYNCBYTES # an array to count the number of bytes each rsync child has been requested to transfer | |
typeset -a RSYNCFD # an array whose members are file descriptors connected to workers' stdins | |
typeset -a findargs # we'll parse find(1) arguments into this array | |
typeset -a rsyncargs # and rsync(1) arguments into this one | |
typeset -A STATUS_REPORTED # a hash to keep track of which workers' status we already printed | |
typeset -A inode_worker # a hash that keeps track of which worker we assigned which inode to; needed to allow rsync -H to work | |
typeset -a WORKER_STATUS | |
nr_children=0 | |
GLOBAL_EXIT_STATUS=0 | |
hardlinks=0 # set to 1 if rsync args apparently include -H or --hardlinks | |
TMPDIR=$(mktemp -d) || { echo "FATAL: unable to create temporary directory." >&2; exit 111 } | |
trap "rm -rf $TMPDIR" EXIT | |
# The only way to obtain the exit statuses from the rsync processes is to write them into tempfiles :( | |
function worker() { | |
local ret | |
trap 'rm $TMPDIR/worker${i}.pid' EXIT | |
echo $$ >$TMPDIR/worker${i}.pid | |
rsync -0 --files-from=- $rsyncargs | |
ret=$? | |
echo $ret >$TMPDIR/worker${i}.status | |
} | |
# The file list we'll obtain below will be piped into this load-balancing | |
# function that chooses which rsync child to pass the incoming filename to. | |
# It chooses the one with the fewest bytes allocated to it so far. | |
function balance() { | |
trap - EXIT | |
local min minworker | |
local IFS="" | |
while read -rd '' inum; do | |
read -rd '' size | |
read -rd '' name | |
min=${${(n)RSYNCBYTES}[1]} | |
minworker=${RSYNCBYTES[(I)$min]} | |
if ((hardlinks)); then | |
if [[ -n "$inode_worker[$inum]" ]]; then | |
minworker=$inode_worker[$inum] | |
else | |
inode_worker[$inum]=$minworker | |
fi | |
fi | |
print -rN -u $RSYNCFD[$minworker] "$name" | |
((RSYNCBYTES[$minworker]+=$size)) | |
done | |
} | |
# Obtain file list ("length filename" tuples, one per line). | |
# It would be tempting to use rsync itself for this, with --no-v --dry-run and | |
# an out-format of "%l %n", but rsync will escape some characters in filenames | |
# and not recognize the same escapes in --files-from; so we need to use | |
# find(1). This has the drawback of also printing filenames that will be | |
# excluded from the transfer using --exclude. | |
function generate_file_list() { | |
trap - EXIT | |
find $findargs -printf "%i\0%s\0%p\0" | |
} | |
function sigchld_handler() { | |
trap - EXIT | |
((nr_children--)) | |
echo "INFO: a worker exited; $nr_children still running." >&2 | |
local found=0 | |
for i in {1..$PARALLEL}; do | |
((STATUS_REPORTED[$i])) && continue | |
if ! [[ -e $TMPDIR/worker${i}.pid ]]; then | |
found=1 | |
if [[ -r $TMPDIR/worker{$i}.status ]]; then | |
WORKER_STATUS[$i]=$(<$TMPDIR/worker${i}.status) | |
((WORKER_STATUS[$i])) && echo "ERROR: worker $i exited with error $WORKER_STATUS[$i]." >&2 | |
else | |
WORKER_STATUS[$i]=127 | |
echo "ERROR: worker $i exited unexpectedly/abnormally; assuming exit status 127." >&2 | |
fi | |
[[ $WORKER_STATUS[$i] -gt $GLOBAL_EXIT_STATUS ]] && GLOBAL_EXIT_STATUS=$WORKER_STATUS[$i] | |
STATUS_REPORTED[$i]=1 | |
continue | |
fi | |
done | |
if ! ((found)); then | |
echo "WARNING: stray SIGCHLD; apparently a worker exited but I don't know which. Global exit status could be wrong. $(echo $TMPDIR/*)" >&2 | |
fi | |
} | |
if [[ "$1" == --parallel=* ]]; then | |
PARALLEL="${1##*=}"; shift | |
elif [[ -x /usr/bin/nproc ]]; then | |
PARALLEL=$(nproc) | |
else | |
PARALLEL=10 | |
fi | |
# get findargs | |
while [[ -n $1 ]] && ! [[ $1 = -- ]]; do | |
findargs=($findargs $1) | |
shift | |
done | |
[[ $1 = -- ]] && shift | |
# anything left over is args for rsync | |
while [[ -n $1 ]]; do | |
{ [[ $1 == -H ]] || [[ $1 == -[^-]*H* ]] || [[ $1 == --hard-links ]] } && hardlinks=1 | |
# This is imperfect because "-*H*" can occur in a path specification, | |
# but it fails safely. I don't want to reimplement much of the rsync | |
# option parser just to catch this corner case. False positive | |
# detection of --hard-links results in higher memory consumption for | |
# the script, and possibly reduced parallelism if the same | |
# inode number occurs on different files (on different filesystems) | |
# being transferred. | |
[[ $1 == --no-hard-links ]] && hardlinks=0 | |
# Again, this is imperfect because if we're already specifying paths, | |
# a request to transfer a directory called --no-hard-links would | |
# cause the hardlink logic to be disabled. If you have such | |
# pathological filenames, change the script. | |
rsyncargs=($rsyncargs $1) | |
shift | |
done | |
# You didn't specify any args for rsync? Probably not what you meant. | |
[[ -z $rsyncargs ]] && usage && exit 111 | |
echo "INFO: Using up to $PARALLEL processes for transfer." >&2 | |
# spawn rsync children, each reading the list of files it should transfer from stdin. | |
for i in {1..$PARALLEL}; do | |
exec {myfd}>>(worker) | |
((nr_children++)) | |
RSYNCFD[$i]=$myfd | |
RSYNCBYTES[$i]=0 | |
done | |
generate_file_list | balance | |
trap "sigchld_handler" CHLD | |
for i in {1..$PARALLEL}; do | |
myfd=$RSYNCFD[$i] | |
exec {myfd}>&- | |
done | |
zmodload zsh/zselect | |
echo "Waiting for workers to exit." >&2 | |
# TODO: properly test whether the main script can exit prematurely and leave workers running | |
while ((nr_children)) && [[ -n "$(echo $TMPDIR/*.pid(N))" ]]; do | |
zselect -t 100 | |
done | |
exit $GLOBAL_EXIT_STATUS |
Yes. The script can't easily figure out what part of the rsync command line is a source argument, what is a destination argument and what is a switch (it'd need to reimplement part of the rsync command line parser), so we're left with specfying the same source(s) twice: once for find
, once for rsync
.
If you only want to transfer one subtree, then switching to it before starting the transfer and specifying the source as .
also avoids the duplication.
Right, so just to make sure I'm using this properly, I tried the following:
$ cd /path/to/transfer
$ rsync_parallel.sh --parallel=3 . -- -av --stats --progress . "/Volumes/dest_volume/dest_folder/"
And unfortunately it didn't sync anything, having the following output (there are several hundred GB to transfer in actuality):
INFO: Using up to 3 processes for transfer.
find: -printf: unknown primary or operator
Waiting for workers to exit.
building file list ... 0 files to consider
building file list ... building file list ... 0 files to consider
0 files to consider
Number of files: 0
Number of files: 0
Number of created files: 0
Number of created files: 0
Number of deleted files: 0
Number of deleted files: 0
Number of regular files transferred: 0
Number of regular files transferred: 0
Total file size: 0 bytes
Total file size: 0 bytes
Total transferred file size: 0 bytes
Total transferred file size: 0 bytes
Literal data: 0 bytes
Literal data: 0 bytes
Matched data: 0 bytes
Matched data: 0 bytes
File list size: 0
File list size: 0
File list generation time: 0.001 seconds
File list generation time: 0.001 seconds
File list transfer time: 0.000 seconds
File list transfer time: 0.000 seconds
Total bytes sent: 18
Total bytes sent: 18
Total bytes received: 12
Total bytes received: 12
sent 18 bytes received 12 bytes 60.00 bytes/sec
sent 18 bytes received 12 bytes 60.00 bytes/sec
total size is 0 speedup is 0.00
total size is 0 speedup is 0.00
Number of files: 0
Number of created files: 0
Number of deleted files: 0
Number of regular files transferred: 0
Total file size: 0 bytes
Total transferred file size: 0 bytes
Literal data: 0 bytes
Matched data: 0 bytes
File list size: 0
File list generation time: 0.001 seconds
File list transfer time: 0.000 seconds
Total bytes sent: 18
Total bytes received: 12
sent 18 bytes received 12 bytes 60.00 bytes/sec
total size is 0 speedup is 0.00
Well, apparently your find(1)
doesn't do -printf
, which is bad, because the script relies on it in a big way (it is used to print a NUL-separated list of NUL-separated {inum;size;name} tuples).
If you can't install GNU findutils, you'll need to find a different way of producing such output in generate_file_list()
.
If your find
supports -print0
, you could use that and add stat
calls in balance()
to get the inode number and size. This would slow down file list generation by a factor of ~2. I'd recommend zmodload zsh/stat
so you at least don't need to fork()
for stat
. Or, if you care neither about exact balancing nor preserving hardlinks, you could remove the size and inum related logic completely and just use -print0
in the find
call, giving each new found file to the next worker, round-robin fashion.
Hmm, your'e right. FreeBSD & macOS work a little differently with default find, (eg, find . -print0 | xargs -0 stat -f '%i '
). I do have /opt/local/bin/gfind
which is the GNU find. I suppose we'll need to add some logic that will use gfind
if found in path or something of that nature. Hmm.
So yes, that did the trick and it's working away now (just using gfind
in your script instead of find
). One thing I'm concerned about is I don't see parallel transfers happening right now - still only getting rsync executing one at a time instead of X instances in parallel...
Appears that when I ran the "production" version I tried a higher parallel number of 40 and this was the trouble. It cause some serious issues with zombie processes and required a reboot when I cancelled it. It was also running somewhat slowly even on a system with 128G RAM and 24 cores. When I went with a lower number like 5 the transfer rate actually shot up and things are seemingly working relatively well...
A zombie process is just an entry in the process table that sticks around until its parent reads its exit status. Hardly a reason to reboot. :)
Typo:
typeset -a findards
Change to:
typeset -a findargs
Typo:
typeset -a findards
Change to:
typeset -a findargs
Quite. Thanks, fixed.
is there a way to randomize the file list? i have files spread across different drives alphabetically and in order to saturate a 10gb link, I need to transfer in a non alphabetic way.
is there a way to randomize the file list? i have files spread across different drives alphabetically and in order to saturate a 10gb link, I need to transfer in a non alphabetic way.
Not easily, no. Look at generate_file_list()
: it outputs "inode size path" tuples separated by NUL characters, and successive tuples are also separated by NULs (the only character that can't occur in a pathname). You'd need to rewrite this to randomize the order somehow but preserve the tuples.
If you can guarantee your filenames don't contain junk like spaces or newlines, it's easier.
I'm getting a syntax error on line 6.
./rsync_parallel.sh: 6: Syntax error: "(" unexpected
I'm getting a syntax error on line 6.
./rsync_parallel.sh: 6: Syntax error: "(" unexpected
You're probably trying to run it with a shell other than zsh.
As per previously mentioned, but more direct and facilitating non-modified use: On macOS we need to define the location of find
for the GNU variety via MacPorts or other package managers. Could you please insert a path definition so we can use non-standard path or named find
? for example, I'd like to see an added arg of something like --find_path=/opt/local/bin/gfind
.
Please take a look at: https://github.com/nathanhaigh/parallel-rsync/blob/main/prsync
Thanks @akorn so just to be crystal clear here, to run this from a local drive to mounted SMB share, one would do the following with a duplication of the source path, correct?:
And with that being the case, it might be better to reduce duplication via: