#!/usr/bin/env python3
"""
Fix moved/renamed files before running rsync to avoid useless retransmissions.
Copyright (C) 2022 Anton Pirogov, licensed under the MIT License
"""
from itertools import chain
from typing import List, Dict, Optional
from dataclasses import dataclass
from pathlib import Path
from datetime import datetime
from shlex import quote
import argparse
import subprocess

HASHSUM_TOOL = "sha256sum"  # NOTE: must be available on both SRC and DEST host

verbose: bool = False
def arg_parser():
    parser = argparse.ArgumentParser(prog=Path(__file__).name, description=__doc__)
    parser.add_argument("-v", "--verbose", action="store_true", default=False)
    parser.add_argument("-s", "--script", action="store_true", default=False,
                        help="Just output script executing detected moves ('dry run')")
    parser.add_argument("-e", "--rsh", default="ssh",
                        help="rsync remote shell command to use")
    parser.add_argument("-f", "--filter-args", default="",
                        help="rsync args affecting file list")
    parser.add_argument("SRC")
    parser.add_argument("DEST")
    return parser
def main():
    global verbose
    args = arg_parser().parse_args()
    verbose = args.verbose
    assert not args.SRC.endswith("/"), "SRC must not have trailing slash!"
    assert not args.DEST.endswith("/"), "DEST must not have trailing slash!"
    if verbose:
        print("---- find moved/renamed files: ----")
    moves = compute_moves(args)
    if verbose:
        print("-------- required actions: --------")
        for cmd, src, trg in moves:
            print(cmd, quote(src), quote(trg))
        print("-------- perform actions: --------")
    for move in moves:
        cmd = dst_cmd(args, *move)
        print(cmd)  # always show cmd! if something goes wrong, it helps debugging
        if not args.script:
            run(cmd)
    if verbose:
        print("-----------------------------------")
# ----

@dataclass
class FileInfo:
    # parsed from output
    filename: str
    permissions: str
    size: int
    timestamp: datetime
    # for candidates:
    # additionally computed hashsum, or the size if it is unique in the file set
    hashsum: Optional[str]

    @property
    def is_dir(self):
        return self.permissions[0] == "d"
def run(cmd, check=True) -> bytes:
    """Run command and return standard output."""
    if verbose:
        print(f"run: {cmd}")
    return subprocess.run(cmd, shell=True, check=check, capture_output=True).stdout
def rsync_list_cmd(args, dir: str) -> str:
    """Command to list rsync candidate files."""
    return f"rsync {args.filter_args} -e '{args.rsh}' --list-only {dir}"

def rsync_list(args, dir) -> Dict[str, FileInfo]:
    """Return FileInfo objects for each rsync candidate."""
    output = run(rsync_list_cmd(args, dir))
    lines = output.decode("utf-8").splitlines()
    parsed = [l.split(maxsplit=4) for l in lines]
    ret = {}
    for permissions, size, date, time, path in parsed:
        sz = int(size.replace(".", ""))
        timestamp = datetime.strptime(f"{date} {time}", '%Y/%m/%d %H:%M:%S')
        ret[path] = FileInfo(Path(path).name, permissions, sz, timestamp, hashsum=None)
    return ret
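# For reference, a `--list-only` line typically looks like
#   drwxr-xr-x          4.096 2022/01/31 12:34:56 some/dir
# i.e. permissions, size, date, time, path (hence maxsplit=4 above).
# The size separator depends on rsync version and locale; adjust the
# replace(".", "") call if your rsync prints "," instead.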
def hashsum(args, path, is_src: bool=True) -> Optional[str]:
    """Compute hashsum of a file (at source or target)."""
    root = args.SRC if is_src else args.DEST
    is_local = root.split("/")[0].find(":") < 0
    if is_local:
        cmd_pref = ""
        filepath = str(Path(root).parent / path)
    else:
        remote_host, remote_dir = root.split(":")
        cmd_pref = f"{args.rsh} {remote_host} "
        filepath = f"{remote_dir}/{path}"
    cmd = f"{HASHSUM_TOOL} {quote(filepath)}"
    if not is_local:
        cmd = f"{cmd_pref} {quote(cmd)}"
    ret = run(cmd, check=False)
    if not ret:
        return None
    else:
        return ret.decode("utf-8").split()[0]
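# For a remote DEST like "pi:/mnt/backup" the function above ends up running
# roughly: ssh pi 'sha256sum /mnt/backup/<path>'; for a local root it runs
# the hashsum tool directly on the parent directory of SRC/DEST plus the path.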
def dst_cmd(args, cmd, src_path, trg_path) -> str:
    root = args.DEST
    is_local = root.split("/")[0].find(":") < 0
    remote_host, remote_dir = (None, None)
    if not is_local:
        remote_host, remote_dir = root.split(":")

    def wrap_cmd(cmd):
        if is_local:
            return cmd
        else:
            cmd_pref = f"{args.rsh} {remote_host} "
            return f"{cmd_pref} {quote(cmd)}"

    def fullpath(path):
        if is_local:
            return str(Path(root).parent / path)
        else:
            return f"{remote_dir}/{path}"

    src, trg = fullpath(src_path), fullpath(trg_path)
    trgdir = "/".join(trg.split("/")[:-1]) or "."
    return wrap_cmd(f"mkdir -p {quote(trgdir)} && {cmd} -n {quote(src)} {quote(trg)}")
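# For example, a detected rename produces a command of the shape
#   mkdir -p 'new/dir' && mv -n 'old/name' 'new/dir/name'
# (wrapped in an ssh invocation for a remote DEST). The -n no-clobber flag
# makes mv/cp refuse to overwrite existing files, so a mis-detected move
# cannot destroy data on the target.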
# ----

def keep_with_hashsum(info_dict):
    return {k: v for k, v in info_dict.items() if v.hashsum}

def classify_paths(src_files, dst_files):
    added, removed, modified = set(), set(), set()
    for path in set.union(set(src_files.keys()), set(dst_files.keys())):
        in_src, in_trg = path in src_files, path in dst_files
        if in_src and not in_trg:
            added.add(path)
        elif in_trg and not in_src:
            removed.add(path)
        elif in_src and in_trg:
            src_file, dst_file = src_files[path], dst_files[path]
            # if file -> dir / dir -> file, this entity was modified
            diff_dirfile = src_file.is_dir != dst_file.is_dir
            # if the size differs, it could still be e.g. part of a linear/circular move
            # for bigger files, the probability that different "real" files have the
            # same size is very small -> almost as good as a hashsum to detect a difference
            # for small files it does not matter, they can be transferred fast anyway
            diff_size = src_file.size != dst_file.size
            # different hashsum -> different file with very high likelihood
            both_hsum = src_file.hashsum and dst_file.hashsum
            diff_hsum = both_hsum and src_file.hashsum != dst_file.hashsum
            if diff_size or diff_dirfile or diff_hsum:
                modified.add(path)
    return (added, modified, removed)

def hash_to_path_dict(fileinfos: Dict[str, FileInfo]) -> Dict[str, List[str]]:
    ret = {}
    for p, i in fileinfos.items():
        if i.hashsum is None:
            continue
        if i.hashsum not in ret:
            ret[i.hashsum] = []
        ret[i.hashsum].append(p)
    return ret
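# The reverse mapping may hold several paths per hashsum, e.g.
#   {"9f86d0...": ["notes/a.txt", "backup/a.txt"]}
# which is how duplicate copies of the same content are handled below.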
def resolve_moves(src_files, dst_files, added, modified, removed):
    MV_CMD, CP_CMD = "mv", "cp"
    # reverse lookup of paths by hashsum
    h_to_src_file = hash_to_path_dict(src_files)
    h_to_trg_file = hash_to_path_dict(dst_files)
    # get hashsums of files that were possibly moved:
    # hashsums of new or mod paths in source
    src_hsums = {src_files[p].hashsum for p in set.union(added, modified)}
    # hashsums of removed or mod paths in target
    trg_hsums = {dst_files[p].hashsum for p in set.union(removed, modified)}
    # these were moved (hashsum with different paths, exist in target):
    moved_hsums = set.intersection(src_hsums, trg_hsums)
    # compute moves to perform in target
    ret_direct, ret_pre, ret_post = [], [], []
    for h in moved_hsums:
        src_path = h_to_trg_file[h][0]  # any file with that hashsum is ok
        trg_path, copies = h_to_src_file[h][0], h_to_src_file[h][1:]
        if trg_path not in dst_files:
            ret_direct.append([MV_CMD, src_path, trg_path])
        elif trg_path in src_files:
            ret_pre.append([MV_CMD, src_path, trg_path + "_tmp"])
            ret_post.append([MV_CMD, trg_path + "_tmp", trg_path])
        for trg_copy in copies:
            if trg_copy in dst_files:
                ret_pre.append([MV_CMD, trg_copy, trg_copy + "_old"])
            ret_post.append([CP_CMD, trg_path, trg_copy])
    # return sequence of shell commands in correct order
    return ret_direct + ret_pre + ret_post
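# Worked example: suppose "a" and "b" swapped places between SRC and DEST.
# Both hashsums land in moved_hsums, and the pre/post split yields (in some
# order within each phase) the pre-moves
#   mv a b_tmp ; mv b a_tmp
# followed by the post-moves
#   mv b_tmp b ; mv a_tmp a
# so circular moves resolve without one file overwriting the other.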
def compute_moves(args):
    # get list of candidates both locally and on remote
    DST = f"{args.DEST}/{args.SRC.split('/')[-1]}"
    src_files, dst_files = rsync_list(args, args.SRC), rsync_list(args, DST)
    # figure out which file sizes are unique
    sizes = {}  # size -> paths with that size
    for path, info in chain(src_files.items(), dst_files.items()):
        if info.size not in sizes:
            sizes[info.size] = set()
        sizes[info.size].add(path)
    # use the file size as a hashsum replacement if it is unique
    unique_sizes = {size for size, names in sizes.items() if len(names) < 2}
    for path, info in chain(src_files.items(), dst_files.items()):
        if info.size in unique_sizes:
            info.hashsum = str(info.size)
    # get hashsums for files that might have been moved (will compute hashsum for these)
    # these are files that exist only in src/dest or that look modified (diff. size)
    c_add, c_mod, c_rem = classify_paths(src_files, dst_files)
    for path in set.union(c_add, c_mod):
        src = src_files[path]
        if src.is_dir or src.hashsum:
            print(f"{path}: skip (dir or unique file size)")
            continue
        src.hashsum = hashsum(args, path, is_src=True)
    for path in set.union(c_rem, c_mod):
        dst = dst_files[path]
        if dst.is_dir or dst.hashsum:
            print(f"{path}: skip (dir or unique file size)")
            continue
        dst.hashsum = hashsum(args, path, is_src=False)
    # now do re-classification for remaining candidates with hashsum
    src_files, dst_files = keep_with_hashsum(src_files), keep_with_hashsum(dst_files)
    add, mod, rem = classify_paths(src_files, dst_files)
    return resolve_moves(src_files, dst_files, add, mod, rem)
# ----

if __name__ == "__main__":
    main()
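For reference, a typical session could look like this (the script name rsync_prelude.py and the hosts are placeholders; the flags come from the argparse setup above, and sha256sum plus ssh access must be available on both ends):

    ./rsync_prelude.py -s ~/data pi@backuphost:/mnt/backup    # only print the mv/cp commands
    ./rsync_prelude.py ~/data pi@backuphost:/mnt/backup       # replay detected moves on the target
    rsync -av ~/data pi@backuphost:/mnt/backup                # then sync as usual

SRC and DEST are given without trailing slashes, exactly as they are passed to rsync afterwards.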
This is useful, but there is a production-ready tool built exactly to minimize "useless retransmissions": BorgBackup. It even works on a sub-file level! You can move, rename, AND change only 3 bytes in a 10GB file and Borg will only copy the one chunk that changed and nothing else. There's nothing better. 🤩
My use case for this is simply syncing the contents of my laptop hard drive to an external hard drive connected to a Raspberry Pi in my home network, which I do sporadically by hand. I don't want to overengineer a complex personal backup system that I become dependent on and have to maintain or fix when something doesn't work; I want something dead simple here.
If you need serious backups with a way to revert to past states, sure - other tools are definitely better suited.
For everyone else, also see this discussion.
@alexandervlpl I was under the impression that borg works with repositories, so that you need to mount them to access the files, whereas rsync simply syncs files that remain readily accessible.
Why do I keep seeing this argument floated in every discussion I've read about making rsync handle file moves better? Is BorgBackup that desperate for sales that they target folks creating scripts to assist with rsync use cases?
I believe it is mentioned in other conversations that BorgBackup is very much NOT suited to this sort of scenario: it is a backup tool that creates archives which you need to mount via FUSE in order to actually copy files between systems, as opposed to simply keeping plain, directly accessible files.
The code is difficult for me to follow, so I added a new test to make sure it works:
https://github.com/openandclose/rsync-prelude/tree/myfork