Last active
August 29, 2024 07:32
-
-
Save YSaxon/346e668f8d7e8c29bfedb98cd31d182e to your computer and use it in GitHub Desktop.
Shell script to deduplicate files in a directory with macOS APFS copy-on-write (CoW) clones, using rdfind
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/zsh
# Configuration. Every setting may be overridden from the environment before
# running the script; the value after ":-" is the default.
#
#   do_clone_check             1 = use apfs-clone-checker to detect pairs that
#                              are already clones before recloning (useful if
#                              you are rerunning the script).
#   cleanup                    1 = remove results.txt, firsts and others at the end.
#   verbosity                  >=1 warnings/errors, >=2 rdfind output, cp -v and
#                              progress messages, >=4 per-pair skip/clone messages.
#   do_extra_checksum_test     1 = compare sha1 digests of each pair before cloning.
#   reuse_existing_results_txt 1 = skip rdfind and process an existing results.txt.
#   minsize                    value passed to rdfind's -minsize option.
export do_clone_check="${do_clone_check:-1}"
export cleanup="${cleanup:-1}"
export verbosity="${verbosity:-3}"
export do_extra_checksum_test="${do_extra_checksum_test:-1}"
export reuse_existing_results_txt="${reuse_existing_results_txt:-0}"
export minsize="${minsize:-4096}"
# Print each argument on its own line to stderr, then abort with status 1.
die() {
  printf '%s\n' "$@" >&2
  exit 1
}

# Preflight checks: refuse to start when the working directory is in a state
# that would make the run ambiguous or unsafe.

# A fresh run must not silently pick up (or clobber) an older results.txt.
if [[ $reuse_existing_results_txt == 0 && -f results.txt ]]; then
  die "results.txt already exists. Please remove it and try again. Or export reuse_existing_results_txt=1 to reuse it."
fi

# Reusing results requires that there is something to reuse.
if [[ $reuse_existing_results_txt == 1 && ! -f results.txt ]]; then
  die "results.txt does not exist. Please unset reuse_existing_results_txt (or export to 0) and try again."
fi

# Files may have changed since an old results.txt was written, so the
# checksum test is mandatory when reusing it.
if [[ $reuse_existing_results_txt == 1 && $do_extra_checksum_test == 0 ]]; then
  die "For safety, we cannot reuse an existing results.txt without repeating a checksum test. Please export do_extra_checksum_test=1 and try again."
fi

# 'firsts' and 'others' are removed by the final cleanup step; do not run
# while files with those names are present.
if [[ -f firsts || -f others ]]; then
  die "One or both of files 'firsts' or 'others' already exists. Please remove them and try again." \
    "Try: rm firsts others"
fi
# Download and compile apfs-clone-checker (pinned to a fixed commit) unless a
# binary is already present at /tmp/clone_checker. On any failure the clone
# check is disabled rather than aborting the run.
# NOTE(review): /tmp/clone_checker is a predictable, world-writable location;
# consider a private directory if this is run on a shared machine.
if [[ $do_clone_check == 1 ]] && [[ ! -f /tmp/clone_checker ]]; then
  clone_checker_url="https://raw.githubusercontent.com/dyorgio/apfs-clone-checker/be8d03c9a8c5d3996582d1adfb85d9a9b230d07f/clone_checker.c"
  # -f makes curl fail on HTTP errors instead of saving the error page as C
  # source; gcc is only attempted when the download succeeded.
  if curl -fsSL "$clone_checker_url" -o /tmp/clone_checker.c &&
    gcc /tmp/clone_checker.c -o /tmp/clone_checker; then
    echo "Clone checker compiled successfully."
  else
    echo "Could not download and compile clone checker. Skipping APFS clone checking." >&2
    export do_clone_check=0
  fi
fi
# Unless an existing results.txt is being reused, make sure rdfind is
# available (installing it with Homebrew if necessary) and scan the current
# directory so that it writes a fresh results.txt.
if [[ $reuse_existing_results_txt == 0 ]]; then
  if ! command -v rdfind >/dev/null 2>&1; then
    echo "rdfind not found. Installing..."
    if ! brew install rdfind; then
      echo "Failed to find rdfind or install it with homebrew. Please install it manually and try again." >&2
      exit 1
    fi
  fi

  rdfind_cmd=(rdfind -makeresultsfile true -minsize "$minsize" .)
  # rdfind's own output is only shown at verbosity >= 2.
  if [[ $verbosity -ge 2 ]]; then
    "${rdfind_cmd[@]}"
  else
    "${rdfind_cmd[@]}" &>/dev/null
  fi
  rdfind_status=$?

  # Do not go on to process a missing or stale results.txt if the scan failed.
  if [[ $rdfind_status -ne 0 ]]; then
    echo "rdfind failed with exit status $rdfind_status. Re-run with verbosity=2 or higher to see its output." >&2
    exit 1
  fi

  if [[ $verbosity -ge 2 ]]; then
    echo "rdfind finished. Processing results..."
  fi
fi
# Print the SHA-1 hex digest of the file named by $1 on stdout.
# Returns non-zero and prints nothing when no hashing tool is available, the
# file cannot be read, or the tool fails. The file is fed through stdin so the
# tool's output never depends on the file name.
file_sha1() {
  local digest
  if command -v sha1sum >/dev/null 2>&1; then
    digest=$(sha1sum <"$1") || return 1
  elif command -v shasum >/dev/null 2>&1; then
    digest=$(shasum -a 1 <"$1") || return 1
  else
    return 1
  fi
  digest=${digest%% *}
  [[ -n $digest ]] || return 1
  printf '%s\n' "$digest"
}

# Walk the DUPTYPE lines of results.txt. The first path seen for each id is
# remembered as the original; every later path with the same id is replaced
# by a clone of that original made with "cp -c".
declare -A file_ids
while IFS= read -r line; do
  # Field 2 is the id (a leading '-' is dropped so all members of a group
  # share one key); fields 8 onward are the path, which may contain spaces.
  id=${line#* }
  id=${id%% *}
  id=${id#-}
  filepath=$(cut -d ' ' -f 8- <<<"$line")
  # Ignore malformed lines instead of indexing the map with an empty key.
  if [[ -z $id || -z $filepath ]]; then
    continue
  fi
  # TODO: when reusing an old results.txt, optionally re-check each file's
  # size against minsize before processing it.

  if [[ -z ${file_ids[$id]} ]]; then
    file_ids[$id]=$filepath
    continue
  fi
  orig=${file_ids[$id]}
  dup=$filepath

  if [[ $do_clone_check == 1 ]] && [[ $(/tmp/clone_checker "$orig" "$dup") == 1 ]]; then
    [[ $verbosity -ge 4 ]] && printf 'Skipping because they are already APFS CoW clones: %s -> %s\n' "$orig" "$dup"
    continue
  fi

  # Checked before hashing so an unreadable source is reported as such and
  # not as a checksum mismatch.
  if [[ ! -r $orig ]]; then
    [[ $verbosity -ge 1 ]] && printf "Warning: source file %s is not readable. Skipping.\n" "$orig" >&2
    continue
  fi

  if [[ $do_extra_checksum_test == 1 ]]; then
    # A failed hash must never count as a match: two empty digests would
    # otherwise compare equal and let the destructive copy proceed.
    if ! orig_sum=$(file_sha1 "$orig") || ! dup_sum=$(file_sha1 "$dup"); then
      [[ $verbosity -ge 1 ]] && printf "Warning: could not compute sha1sum of %s or %s. Skipping.\n" "$orig" "$dup" >&2
      continue
    fi
    if [[ $orig_sum != "$dup_sum" ]]; then
      [[ $verbosity -ge 1 ]] && printf "Warning: %s and %s have different sha1sums. Skipping.\n" "$orig" "$dup" >&2
      continue
    fi
  fi

  if [[ ! -O $dup || ! -G $dup ]]; then
    [[ $verbosity -ge 1 ]] && printf "Warning: destination file %s is not owned by you, and ownership will end up permanently changed. Skipping.\n" "$dup" >&2
    continue
  fi

  # Remember the duplicate's mode bits so they can be restored after the copy.
  if ! dup_perm=$(stat -f "%Mp%Lp" "$dup") || [[ -z $dup_perm ]]; then
    [[ $verbosity -ge 1 ]] && printf "Warning: could not read permissions of %s. Skipping.\n" "$dup" >&2
    continue
  fi

  [[ $verbosity -ge 4 ]] && printf "Cloning %s to %s\n" "$orig" "$dup"
  if [[ $verbosity -ge 2 ]]; then
    cp_flags=(-v -c)
  else
    cp_flags=(-c)
  fi
  if ! cp "${cp_flags[@]}" "$orig" "$dup"; then
    [[ $verbosity -ge 1 ]] && printf "Error: cp failed to clone %s to %s\n" "$orig" "$dup" >&2
    continue
  fi

  if [[ $dup_perm != "$(stat -f "%Mp%Lp" "$dup")" ]]; then
    if ! chmod "$dup_perm" "$dup"; then
      printf "Error: chmod failed to restore permissions on %s\nPlease manually restore them to %s\n" "$dup" "$dup_perm" >&2
      exit 1
    fi
    [[ $verbosity -ge 2 ]] && printf "Restored permissions on %s to %s\n" "$dup" "$dup_perm"
  fi
done < <(grep '^DUPTYPE' results.txt)

# An explicit "if" keeps the script's exit status at 0 when cleanup is disabled.
if [[ $cleanup == 1 ]]; then
  rm -f results.txt firsts others
fi
It should now be robust to most file names containing spaces.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I just realized this may not work properly if some of your files have spaces in their names.
I can probably fix this by using a proper grep instead of cut, but until then, just beware of this issue.