Created
March 25, 2022 14:03
-
-
Save louiszuckerman/cb79b16ed938fbbc7e552f99abafc67c to your computer and use it in GitHub Desktop.
Bash script to prune duplicate files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Delete files from src_folder that are present in target_folder.
#
# A source file is considered a duplicate when some file anywhere under
# target_folder has exactly the same base name AND the same checksum.
# Source files that cannot be read/hashed are never deleted.
#
# USAGE: dedupe.sh <src_folder> <target_folder>
#
# Environment:
#   SHA_CMD  Checksum command that hashes stdin and prints the digest as the
#            first whitespace-separated field (may include options, e.g.
#            "shasum -a 256"). Defaults to sha1sum (Linux) when available,
#            otherwise shasum (Mac).
#
# Exit status: 0 on success, 1 on a runtime error, 2 on bad usage.

if [[ -z "${SHA_CMD:-}" ]]; then
  if command -v sha1sum >/dev/null 2>&1; then
    SHA_CMD=sha1sum
  else
    SHA_CMD=shasum
  fi
fi

#######################################
# Print the checksum of one file.
# Globals:   SHA_CMD (read)
# Arguments: $1 - path of the file to hash
# Outputs:   the digest on stdout
# Returns:   0 on success, 1 if the file could not be hashed
#######################################
file_sha() {
  local sum
  # The file is fed on stdin so its name can never be taken for an option or
  # change the output format (GNU sha1sum prefixes "\" for odd file names).
  # SHA_CMD is expanded unquoted on purpose: it may carry options.
  # shellcheck disable=SC2086
  sum=$($SHA_CMD 2>/dev/null <"$1") || return 1
  sum=${sum%%[[:space:]]*}
  # An empty digest must never compare equal to another empty digest.
  [[ -n "$sum" ]] || return 1
  printf '%s\n' "$sum"
}

#######################################
# Remove every file under the source folder that has a same-named,
# same-content counterpart under the target folder.
# Globals:   SHA_CMD (read)
# Arguments: $1 - source folder (files may be deleted from here)
#            $2 - target folder (read only)
# Outputs:   one "DELETING DUPLICATE: <path>" or "SKIPPING UNIQUE: <path>"
#            line per source file on stdout; diagnostics on stderr
# Returns:   0 on success, 1 on runtime error, 2 on bad usage
#######################################
main() {
  if (( $# != 2 )); then
    printf 'USAGE: %s <src_folder> <target_folder>\n' "${0##*/}" >&2
    return 2
  fi

  local src_dir=$1 tgt_dir=$2
  if [[ ! -e "$src_dir" ]]; then
    printf 'dedupe: no such source: %s\n' "$src_dir" >&2
    return 1
  fi
  if [[ ! -e "$tgt_dir" ]]; then
    printf 'dedupe: no such target: %s\n' "$tgt_dir" >&2
    return 1
  fi
  if ! command -v "${SHA_CMD%% *}" >/dev/null 2>&1; then
    printf 'dedupe: checksum tool not found: %s\n' "${SHA_CMD%% *}" >&2
    return 1
  fi

  # find has no portable "--": keep a leading "-" from being read as an option.
  [[ "$src_dir" == -* ]] && src_dir=./$src_dir
  [[ "$tgt_dir" == -* ]] && tgt_dir=./$tgt_dir

  # Build a static list of target files once, for fast searching. It is kept
  # in memory, so there is no temp file to name, leak, or clean up. The list
  # is line-based; target paths containing a newline are left out, which only
  # means they are never used as proof that a source file is a duplicate.
  local tgt_list
  tgt_list=$(find "$tgt_dir" -type f ! -path $'*\n*')

  local src name src_sum cand cand_sum is_dup rc=0
  while IFS= read -r -d '' src; do
    name=${src##*/}

    if ! src_sum=$(file_sha "$src"); then
      printf 'SKIPPING UNREADABLE: %s\n' "$src" >&2
      continue
    fi

    # Compare against every target file whose base name is exactly $name.
    # awk does a literal "ends with /name" test, so regex characters in the
    # name are harmless and "a.txt" does not match "ba.txt".
    is_dup=0
    while IFS= read -r cand; do
      # Same directory entry (source and target trees overlap): deleting it
      # would destroy the only copy, so a file is never its own duplicate.
      # Base names are already equal, so comparing the directories suffices.
      [[ "$(dirname -- "$src")" -ef "$(dirname -- "$cand")" ]] && continue
      cand_sum=$(file_sha "$cand") || continue
      if [[ "$src_sum" == "$cand_sum" ]]; then
        is_dup=1
        break
      fi
    done < <(
      printf '%s\n' "$tgt_list" |
        DEDUPE_NAME=$name LC_ALL=C awk '
          BEGIN { want = "/" ENVIRON["DEDUPE_NAME"]; n = length(want) }
          { len = length($0) }
          len >= n && substr($0, len - n + 1) == want'
    )

    if (( is_dup )); then
      printf 'DELETING DUPLICATE: %s\n' "$src"
      if ! rm -- "$src"; then
        printf 'dedupe: could not delete: %s\n' "$src" >&2
        rc=1
      fi
    else
      printf 'SKIPPING UNIQUE: %s\n' "$src"
    fi
  done < <(find "$src_dir" -type f -print0)

  return "$rc"
}

main "$@"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment