-
-
Save v9n/0ff62dbf97098cbc00c9e267c82216a8 to your computer and use it in GitHub Desktop.
Rename files with a hash based on their contents, e.g. `abc.jpg` to `3101ace8db9f.jpg`. Useful for detecting duplicates.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Rename files in the current directory to a short hash of their contents,
# e.g. `abc.jpg` -> `3101ace8db9f.jpg`. Files with identical (hashed) content
# collide on the same name and get a ` (N)` suffix, which makes likely
# duplicates easy to spot.
#
# Usage: <script> [PATTERN]
#   PATTERN  Extended regular expression, matched case-insensitively against
#            each `./name` path found directly in the current directory.
#            Defaults to DEFAULT_PATTERN.
#
# Limitations: file names containing a newline are not supported, because the
# file list is passed through `grep` line by line.
#
# TODO: skip tiny files (so small they couldn't be photos)
# TODO: make sure sym links and other file system oddities are handled

#
# Constants
#
CHAR_COUNT=12           # Number of leading hash characters used as the new name
BLOCK_COUNT=3           # Number of 512 byte blocks hashed in the fast mode
COMPUTE_FULL_HASH=false # Set `true` to trade processing speed for fewer false positives
DEFAULT_PATTERN='.*\.(jpg|png|gif|mov|avi|mkv)$'

#######################################
# Print the MD5 digest (hex only) of everything read from stdin.
# Uses `md5 -q` where available (macOS/BSD), otherwise falls back to
# `md5sum` (GNU coreutils).
#######################################
hash_stdin() {
  if command -v md5 >/dev/null 2>&1; then
    md5 -q
  else
    md5sum | cut -d ' ' -f 1
  fi
}

#######################################
# Print the MD5 digest of a file.
# Globals:   COMPUTE_FULL_HASH, BLOCK_COUNT (read)
# Arguments: $1 - path of the file to hash
# Outputs:   hex digest on stdout
#######################################
file_hash() {
  local path=$1
  if [[ "$COMPUTE_FULL_HASH" == true ]]; then
    hash_stdin < "$path"
  else
    # Pipe the bytes straight into the hasher. Storing binary data in a shell
    # variable would drop NUL bytes and trailing newlines, so the digest would
    # no longer describe the real file content.
    dd if="$path" bs=512 count="$BLOCK_COUNT" 2> /dev/null | hash_stdin
  fi
}

#######################################
# Entry point: list matching files, ask for confirmation, rename them.
# Globals:   CHAR_COUNT, BLOCK_COUNT, DEFAULT_PATTERN (read)
# Arguments: $1 - optional file name pattern (see usage above)
# Returns:   0, also when the user declines or nothing matches
#######################################
main() {
  local pattern=${1:-$DEFAULT_PATTERN}
  local -a files=()
  local path prompt="" f base suffix hash short_hash new_name i

  #
  # Introduction
  #
  echo "This script will get the hash of $BLOCK_COUNT 512 byte blocks for each file it processes"
  echo "The first $CHAR_COUNT chars of this hash are used to rename the file"
  echo ""

  #
  # Get list and count of files. Confirm with user if we should proceed
  #
  while IFS= read -r path; do
    files+=("$path")
  done < <(find . -maxdepth 1 -type f | grep -E -i -e "$pattern")

  echo "Found ${#files[@]} files that match the pattern $pattern"
  if (( ${#files[@]} == 0 )); then
    return 0
  fi

  # A failed read (EOF on stdin) leaves the answer empty, which means "yes".
  read -r -p "Rename all? <Y/n> " prompt || true
  case "$prompt" in
    [nN] | [nN][oO]) return 0 ;;
  esac
  echo ""

  #
  # For every file, compute a hash and rename
  #
  for f in "${files[@]}"; do
    if [[ ! -r "$f" ]]; then
      echo "Skipping file. Cannot read: $f" >&2
      continue
    fi

    hash=$(file_hash "$f")
    short_hash=${hash:0:$CHAR_COUNT}
    if [[ -z "$short_hash" ]]; then
      echo "Skipping file. Could not compute a hash: $f" >&2
      continue
    fi

    # Keep the extension (text after the last dot of the file name), if any.
    base=${f##*/}
    if [[ "$base" == *.* ]]; then
      suffix=".${base##*.}"
    else
      suffix=""
    fi

    # If you've already run this script on some of these files, we shouldn't duplicate them.
    if [[ "$base" == *"$short_hash"* ]]; then
      echo "Skipping file. Name already contains the hash of its contents: $f"
      continue
    fi

    # If a file with this name already exists, increment a number until it does not.
    # This is a likely duplicate, and the whole reason for running this script
    new_name="$short_hash$suffix"
    i=0
    while [[ -e "$new_name" ]]; do
      i=$((i + 1))
      new_name="$short_hash ($i)$suffix"
    done

    echo "$new_name <- $f"
    mv -- "$f" "$new_name" || echo "Failed to rename: $f" >&2
  done
}

main "$@"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment