-
-
Save matt-morris/429ed18c223373ac342701b02b410c8d to your computer and use it in GitHub Desktop.
Rename files with a hash based on their contents, e.g. `abc.jpg` becomes `3101ace8db9f.jpg`. Useful for detecting duplicates.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Rename every matching regular file in the current directory to a short hash
# of its contents, keeping the extension (e.g. `abc.jpg` -> `3101ace8db9f.jpg`).
# Files with identical sampled contents collide on the same name and receive a
# " (N)" suffix, which makes likely duplicates easy to spot.
#
# Usage: <script> [PATTERN]
#   PATTERN  Case-insensitive extended regex matched against each "./name"
#            path. Defaults to DEFAULT_PATTERN.
#
# Written to stay compatible with the bash 3.2 shipped on macOS.
#
# TODO: skip tiny files (so small they couldn't be photos)
# TODO: make sure sym links and other file system oddities are handled

#
# Constants
#
CHAR_COUNT=12             # Number of leading hash characters used for the new name
BLOCK_COUNT=6             # Number of blocks sampled from each file
BLOCK_SIZE=512            # Size in bytes of one sampled block
SKIP_SIZE=3               # Every new block is sampled by skipping this amount of blocks to the next position
COMPUTE_FULL_HASH=false   # Set `true` to trade processing speed for fewer false positives
DEFAULT_PATTERN=".*\.(jpg|png|gif|mov|avi|mkv)$"

#######################################
# Print the MD5 hex digest of standard input.
# Uses BSD/macOS `md5` when present, otherwise falls back to coreutils `md5sum`.
# Outputs: the 32 character digest on stdout.
#######################################
hash_stdin() {
  if command -v md5 >/dev/null 2>&1; then
    md5 -q
  else
    md5sum | cut -d ' ' -f 1
  fi
}

#######################################
# Print the content hash of one file.
# Globals:   COMPUTE_FULL_HASH, BLOCK_COUNT, BLOCK_SIZE, SKIP_SIZE (read)
# Arguments: $1 - path of the file to hash
# Outputs:   MD5 hex digest on stdout
# Returns:   non-zero when the file cannot be read
#######################################
content_hash() {
  local file=$1
  local size i

  if [[ $COMPUTE_FULL_HASH == true ]]; then
    hash_stdin < "$file"
    return
  fi

  # The first sample sits SKIP_SIZE blocks into the file. Anything shorter
  # would contribute no bytes at all, so every tiny file would share one
  # digest and be reported as a duplicate. Hash such files in full instead.
  size=$(wc -c < "$file") || return
  size=$(( size ))   # normalises the space padding BSD `wc` prints
  if (( size <= SKIP_SIZE * BLOCK_SIZE )); then
    hash_stdin < "$file"
    return
  fi

  # Skip along the file, sampling one block every SKIP_SIZE blocks. A single
  # contiguous chunk could be all empty space or all metadata, which gives too
  # many false positives. The blocks are streamed straight into the hasher:
  # a shell variable cannot hold NUL bytes, so buffering them would corrupt
  # the data being hashed.
  for (( i = 1; i <= BLOCK_COUNT; ++i )); do
    # dd's transfer statistics on stderr are noise here.
    dd if="$file" bs="$BLOCK_SIZE" count=1 skip=$(( i * SKIP_SIZE )) 2> /dev/null
  done | hash_stdin
}

#######################################
# Print a file name, based on the given stem, that is not taken yet.
# If "<stem><ext>" exists, a counter is incremented until "<stem> (N)<ext>"
# is free. A taken name is a likely duplicate, and the whole reason for
# running this script.
# Arguments: $1 - name stem (the short hash)
#            $2 - extension including its leading dot, or empty
# Outputs:   the free file name on stdout
#######################################
unique_name() {
  local stem=$1 ext=$2
  local candidate="$stem$ext"
  local n=0

  # -L also catches dangling symlinks, which -e reports as missing.
  while [[ -e $candidate || -L $candidate ]]; do
    n=$(( n + 1 ))
    candidate="$stem ($n)$ext"
  done
  printf '%s\n' "$candidate"
}

#######################################
# Entry point: list matching files, ask for confirmation, rename them.
# Globals:   DEFAULT_PATTERN, BLOCK_COUNT, BLOCK_SIZE, CHAR_COUNT (read)
# Arguments: $1 - optional pattern, see the usage notes at the top of the file
# Returns:   0 on success or when the user declines, 1 when no confirmation
#            could be read or at least one rename failed
#######################################
main() {
  local pattern=${1:-$DEFAULT_PATTERN}
  local -a files=()
  local path name digest short_hash ext new_name answer
  local count failures=0

  #
  # Introduction
  #
  echo "This script will get the hash of $BLOCK_COUNT $BLOCK_SIZE byte blocks for each file it processes"
  echo "The first $CHAR_COUNT chars of this hash are used to rename the file"
  echo ""

  #
  # Get list and count of files. Confirm with user if we should proceed
  #
  # NUL-delimited output keeps names with spaces, newlines or glob characters
  # intact. nocasematch is enabled only for the pattern test so that it cannot
  # influence the comparisons made later on.
  shopt -s nocasematch
  while IFS= read -r -d '' path; do
    if [[ $path =~ $pattern ]]; then
      files+=("$path")
    fi
  done < <(find . -maxdepth 1 -type f -print0)
  shopt -u nocasematch

  count=${#files[@]}
  echo "Found $count files that match the pattern $pattern"
  if (( count == 0 )); then
    return 0
  fi

  if ! read -r -p "Rename all? <Y/n> " answer; then
    # End of input means nobody confirmed; renaming is not easily undone.
    echo "No confirmation received. Nothing was renamed." >&2
    return 1
  fi
  case $answer in
    [nN] | [nN][oO]) return 0 ;;
  esac
  echo ""

  #
  # For every file, compute a hash and rename
  #
  for path in "${files[@]}"; do
    name=${path##*/}

    digest=$(content_hash "$path")
    if [[ -z $digest ]]; then
      echo "Skipping file. Could not compute a hash for: $path" >&2
      failures=$(( failures + 1 ))
      continue
    fi
    short_hash=${digest:0:CHAR_COUNT}

    # If you've already run this script on some of these files, we shouldn't duplicate them.
    if [[ $name == *"$short_hash"* ]]; then
      echo "Skipping file. Name already contains the hash of its contents: $path"
      continue
    fi

    # Keep the extension when there is one; a custom pattern may match names without any.
    if [[ $name == *.* ]]; then
      ext=".${name##*.}"
    else
      ext=""
    fi

    new_name=$(unique_name "$short_hash" "$ext")
    echo "$new_name <- $path"
    if ! mv -- "$path" "$new_name"; then
      failures=$(( failures + 1 ))
    fi
  done

  return $(( failures > 0 ))
}

main "$@"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment