matt-morris · November 27, 2016 06:09
diff --git a/renameToHash.sh b/renameToHash.sh
 #!/bin/bash
 # TODO: skip tiny files (so small they couldn't be photos)
 # TODO: make sure sym links and other file system oddities are handled

 #
 # Tunables
 #

 # Regex applied (case-insensitively) to the `find` output when no pattern argument is given
 DEFAULT_PATTERN='.*\.(jpg|png|gif|mov|avi|mkv)$'

 # Number of leading hash characters that make up the new file name
 CHAR_COUNT=12

 # Number of 512 byte blocks sampled from each file
 BLOCK_COUNT=6

 # Distance, in blocks, between samples: sample number i is read at block i * SKIP_SIZE
 SKIP_SIZE=3

 # Set `true` to hash whole files instead of samples: slower, but fewer false positives
 COMPUTE_FULL_HASH=false

 #
 # Arguments
 #
 # $1 (optional): regex selecting the files to rename.
 # When it is missing or empty, DEFAULT_PATTERN is used instead.
 PATTERN="${1:-$DEFAULT_PATTERN}"

 #
 # Banner: tell the user what is about to happen, followed by a blank line
 #
 printf '%s\n' \
   "This script will get the hash of $BLOCK_COUNT 512 byte blocks for each file it processes" \
   "The first $CHAR_COUNT chars of this hash are used to rename the file" \
   ""

 #
 # Get list and count of files. Confirm with user if we should proceed
 #

 # Print the regular files directly inside the current directory (one `./name`
 # per line) whose path matches the extended regex $1, ignoring case.
 # The pattern is quoted so the shell neither word-splits nor glob-expands it.
 list_matching_files() {
   find . -maxdepth 1 -type f | grep -E -i -- "$1"
 }

 # Print how many lines the newline-separated list $1 holds.
 # An empty list counts as 0 (piping an empty string through `wc -l` via
 # `echo` would report 1).
 count_lines() {
   if [ -z "$1" ]; then
     echo 0
   else
     # The `sed` at the end removes whitespace from wc output
     printf '%s\n' "$1" | wc -l | sed 's/^ *//'
   fi
 }

 files=$(list_matching_files "$PATTERN")
 count=$(count_lines "$files")
 echo "Found $count files that match the pattern $PATTERN"

 # Nothing matched: there is nothing to confirm or rename.
 if [ "$count" -eq 0 ]; then
   exit 0
 fi

 # -r keeps backslashes in the answer literal. Anything but an explicit "no"
 # is treated as consent, matching the <Y/n> default.
 read -r -p "Rename all? <Y/n> " prompt
 case "$prompt" in
   n | N | no | NO) exit 0 ;;
 esac
 echo ""

 #
 # For every file, compute a hash and rename
 #

 # Print the MD5 hex digest of stdin, using the BSD/macOS `md5` tool when it
 # is installed and GNU coreutils `md5sum` otherwise.
 md5_of_stdin() {
   if command -v md5 > /dev/null 2>&1; then
     md5
   else
     md5sum | cut -d ' ' -f 1
   fi
 }

 # Print the content hash of file $1; print nothing and return 1 if the file
 # cannot be read.
 # With COMPUTE_FULL_HASH=true the whole file is hashed. Otherwise BLOCK_COUNT
 # blocks of 512 bytes are sampled, the i-th one at block offset i * SKIP_SIZE.
 content_hash() {
   local file=$1 i
   [ -r "$file" ] || return 1

   if [ "$COMPUTE_FULL_HASH" = true ]; then
     md5_of_stdin < "$file"
   else
     # Naive alternative: just grab a contiguous chunk of N blocks. But this
     # could be all empty space or all metadata. Too many false positives.
     # Instead skip along the file, sampling bytes as we go. The samples are
     # piped straight into the hasher: collecting them in a shell variable
     # would drop NUL bytes and trailing newlines from the binary data.
     for (( i = 1; i <= BLOCK_COUNT; i++ )); do
       dd if="$file" bs=512 count=1 skip=$(( i * SKIP_SIZE )) 2> /dev/null
     done | md5_of_stdin
   fi
 }

 # Print the first of "$1$2", "$1 (1)$2", "$1 (2)$2", ... that does not exist
 # yet in the current directory. $2 is the extension including its leading
 # dot, or empty. `-e` (not `-f`) is used so an existing directory of that
 # name is also avoided.
 next_free_name() {
   local stem=$1 suffix=$2 n=0
   local candidate="$stem$suffix"
   while [ -e "$candidate" ]; do
     n=$(( n + 1 ))
     candidate="$stem ($n)$suffix"
   done
   printf '%s\n' "$candidate"
 }

 # Rename every file of the newline-separated list $1 to the first CHAR_COUNT
 # characters of its content hash, keeping the extension.
 rename_files() {
   local f hash shortHash originalNameWithoutPath suffix newName

   # Read line by line instead of word-splitting the list, so names holding
   # spaces or glob characters come through untouched.
   while IFS= read -r f; do
     [ -n "$f" ] || continue

     hash=$(content_hash "$f")
     if [ -z "$hash" ]; then
       # Without this guard an empty hash would match every name below.
       echo "Skipping file. Could not compute the hash of its contents: $f" >&2
       continue
     fi
     shortHash=${hash:0:$CHAR_COUNT}
     originalNameWithoutPath="${f##*/}"

     # If you've already run this script on some of these files, we shouldn't duplicate them.
     if [[ $originalNameWithoutPath == *"$shortHash"* ]]; then
       echo "Skipping file. Name already contains the hash of its contents: $f"
       continue
     fi

     # Take the extension from the base name only; a name without any dot
     # gets no extension instead of a piece of its path.
     suffix=""
     if [[ $originalNameWithoutPath == *.* ]]; then
       suffix=".${originalNameWithoutPath##*.}"
     fi

     # If a file with this name already exists, increment a number until it does not.
     # This is a likely duplicate, and the whole reason for running this script
     newName=$(next_free_name "$shortHash" "$suffix")

     echo "$newName   <-   $f"
     mv -- "$f" "$newName" || echo "Failed to rename: $f" >&2
   done <<< "$1"
 }

 rename_files "$files"
	#!/bin/bash
	# TODO: skip tiny files (so small they couldn't be photos)
	# TODO: make sure sym links and other file system oddities are handled

	#
	# Constants
	#
	CHAR_COUNT=12 # Number of leading hash characters that make up the new file name
	BLOCK_COUNT=6 # Number of 512 byte blocks sampled from each file
	SKIP_SIZE=3 # Every new block is sampled by skipping this amount of blocks to the next position
	COMPUTE_FULL_HASH=false # Set `true` to trade processing speed for fewer false positives
	# Extended regex used when no pattern argument is given. The alternation
	# bars must be bare: written as `\|` they would only match a literal pipe
	# character and no file would ever be selected.
	DEFAULT_PATTERN=".*\.(jpg|png|gif|mov|avi|mkv)$"

	#
	# Arguments
	#
	# $1 (optional): regex selecting the files to rename.
	# When it is missing or empty, DEFAULT_PATTERN is used instead.
	PATTERN="${1:-$DEFAULT_PATTERN}"

	#
	# Banner: tell the user what is about to happen, followed by a blank line
	#
	printf '%s\n' \
	  "This script will get the hash of $BLOCK_COUNT 512 byte blocks for each file it processes" \
	  "The first $CHAR_COUNT chars of this hash are used to rename the file" \
	  ""

	#
	# Get list and count of files. Confirm with user if we should proceed
	#

	# Print the regular files directly inside the current directory (one `./name`
	# per line) whose path matches the extended regex $1, ignoring case.
	# The pipe must be a real shell pipe (a backslash-escaped one is handed to
	# `find` as an argument), and the pattern is quoted so the shell neither
	# word-splits nor glob-expands it.
	list_matching_files() {
	  find . -maxdepth 1 -type f | grep -E -i -- "$1"
	}

	# Print how many lines the newline-separated list $1 holds.
	# An empty list counts as 0 (piping an empty string through `wc -l` via
	# `echo` would report 1).
	count_lines() {
	  if [ -z "$1" ]; then
	    echo 0
	  else
	    # The `sed` at the end removes whitespace from wc output
	    printf '%s\n' "$1" | wc -l | sed 's/^ *//'
	  fi
	}

	files=$(list_matching_files "$PATTERN")
	count=$(count_lines "$files")
	echo "Found $count files that match the pattern $PATTERN"

	# Nothing matched: there is nothing to confirm or rename.
	if [ "$count" -eq 0 ]; then
	  exit 0
	fi

	# -r keeps backslashes in the answer literal. Anything but an explicit "no"
	# is treated as consent, matching the <Y/n> default.
	read -r -p "Rename all? <Y/n> " prompt
	case "$prompt" in
	  n | N | no | NO) exit 0 ;;
	esac
	echo ""

	#
	# For every file, compute a hash and rename
	#

	# Print the MD5 hex digest of stdin, using the BSD/macOS `md5` tool when it
	# is installed and GNU coreutils `md5sum` otherwise.
	md5_of_stdin() {
	  if command -v md5 > /dev/null 2>&1; then
	    md5
	  else
	    md5sum | cut -d ' ' -f 1
	  fi
	}

	# Print the content hash of file $1; print nothing and return 1 if the file
	# cannot be read.
	# With COMPUTE_FULL_HASH=true the whole file is hashed. Otherwise BLOCK_COUNT
	# blocks of 512 bytes are sampled, the i-th one at block offset i * SKIP_SIZE.
	content_hash() {
	  local file=$1 i
	  [ -r "$file" ] || return 1

	  if [ "$COMPUTE_FULL_HASH" = true ]; then
	    md5_of_stdin < "$file"
	  else
	    # Naive alternative: just grab a contiguous chunk of N blocks. But this
	    # could be all empty space or all metadata. Too many false positives.
	    # Instead skip along the file, sampling bytes as we go. The samples are
	    # piped straight into the hasher: collecting them in a shell variable
	    # would drop NUL bytes and trailing newlines from the binary data.
	    for (( i = 1; i <= BLOCK_COUNT; i++ )); do
	      dd if="$file" bs=512 count=1 skip=$(( i * SKIP_SIZE )) 2> /dev/null
	    done | md5_of_stdin
	  fi
	}

	# Print the first of "$1$2", "$1 (1)$2", "$1 (2)$2", ... that does not exist
	# yet in the current directory. $2 is the extension including its leading
	# dot, or empty. `-e` (not `-f`) is used so an existing directory of that
	# name is also avoided.
	next_free_name() {
	  local stem=$1 suffix=$2 n=0
	  local candidate="$stem$suffix"
	  while [ -e "$candidate" ]; do
	    n=$(( n + 1 ))
	    candidate="$stem ($n)$suffix"
	  done
	  printf '%s\n' "$candidate"
	}

	# Rename every file of the newline-separated list $1 to the first CHAR_COUNT
	# characters of its content hash, keeping the extension.
	rename_files() {
	  local f hash shortHash originalNameWithoutPath suffix newName

	  # Read line by line instead of word-splitting the list, so names holding
	  # spaces or glob characters come through untouched.
	  while IFS= read -r f; do
	    [ -n "$f" ] || continue

	    hash=$(content_hash "$f")
	    if [ -z "$hash" ]; then
	      # Without this guard an empty hash would match every name below.
	      echo "Skipping file. Could not compute the hash of its contents: $f" >&2
	      continue
	    fi
	    shortHash=${hash:0:$CHAR_COUNT}
	    originalNameWithoutPath="${f##*/}"

	    # If you've already run this script on some of these files, we shouldn't duplicate them.
	    # The surrounding `*` globs make this a "contains" test, as the message says.
	    if [[ $originalNameWithoutPath == *"$shortHash"* ]]; then
	      echo "Skipping file. Name already contains the hash of its contents: $f"
	      continue
	    fi

	    # Take the extension from the base name only; a name without any dot
	    # gets no extension instead of a piece of its path.
	    suffix=""
	    if [[ $originalNameWithoutPath == *.* ]]; then
	      suffix=".${originalNameWithoutPath##*.}"
	    fi

	    # If a file with this name already exists, increment a number until it does not.
	    # This is a likely duplicate, and the whole reason for running this script
	    newName=$(next_free_name "$shortHash" "$suffix")

	    echo "$newName <- $f"
	    mv -- "$f" "$newName" || echo "Failed to rename: $f" >&2
	  done <<< "$1"
	}

	rename_files "$files"