Skip to content

Instantly share code, notes, and snippets.

@allthingscode
Created January 28, 2011 21:47
Show Gist options
  • Save allthingscode/801053 to your computer and use it in GitHub Desktop.
Save allthingscode/801053 to your computer and use it in GitHub Desktop.
This script identifies duplicate files using their md5sum hash value.
#!/bin/bash
#
# lsdup.sh
#
# create a hash table and look up items
#
# This script identifies duplicate files using their md5sum hash value.
# This can be done much easier in a language that supports hash lists such as PERL. I have done it in a shell script as both a proof-of-concept
# and to demonstrate the basic concept of storing/retrieving hash values and dealing with collisions.
#
# There are several good things demonstrated in this script:
# * implementing return values in bash functions
# * looping through files that contain spaces
#
# - Matthew Hayes <[email protected]>
# =================================================================================================
shopt -s -o nounset # Option Explicit
declare -a TABLE # the hash table
declare -i TABLE_SIZE=65530 # size of the hash table
declare SEPARATOR="~" # delimiter for key/value
declare -r SCRIPT=${0##*/} # name of the script
declare -r MD5SUM='/usr/bin/md5sum' # used for hash generation
declare hash_of_file_RETURN # used for storing return value of hash_of_file
declare lookup_hash_RETURN # used for storing return value of lookup_hash
# =================================================================================================
function new_table_pos
{
declare HEXCODE # 4 hex digits of MD5 signature
declare -i CODE # the hex digits in decimal
HEXCODE=`echo "$1" | $MD5SUM` # compute the hash value (hex)
HEXCODE="${HEXCODE:0:4}" # take first 4 hex digits
CODE=`printf "%d\n" 0x"$HEXCODE"` # convert to 0 ... 65535
printf "%d\n" "$((CODE%TABLE_SIZE))" # restrict to table size
}
readonly -f new_table_pos
function hash_of_file
{
hash_of_file_RETURN=`$MD5SUM "$1"` # compute the hash value (hex)
hash_of_file_RETURN="${hash_of_file_RETURN:0:32}" # take first 32 hex digits (leaves off trailing garbage)
return 0
}
readonly -f hash_of_file
# add_to_hash: add a key/value pair to the lookup hash
# (does not check for uniqueness)
#
# parameter 1: the key
# parameter 2: the associated value
function add_to_hash
{
declare TEMP
declare -i ITEM=0
# Check the number of parameters
if [ $# -ne 2 ] ; then
printf "%s\n" "$SCRIPT: $FUNCNAME: Expected two parameters" >&2
return 129
fi
# Make sure we have both a key and value
if [ -z "$1" ] ; then
printf "%s\n" "$SCRIPT: $FUNCNAME: cannot use empty key" >&2
return 129
fi
if [ -z "$2" ] ; then
printf "%s\n" "$SCRIPT: $FUNCNAME: cannot add empty value" >&2
return 129
fi
# Make sure the parameter is not in the key or value
TEMP="${1/$SEPARATOR/_}" # remove the separator (if any)
if [ "$1" != "$TEMP" ] ; then
printf "%s\n" "$SCRIPT: $FUNCNAME: Key $1 must not contain $SEPARATOR" >&2
return 129
fi
TEMP="${2/$SEPARATOR/_}" # remove the separator (if any)
if [ "$2" != "$TEMP" ] ; then
printf "%s\n" "$SCRIPT: $FUNCNAME: Value $2 must not contain $SEPARATOR" >&2
return 129
fi
ITEM=`new_table_pos "$1"`
shopt -u -o nounset # empty spots will cause error
while [ -n "${TABLE[$ITEM]}" ] ; do # free position yet
printf "%s\n" "Position $ITEM for $1 in use, moving forward ..."
let "ITEM=ITEM+1" # if not, keep looking
done
shopt -s -o nounset # safe to use now
TABLE[$ITEM]="$1""$SEPARATOR""$2" # add the pair to the table
#printf "%s\n" "Added $1 / $2 at $ITEM"
return 0
}
readonly -f add_to_hash
# --- lookup_hash: search for and display item matching key in the hash table ---
function lookup_hash
{
declare -i ITEM=0 # position in array
declare KEY
declare VALUE
# They key must not contain the separator
TEMP="${1/$SEPARATOR/_}" # remove the separator (if any)
if [ "$1" != "$TEMP" ] ; then
printf "%s\n" "$SCRIPT: $FUNCNAME: Key must not contain $SEPARATOR" >&2
return 129
fi
ITEM=`new_table_pos "$1"`
shopt -u -o nounset # empty spots will cause errors
while [ -n "${TABLE[$ITEM]}" ] ; do # reached an empty spot?
KEY="${TABLE[$ITEM]}" # if not, get the pair
KEY="${KEY%$SEPARATOR*}" # extract the key part
if [ "$KEY" = "$1" ] ; then # is it the key we want?
break; # good, it matches
else # otherwise
let "ITEM=ITEM+1" # move to the next item
fi
done
# Display search results
if [ -z "${TABLE[$ITEM]}" ] ; then
#printf "%s\n" "$1 is not in the table"
return 129
else
VALUE="${TABLE[$ITEM]}"
VALUE="${VALUE#*$SEPARATOR}"
lookup_hash_RETURN="$VALUE"
#printf "%s\n" "$1 has the value $VALUE"
return 0
fi
}
readonly -f lookup_hash
# Main script begins
if [ ! -x "$MD5SUM" ] ; then
printf "Unable to run command %s\n" "$MD5SUM" >&2
exit 129
fi
if [ $# -ne 1 ] ; then
printf "%s\n" "usage: $SCRIPT directory_path"
exit 129
fi
# Iterate regular files and escape spaces with '[SPACE]'
for ITEM in `find "$1" -type f | sed -e 's/ /[SPACE]/g'` ; do
# unescape spaces
declare FILEPATH
FILEPATH=`echo $ITEM | sed -e 's/\[SPACE\]/ /g'`
if [ -f "$FILEPATH" ] ; then
#declare FILENAME
declare HEXCODE
hash_of_file "$FILEPATH"
HEXCODE="$hash_of_file_RETURN"
# Get the text after the last '/'
# which is just the file name.
#FILENAME=${FILEPATH##*/}
#printf "The file name is: %s\n" "$FILENAME"
lookup_hash "$HEXCODE"
if [ $? -eq 0 ] ; then
printf "\n\nDuplicate for %s: \n%s\n%s\n\n" "$HEXCODE" "$lookup_hash_RETURN" "$FILEPATH"
else
add_to_hash "$HEXCODE" "$FILEPATH"
if [ $? -ne 0 ] ; then
printf "%s\n" "Unable to add item to table: $FILEPATH"
exit 129
fi
fi
fi
done
exit 0
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment