Created
January 28, 2011 21:47
-
-
Save allthingscode/801053 to your computer and use it in GitHub Desktop.
This script identifies duplicate files using their md5sum hash value.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
# | |
# lsdup.sh | |
# | |
# create a hash table and look up items | |
# | |
# This script identifies duplicate files using their md5sum hash value. | |
# This can be done much easier in a language that supports hash lists such as PERL. I have done it in a shell script as both a proof-of-concept | |
# and to demonstrate the basic concept of storing/retrieving hash values and dealing with collisions. | |
# | |
# There are several good things demonstrated in this script: | |
# * implementing return values in bash functions | |
# * looping through files that contain spaces | |
# | |
# - Matthew Hayes <[email protected]> | |
# ================================================================================================= | |
shopt -s -o nounset # Option Explicit | |
declare -a TABLE # the hash table | |
declare -i TABLE_SIZE=65530 # size of the hash table | |
declare SEPARATOR="~" # delimiter for key/value | |
declare -r SCRIPT=${0##*/} # name of the script | |
declare -r MD5SUM='/usr/bin/md5sum' # used for hash generation | |
declare hash_of_file_RETURN # used for storing return value of hash_of_file | |
declare lookup_hash_RETURN # used for storing return value of lookup_hash | |
# ================================================================================================= | |
function new_table_pos | |
{ | |
declare HEXCODE # 4 hex digits of MD5 signature | |
declare -i CODE # the hex digits in decimal | |
HEXCODE=`echo "$1" | $MD5SUM` # compute the hash value (hex) | |
HEXCODE="${HEXCODE:0:4}" # take first 4 hex digits | |
CODE=`printf "%d\n" 0x"$HEXCODE"` # convert to 0 ... 65535 | |
printf "%d\n" "$((CODE%TABLE_SIZE))" # restrict to table size | |
} | |
readonly -f new_table_pos | |
function hash_of_file | |
{ | |
hash_of_file_RETURN=`$MD5SUM "$1"` # compute the hash value (hex) | |
hash_of_file_RETURN="${hash_of_file_RETURN:0:32}" # take first 32 hex digits (leaves off trailing garbage) | |
return 0 | |
} | |
readonly -f hash_of_file | |
# add_to_hash: add a key/value pair to the lookup hash | |
# (does not check for uniqueness) | |
# | |
# parameter 1: the key | |
# parameter 2: the associated value | |
function add_to_hash | |
{ | |
declare TEMP | |
declare -i ITEM=0 | |
# Check the number of parameters | |
if [ $# -ne 2 ] ; then | |
printf "%s\n" "$SCRIPT: $FUNCNAME: Expected two parameters" >&2 | |
return 129 | |
fi | |
# Make sure we have both a key and value | |
if [ -z "$1" ] ; then | |
printf "%s\n" "$SCRIPT: $FUNCNAME: cannot use empty key" >&2 | |
return 129 | |
fi | |
if [ -z "$2" ] ; then | |
printf "%s\n" "$SCRIPT: $FUNCNAME: cannot add empty value" >&2 | |
return 129 | |
fi | |
# Make sure the parameter is not in the key or value | |
TEMP="${1/$SEPARATOR/_}" # remove the separator (if any) | |
if [ "$1" != "$TEMP" ] ; then | |
printf "%s\n" "$SCRIPT: $FUNCNAME: Key $1 must not contain $SEPARATOR" >&2 | |
return 129 | |
fi | |
TEMP="${2/$SEPARATOR/_}" # remove the separator (if any) | |
if [ "$2" != "$TEMP" ] ; then | |
printf "%s\n" "$SCRIPT: $FUNCNAME: Value $2 must not contain $SEPARATOR" >&2 | |
return 129 | |
fi | |
ITEM=`new_table_pos "$1"` | |
shopt -u -o nounset # empty spots will cause error | |
while [ -n "${TABLE[$ITEM]}" ] ; do # free position yet | |
printf "%s\n" "Position $ITEM for $1 in use, moving forward ..." | |
let "ITEM=ITEM+1" # if not, keep looking | |
done | |
shopt -s -o nounset # safe to use now | |
TABLE[$ITEM]="$1""$SEPARATOR""$2" # add the pair to the table | |
#printf "%s\n" "Added $1 / $2 at $ITEM" | |
return 0 | |
} | |
readonly -f add_to_hash | |
# --- lookup_hash: search for and display item matching key in the hash table --- | |
function lookup_hash | |
{ | |
declare -i ITEM=0 # position in array | |
declare KEY | |
declare VALUE | |
# They key must not contain the separator | |
TEMP="${1/$SEPARATOR/_}" # remove the separator (if any) | |
if [ "$1" != "$TEMP" ] ; then | |
printf "%s\n" "$SCRIPT: $FUNCNAME: Key must not contain $SEPARATOR" >&2 | |
return 129 | |
fi | |
ITEM=`new_table_pos "$1"` | |
shopt -u -o nounset # empty spots will cause errors | |
while [ -n "${TABLE[$ITEM]}" ] ; do # reached an empty spot? | |
KEY="${TABLE[$ITEM]}" # if not, get the pair | |
KEY="${KEY%$SEPARATOR*}" # extract the key part | |
if [ "$KEY" = "$1" ] ; then # is it the key we want? | |
break; # good, it matches | |
else # otherwise | |
let "ITEM=ITEM+1" # move to the next item | |
fi | |
done | |
# Display search results | |
if [ -z "${TABLE[$ITEM]}" ] ; then | |
#printf "%s\n" "$1 is not in the table" | |
return 129 | |
else | |
VALUE="${TABLE[$ITEM]}" | |
VALUE="${VALUE#*$SEPARATOR}" | |
lookup_hash_RETURN="$VALUE" | |
#printf "%s\n" "$1 has the value $VALUE" | |
return 0 | |
fi | |
} | |
readonly -f lookup_hash | |
# Main script begins | |
if [ ! -x "$MD5SUM" ] ; then | |
printf "Unable to run command %s\n" "$MD5SUM" >&2 | |
exit 129 | |
fi | |
if [ $# -ne 1 ] ; then | |
printf "%s\n" "usage: $SCRIPT directory_path" | |
exit 129 | |
fi | |
# Iterate regular files and escape spaces with '[SPACE]' | |
for ITEM in `find "$1" -type f | sed -e 's/ /[SPACE]/g'` ; do | |
# unescape spaces | |
declare FILEPATH | |
FILEPATH=`echo $ITEM | sed -e 's/\[SPACE\]/ /g'` | |
if [ -f "$FILEPATH" ] ; then | |
#declare FILENAME | |
declare HEXCODE | |
hash_of_file "$FILEPATH" | |
HEXCODE="$hash_of_file_RETURN" | |
# Get the text after the last '/' | |
# which is just the file name. | |
#FILENAME=${FILEPATH##*/} | |
#printf "The file name is: %s\n" "$FILENAME" | |
lookup_hash "$HEXCODE" | |
if [ $? -eq 0 ] ; then | |
printf "\n\nDuplicate for %s: \n%s\n%s\n\n" "$HEXCODE" "$lookup_hash_RETURN" "$FILEPATH" | |
else | |
add_to_hash "$HEXCODE" "$FILEPATH" | |
if [ $? -ne 0 ] ; then | |
printf "%s\n" "Unable to add item to table: $FILEPATH" | |
exit 129 | |
fi | |
fi | |
fi | |
done | |
exit 0 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment