allthingscode · January 28, 2011 21:47
diff --git a/Find Duplicate Files b/Find Duplicate Files
 #!/bin/bash
 #
 # lsdup.sh
 #
 # create a hash table and look up items
 #
 # This script identifies duplicate files using their md5sum hash value. 
 # This can be done much easier in a language that supports hash lists such as PERL. I have done it in a shell script as both a proof-of-concept 
 # and to demonstrate the basic concept of storing/retrieving hash values and dealing with collisions. 
 # 
 # There are several good things demonstrated in this script:
 # * implementing return values in bash functions
 # * looping through files that contain spaces
 #
 # - Matthew Hayes <[email protected]>
 # =================================================================================================
 shopt -s -o nounset                     # Option Explicit
 
 declare -a  TABLE                       # the hash table
 declare -i  TABLE_SIZE=65530            # size of the hash table
 declare     SEPARATOR="~"               # delimiter for key/value
 declare -r  SCRIPT=${0##*/}             # name of the script
 declare -r  MD5SUM='/usr/bin/md5sum'    # used for hash generation
 
 declare hash_of_file_RETURN             # used for storing return value of hash_of_file
 declare lookup_hash_RETURN              # used for storing return value of lookup_hash
 # =================================================================================================
 
 
 function new_table_pos
 {
    declare HEXCODE                 # 4 hex digits of MD5 signature
    declare -i CODE                 # the hex digits in decimal
 
    HEXCODE=`echo "$1" | $MD5SUM`           # compute the hash value (hex)
    HEXCODE="${HEXCODE:0:4}"                # take first 4 hex digits
    CODE=`printf "%d\n" 0x"$HEXCODE"`       # convert to 0 ... 65535
    printf "%d\n" "$((CODE%TABLE_SIZE))"    # restrict to table size
 }
 readonly -f new_table_pos
 
 
 function hash_of_file
 {
    hash_of_file_RETURN=`$MD5SUM "$1"`                  # compute the hash value (hex)
    hash_of_file_RETURN="${hash_of_file_RETURN:0:32}"   # take first 32 hex digits (leaves off trailing garbage)
    return 0
 }
 readonly -f hash_of_file
 
 
 
 
 # add_to_hash: add a key/value pair to the lookup hash
 #              (does not check for uniqueness)
 #
 # parameter 1: the key
 # parameter 2: the associated value
 function add_to_hash
 {
    declare TEMP
    declare -i ITEM=0
 
    # Check the number of parameters
    if [ $# -ne 2 ] ; then
        printf "%s\n" "$SCRIPT: $FUNCNAME: Expected two parameters" >&2
        return 129
    fi
 
    # Make sure we have both a key and value
    if [ -z "$1" ] ; then
        printf "%s\n" "$SCRIPT: $FUNCNAME: cannot use empty key" >&2
        return 129
    fi
    if [ -z "$2" ] ; then
        printf "%s\n" "$SCRIPT: $FUNCNAME: cannot add empty value" >&2
        return 129
    fi
 
    # Make sure the parameter is not in the key or value
    TEMP="${1/$SEPARATOR/_}"                    # remove the separator (if any)
    if [ "$1" != "$TEMP" ] ; then
        printf "%s\n" "$SCRIPT: $FUNCNAME: Key $1 must not contain $SEPARATOR" >&2
        return 129
    fi
    TEMP="${2/$SEPARATOR/_}"                    # remove the separator (if any)
    if [ "$2" != "$TEMP" ] ; then
        printf "%s\n" "$SCRIPT: $FUNCNAME: Value $2 must not contain $SEPARATOR" >&2
        return 129
    fi
 
    ITEM=`new_table_pos "$1"`
 
    shopt -u -o nounset                         # empty spots will cause error
    while [ -n "${TABLE[$ITEM]}" ] ; do         # free position yet
        printf "%s\n" "Position $ITEM for $1 in use, moving forward ..."
        let "ITEM=ITEM+1"                       # if not, keep looking
    done
    shopt -s -o nounset                         # safe to use now
 
    TABLE[$ITEM]="$1""$SEPARATOR""$2"           # add the pair to the table
    #printf "%s\n" "Added $1 / $2 at $ITEM"
 
        return 0
 }
 readonly -f add_to_hash
 
 
 
 # --- lookup_hash: search for and display item matching key in the hash table ---
 function lookup_hash
 {
    declare -i ITEM=0                           # position in array
    declare KEY
    declare VALUE
 
    # They key must not contain the separator
 
    TEMP="${1/$SEPARATOR/_}"                    # remove the separator (if any)
    if [ "$1" != "$TEMP" ] ; then
        printf "%s\n" "$SCRIPT: $FUNCNAME: Key must not contain $SEPARATOR" >&2
        return 129
    fi
 
    ITEM=`new_table_pos "$1"`
 
    shopt -u -o nounset                         # empty spots will cause errors
    while [ -n "${TABLE[$ITEM]}" ] ; do         # reached an empty spot?
        KEY="${TABLE[$ITEM]}"                   # if not, get the pair
        KEY="${KEY%$SEPARATOR*}"                # extract the key part
        if [ "$KEY" = "$1" ] ; then             # is it the key we want?
            break;                              # good, it matches
        else                                    # otherwise
            let "ITEM=ITEM+1"                   # move to the next item
        fi
    done
 
    # Display search results
    if [ -z "${TABLE[$ITEM]}" ] ; then
        #printf "%s\n" "$1 is not in the table"
        return 129
    else
        VALUE="${TABLE[$ITEM]}"
        VALUE="${VALUE#*$SEPARATOR}"
        lookup_hash_RETURN="$VALUE"
        #printf "%s\n" "$1 has the value $VALUE"
        return 0
    fi
 }
 readonly -f lookup_hash
 
 
 
 
 # Main script begins
 
 if [ ! -x "$MD5SUM" ] ; then
    printf "Unable to run command %s\n" "$MD5SUM" >&2
    exit 129
 fi
 
 if [ $# -ne 1 ] ; then
    printf "%s\n" "usage: $SCRIPT directory_path"
    exit 129
 fi
 
 
 # Iterate regular files and escape spaces with '[SPACE]'
 for ITEM in `find "$1" -type f | sed -e 's/ /[SPACE]/g'` ; do
 
    # unescape spaces
    declare FILEPATH
    FILEPATH=`echo $ITEM | sed -e 's/\[SPACE\]/ /g'`
 
    if [ -f "$FILEPATH" ] ; then
        #declare FILENAME
        declare HEXCODE
 
        hash_of_file "$FILEPATH"
        HEXCODE="$hash_of_file_RETURN"
 
        # Get the text after the last '/'
        # which is just the file name.
        #FILENAME=${FILEPATH##*/}
        #printf "The file name is: %s\n" "$FILENAME"
 
        lookup_hash "$HEXCODE"
        if [ $? -eq 0 ] ; then
            printf "\n\nDuplicate for %s: \n%s\n%s\n\n" "$HEXCODE" "$lookup_hash_RETURN" "$FILEPATH"
        else
            add_to_hash "$HEXCODE" "$FILEPATH"
            if [ $? -ne 0 ] ; then
                printf "%s\n" "Unable to add item to table: $FILEPATH"
                exit 129
            fi
        fi
    fi
 done
 
 exit 0
	#!/bin/bash
	#
	# lsdup.sh
	#
	# create a hash table and look up items
	#
	# This script identifies duplicate files using their md5sum hash value.
	# This can be done much easier in a language that supports hash lists such as PERL. I have done it in a shell script as both a proof-of-concept
	# and to demonstrate the basic concept of storing/retrieving hash values and dealing with collisions.
	#
	# There are several good things demonstrated in this script:
	# * implementing return values in bash functions
	# * looping through files that contain spaces
	#
	# - Matthew Hayes <[email protected]>
	# =================================================================================================
	shopt -s -o nounset # Option Explicit

	declare -a TABLE # the hash table
	declare -i TABLE_SIZE=65530 # size of the hash table
	declare SEPARATOR="~" # delimiter for key/value
	declare -r SCRIPT=${0##*/} # name of the script
	declare -r MD5SUM='/usr/bin/md5sum' # used for hash generation

	declare hash_of_file_RETURN # used for storing return value of hash_of_file
	declare lookup_hash_RETURN # used for storing return value of lookup_hash
	# =================================================================================================


	function new_table_pos
	{
	declare HEXCODE # 4 hex digits of MD5 signature
	declare -i CODE # the hex digits in decimal

	HEXCODE=`echo "$1" \| $MD5SUM` # compute the hash value (hex)
	HEXCODE="${HEXCODE:0:4}" # take first 4 hex digits
	CODE=`printf "%d\n" 0x"$HEXCODE"` # convert to 0 ... 65535
	printf "%d\n" "$((CODE%TABLE_SIZE))" # restrict to table size
	}
	readonly -f new_table_pos


	function hash_of_file
	{
	hash_of_file_RETURN=`$MD5SUM "$1"` # compute the hash value (hex)
	hash_of_file_RETURN="${hash_of_file_RETURN:0:32}" # take first 32 hex digits (leaves off trailing garbage)
	return 0
	}
	readonly -f hash_of_file




	# add_to_hash: add a key/value pair to the lookup hash
	# (does not check for uniqueness)
	#
	# parameter 1: the key
	# parameter 2: the associated value
	function add_to_hash
	{
	declare TEMP
	declare -i ITEM=0

	# Check the number of parameters
	if [ $# -ne 2 ] ; then
	printf "%s\n" "$SCRIPT: $FUNCNAME: Expected two parameters" >&2
	return 129
	fi

	# Make sure we have both a key and value
	if [ -z "$1" ] ; then
	printf "%s\n" "$SCRIPT: $FUNCNAME: cannot use empty key" >&2
	return 129
	fi
	if [ -z "$2" ] ; then
	printf "%s\n" "$SCRIPT: $FUNCNAME: cannot add empty value" >&2
	return 129
	fi

	# Make sure the parameter is not in the key or value
	TEMP="${1/$SEPARATOR/_}" # remove the separator (if any)
	if [ "$1" != "$TEMP" ] ; then
	printf "%s\n" "$SCRIPT: $FUNCNAME: Key $1 must not contain $SEPARATOR" >&2
	return 129
	fi
	TEMP="${2/$SEPARATOR/_}" # remove the separator (if any)
	if [ "$2" != "$TEMP" ] ; then
	printf "%s\n" "$SCRIPT: $FUNCNAME: Value $2 must not contain $SEPARATOR" >&2
	return 129
	fi

	ITEM=`new_table_pos "$1"`

	shopt -u -o nounset # empty spots will cause error
	while [ -n "${TABLE[$ITEM]}" ] ; do # free position yet
	printf "%s\n" "Position $ITEM for $1 in use, moving forward ..."
	let "ITEM=ITEM+1" # if not, keep looking
	done
	shopt -s -o nounset # safe to use now

	TABLE[$ITEM]="$1""$SEPARATOR""$2" # add the pair to the table
	#printf "%s\n" "Added $1 / $2 at $ITEM"

	return 0
	}
	readonly -f add_to_hash



	# --- lookup_hash: search for and display item matching key in the hash table ---
	function lookup_hash
	{
	declare -i ITEM=0 # position in array
	declare KEY
	declare VALUE

	# They key must not contain the separator

	TEMP="${1/$SEPARATOR/_}" # remove the separator (if any)
	if [ "$1" != "$TEMP" ] ; then
	printf "%s\n" "$SCRIPT: $FUNCNAME: Key must not contain $SEPARATOR" >&2
	return 129
	fi

	ITEM=`new_table_pos "$1"`

	shopt -u -o nounset # empty spots will cause errors
	while [ -n "${TABLE[$ITEM]}" ] ; do # reached an empty spot?
	KEY="${TABLE[$ITEM]}" # if not, get the pair
	KEY="${KEY%$SEPARATOR*}" # extract the key part
	if [ "$KEY" = "$1" ] ; then # is it the key we want?
	break; # good, it matches
	else # otherwise
	let "ITEM=ITEM+1" # move to the next item
	fi
	done

	# Display search results
	if [ -z "${TABLE[$ITEM]}" ] ; then
	#printf "%s\n" "$1 is not in the table"
	return 129
	else
	VALUE="${TABLE[$ITEM]}"
	VALUE="${VALUE#*$SEPARATOR}"
	lookup_hash_RETURN="$VALUE"
	#printf "%s\n" "$1 has the value $VALUE"
	return 0
	fi
	}
	readonly -f lookup_hash




	# Main script begins

	if [ ! -x "$MD5SUM" ] ; then
	printf "Unable to run command %s\n" "$MD5SUM" >&2
	exit 129
	fi

	if [ $# -ne 1 ] ; then
	printf "%s\n" "usage: $SCRIPT directory_path"
	exit 129
	fi


	# Iterate regular files and escape spaces with '[SPACE]'
	for ITEM in `find "$1" -type f \| sed -e 's/ /[SPACE]/g'` ; do

	# unescape spaces
	declare FILEPATH
	FILEPATH=`echo $ITEM \| sed -e 's/\[SPACE\]/ /g'`

	if [ -f "$FILEPATH" ] ; then
	#declare FILENAME
	declare HEXCODE

	hash_of_file "$FILEPATH"
	HEXCODE="$hash_of_file_RETURN"

	# Get the text after the last '/'
	# which is just the file name.
	#FILENAME=${FILEPATH##*/}
	#printf "The file name is: %s\n" "$FILENAME"

	lookup_hash "$HEXCODE"
	if [ $? -eq 0 ] ; then
	printf "\n\nDuplicate for %s: \n%s\n%s\n\n" "$HEXCODE" "$lookup_hash_RETURN" "$FILEPATH"
	else
	add_to_hash "$HEXCODE" "$FILEPATH"
	if [ $? -ne 0 ] ; then
	printf "%s\n" "Unable to add item to table: $FILEPATH"
	exit 129
	fi
	fi
	fi
	done

	exit 0