@DataWraith
Created April 6, 2010 17:16
Script to recursively hash the contents of the current directory, with a nice progress display
#!/bin/zsh
# hash_dir.zsh recursively hashes the contents of the current directory.
#
# It is meant for use with file collections that are occasionally added to, and
# will preserve older hashes if files are removed for some reason (for example,
# moved to a different storage medium).
#
# To avoid recalculating hashes every time it is run, it caches file hashes by
# full path, size and mtime.
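#
# Each cache entry is a single tab-separated line of the form
#
#   <hash> <size-in-bytes> <mtime> <full path>
#
# (see the stat call and the append to $CACHEFILE below).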
#
# Needs the pipeview utility (pv) and the {md5, sha1, sha256, ...}sum utility.
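#
# Typical use (the directory name is only an example): run the script from
# inside the directory you want to hash, e.g.
#
#   cd ~/photos
#   zsh /path/to/hash_dir.zsh    # writes/updates ./photos.sha256sum
#
# New hashes are cached under ~/.cache/ (see CACHEFILE below).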
HASH="sha256"
PWD=$(readlink -f "$(pwd)")
BASENAME=`basename "${PWD}"`
DIGESTFILE=${BASENAME}.${HASH}sum
CACHEFILE="$HOME/.cache/hash_dir.$HASH.cache"
CACHE_LINES=5000
# Make sure the needed utilities are available
PV=`which pv`
if [ $? -gt 0 ]
then
echo "Pipeview utility not found!"
exit 1
fi
HASHPROG=`which ${HASH}sum`
if [ $? -gt 0 ]
then
echo "${HASH}sum utility not found!"
exit 1
fi
# Temporary files: TMP1 collects this run's hashes, TMP2 holds the merge result
TMP1=`mktemp` || exit 1
TMP2=`mktemp` || exit 1
# Create the cache and digest files if they do not exist yet
touch "$CACHEFILE"
touch "$DIGESTFILE"
# Make sure we can write to the DIGESTFILE
if ! [ -w "$DIGESTFILE" ]
then
  chmod ugo+w "${DIGESTFILE}"
  if [ $? -gt 0 ]
  then
    echo "${DIGESTFILE} is not writable!"
    exit 1
  fi
fi
# Make sure the cache is writable
if ! [ -w "$CACHEFILE" ]
then
echo "$CACHEFILE is not writable!"
fi
(
  IFS=$'\0'
  echo $PWD
  # Find files to hash
  # The head --bytes=-1 is needed to chop off the last \0
  FILES=($(find $(readlink -f "$PWD") -type f -not -name "${DIGESTFILE}" \
      -not -name ".directory" -not -name "*~" -print0 | sort -z | head \
      --bytes=-1 - ))
  FILES_LEN=${#FILES[@]}
  for i in {1..$FILES_LEN}
  do
    # See if the file is cached (the key is size, mtime and full path)
    CACHE_KEY=$(stat --printf="%s\t%Y\t%n" "${FILES[$i]}")
    # -F: the cache key is a literal string, not a regular expression
    CACHE_ENTRY=$(grep -F -m 1 -s "$CACHE_KEY" "$CACHEFILE")
    echo -n "[$i/${FILES_LEN}] "
    if [ -z "$CACHE_ENTRY" ]; then
      # Nope, it's not cached, calculate the hash
      echo "Hashing ${FILES[$i]}..."
      FILE_HASH=$($PV "${FILES[$i]}" | ${HASHPROG} | cut -d" " -f1)
      echo -e "$FILE_HASH\t$CACHE_KEY" >> "$CACHEFILE"
      echo
    else
      # Yep, use the cached hash.
      echo "${FILES[$i]} cached."
      echo
      FILE_HASH=$(echo "$CACHE_ENTRY" | cut -f1)
    fi
    echo "$FILE_HASH ${FILES[$i]##$PWD/}" >> "${TMP1}"
    # Set file to read-only (after all, we don't want the hash changed, do we?)
    chmod ugo-w "${FILES[$i]}"
  done
)
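# At this point TMP1 holds one "<hash> <path relative to $PWD>" line for every
# file processed in this run.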
# Merge old DIGESTFILE with new hashes
sort -u --key=2 "${TMP1}" "${DIGESTFILE}" > "${TMP2}" &&
cp -f "${TMP2}" "${DIGESTFILE}" && chmod ugo-w "${DIGESTFILE}"
# Remove temporary files
rm -f -- "${TMP1}" "${TMP2}"
# Trim the Cache
tail --lines="$CACHE_LINES" "$CACHEFILE" > "$CACHEFILE.tmp" &&
mv -f -- "$CACHEFILE.tmp" "$CACHEFILE"