Created
April 6, 2010 17:16
-
-
Save DataWraith/357830 to your computer and use it in GitHub Desktop.
Script to recursively hash the contents of the current directory, with a nice progress display
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/zsh | |
# hash_dir.zsh recursively hashes the contents of the current directory. | |
# | |
# It is meant for use with file collections that are occasionally added to, and | |
# will preserve older hashes if files are removed for some reason (for example, | |
# moved to a different storage medium). | |
# | |
# To avoid recalculating hashes every time it is run, it caches file hashes by | |
# full path, size and mtime. | |
# | |
# Needs the pipeview utility (pv) and the {md5, sha1, sha256, ...}sum utility. | |
HASH="sha256" | |
PWD=$(readlink -f "$(pwd)") | |
BASENAME=`basename "${PWD}"` | |
DIGESTFILE=${BASENAME}.${HASH}sum | |
CACHEFILE="$HOME/.cache/hash_dir.$HASH.cache" | |
CACHE_LINES=5000 | |
# Make sure the needed utilities are available | |
PV=`which pv` | |
if [ $? -gt 0 ] | |
then | |
echo "Pipeview utility not found!" | |
exit 1 | |
fi | |
HASHPROG=`which ${HASH}sum` | |
if [ $? -gt 0 ] | |
then | |
echo "${HASH}sum utility not found!" | |
exit 1 | |
fi | |
TMP1=`mktemp` || exit 1 | |
TMP2=`mktemp` || exit 1 | |
touch "$CACHEFILE" | |
touch "$DIGESTFILE" | |
# Make sure we can write to the DIGESTFILE | |
if ! [ -w "$DIGESTFILE" ] | |
then | |
chmod ugo+w "${DIGESTFILE}" | |
if [ $? -gt 0 ] | |
then | |
echo "${DIGESTFILE} is not writable!" | |
exit 1 | |
fi | |
fi | |
# Make sure the cache is writable | |
if ! [ -w "$CACHEFILE" ] | |
then | |
echo "$CACHEFILE is not writable!" | |
fi | |
( | |
IFS=$'\0' | |
echo $PWD | |
# Find files to hash | |
# The head --bytes=-1 is needed to chop off the last \0 | |
FILES=($(find $(readlink -f "$PWD") -type f -not -name "${DIGESTFILE}" \ | |
-not -name ".directory" -not -name "*~" -print0 | sort -z | head \ | |
--bytes=-1 - )) | |
FILES_LEN=${#FILES[@]} | |
for i in {1..$FILES_LEN} | |
do | |
# See if the file is cached | |
CACHE_KEY=$(stat --printf="%s\t%Y\t%n" "${FILES[$i]}") | |
CACHE_ENTRY=$(grep -m 1 -s "$CACHE_KEY" "$CACHEFILE") | |
echo -n "[$i/${FILES_LEN}] " | |
if [ -z "$CACHE_ENTRY" ]; then | |
# Nope, it's not cached, calculate hash | |
echo "Hashing ${FILES[$i]}..." | |
FILE_HASH=$($PV "${FILES[$i]}" | ${HASHPROG} | cut -d" " -f-1) | |
echo -e "$FILE_HASH\t$CACHE_KEY" >> "$CACHEFILE" | |
echo | |
else | |
# Yep, use the cached hash. | |
echo "${FILES[$i]} cached." | |
echo | |
FILE_HASH=$(echo "$CACHE_ENTRY" | cut -f1) | |
fi | |
echo "$FILE_HASH ${FILES[$i]##$PWD/}" >> "${TMP1}" | |
# Set file to read-only (after all, we don't want the hash changed, do we?) | |
chmod ugo-w "${FILES[$i]}" | |
done | |
) | |
# Merge old DIGESTFILE with new hashes | |
sort -u --key=2 "${TMP1}" "${DIGESTFILE}" > "${TMP2}" && | |
cp -f "${TMP2}" "${DIGESTFILE}" && chmod ugo-w "${DIGESTFILE}" | |
# Remove temporary files | |
rm -f -- "${TMP1}" "${TMP2}" | |
# Trim the Cache | |
tail --lines="$CACHE_LINES" "$CACHEFILE" > "$CACHEFILE.tmp" && | |
mv -f -- "$CACHEFILE.tmp" "$CACHEFILE" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment