#!/bin/bash
#
# dedupe.sh - estimate how much space deduplication would save.
#
# Usage: ./dedupe.sh [path]
#
# Scans every top-level entry of [path] whose name matches server<digits>,
# hashes each regular file below the size limit with sha1sum, and reports
# how many distinct contents occur more than once and how many bytes the
# redundant copies occupy, both in MB and as a percentage of the disk usage
# of [path] as reported by du.
#
# No `set -e`: every failure that matters is checked explicitly and is
# reported through main's return status, which becomes the script's exit
# status because `main "$@"` is the last command.

# Upper bound handed to `find -size -N`.
readonly MAX_SIZE="25M"
# Extended regex; paths matching it are not hashed.
readonly FILTER='\.(mca|log|gz)$'
# Path of the temporary "<sha1> <size>" list; set by main, removed by
# dedupe_cleanup.
DEDUPE_HASH_FILE=""

#######################################
# Remove the temporary hash list if one was created.
# Globals:   DEDUPE_HASH_FILE (read)
# Returns:   always 0
#######################################
dedupe_cleanup() {
  if [[ -n "${DEDUPE_HASH_FILE:-}" ]]; then
    rm -f -- "$DEDUPE_HASH_FILE"
  fi
  return 0
}

#######################################
# Draw a 30-column progress bar on the current terminal line.
# Arguments: $1 - number of files processed so far
#            $2 - total number of files
# Outputs:   "\r[====      ] done / total" on stdout, without a newline
#######################################
dedupe_progress_bar() {
  local done_count=$1 total=$2
  local -r width=30
  local filled bar

  # Nothing sensible to draw (and no division by zero) for an empty run.
  (( total > 0 )) || return 0

  filled=$(( done_count * width / total ))
  if (( filled > width )); then
    filled=$width
  fi

  # Build a run of `filled` spaces, then turn every space into '='.
  printf -v bar '%*s' "$filled" ''
  bar=${bar// /=}
  printf '\r[%-*s] %d / %d' "$width" "$bar" "$done_count" "$total"
}

#######################################
# Summarise a hash list in a single pass.
# Arguments: $1 - file with one "<hash> <size-in-bytes>" record per line
# Outputs:   "<dupes> <bytes>" on stdout, where <dupes> is the number of
#            hashes seen more than once and <bytes> is the space taken by
#            every copy beyond the first: sum of (count - 1) * size.
#######################################
dedupe_summarize() {
  awk '
    {
      count[$1]++
      if (!($1 in size)) size[$1] = $2
    }
    END {
      dupes = 0
      saved = 0
      for (h in count) {
        if (count[h] > 1) {
          dupes++
          saved += (count[h] - 1) * size[h]
        }
      }
      # %.0f keeps large byte totals out of exponent notation.
      printf "%d %.0f\n", dupes, saved
    }
  ' "$1"
}

#######################################
# Integer percentage of part relative to whole.
# Arguments: $1 - part, $2 - whole (same unit)
# Outputs:   part * 100 / whole, or 0 when whole is not positive
#######################################
dedupe_percent() {
  local part=$1 whole=$2
  if (( whole > 0 )); then
    echo $(( part * 100 / whole ))
  else
    echo 0
  fi
}

#######################################
# Entry point.
# Globals:   MAX_SIZE, FILTER (read); DEDUPE_HASH_FILE (written)
# Arguments: $1 - directory containing the server<digits> entries
# Outputs:   progress and the final report on stdout, errors on stderr
# Returns:   0 on success, 1 on usage or setup errors
#######################################
main() {
  if (( $# < 1 )); then
    echo "Usage: ./dedupe.sh [path]" >&2
    return 1
  fi

  local base=$1
  # Everything below works relative to the base directory, so a relative
  # [path] argument keeps working after this cd.
  cd -- "$base" || {
    printf 'dedupe: cannot enter directory: %s\n' "$base" >&2
    return 1
  }

  ## Get list of files
  printf 'Finding files... '
  local -r server_re='^server[0-9]+$'
  local -a roots=() files=()
  local entry path

  for entry in server*; do
    if [[ -e "$entry" && "$entry" =~ $server_re ]]; then
      roots+=("$entry")
    fi
  done

  # Only call find when there is something to search: with no start path
  # GNU find would fall back to scanning the current directory.
  if (( ${#roots[@]} > 0 )); then
    # NUL-delimited so paths with spaces or newlines survive intact.
    while IFS= read -r -d '' path; do
      # Paths containing a backslash are skipped, as in the original filter.
      if [[ "$path" == *\\* ]]; then
        continue
      fi
      if [[ "$path" =~ $FILTER ]]; then
        continue
      fi
      files+=("$path")
    done < <(find "${roots[@]}" -type f -size "-${MAX_SIZE}" -print0)
  fi

  local file_count=${#files[@]}
  echo "Done! Found $file_count."

  ## Compute hashes
  echo "Computing hashes... "
  DEDUPE_HASH_FILE=$(mktemp "${TMPDIR:-/tmp}/dedupe.XXXXXX") || {
    echo "dedupe: cannot create temporary file" >&2
    return 1
  }
  # Safety net so the temp file is removed even if the run is interrupted.
  trap dedupe_cleanup EXIT

  local processed=0 digest size
  for path in "${files[@]}"; do
    processed=$(( processed + 1 ))

    # A file may have vanished or be unreadable since find listed it; such
    # files are skipped. Hashing stdin keeps the file name out of the
    # sha1sum output, so only the digest needs to be extracted.
    if [[ -r "$path" ]] \
      && digest=$(sha1sum <"$path") \
      && size=$(wc -c <"$path"); then
      printf '%s %d\n' "${digest%% *}" "$size" >>"$DEDUPE_HASH_FILE"
    fi

    ## Display progress bar (every 10 files, and once more at the end)
    if (( processed % 10 == 0 || processed == file_count )); then
      dedupe_progress_bar "$processed" "$file_count"
    fi
  done
  echo

  ## Find duplicates
  printf 'Finding duplicates... '
  local dupe_count="" saved_bytes=""
  read -r dupe_count saved_bytes < <(dedupe_summarize "$DEDUPE_HASH_FILE")
  dupe_count=${dupe_count:-0}
  saved_bytes=${saved_bytes:-0}
  echo "Done! Found $dupe_count."

  ## Calculate storage savings
  printf 'Performing calculations... '
  local total_kb="" _rest=""
  # -k forces 1024-byte units so the value can be converted to bytes and
  # compared with saved_bytes in the same unit.
  read -r total_kb _rest < <(du -sk . 2>/dev/null)
  total_kb=${total_kb:-0}

  local savings_mb=$(( saved_bytes / 1024 / 1024 ))
  local perc
  perc=$(dedupe_percent "$saved_bytes" "$(( total_kb * 1024 ))")
  echo "Deduplicated savings: $savings_mb MB ($perc%)"

  ## Clean up
  dedupe_cleanup
  trap - EXIT
  return 0
}

main "$@"