#!/bin/bash

# Bail out with a usage hint unless at least one argument (the directory to
# scan) was supplied.
if (( $# < 1 )); then
    printf '%s\n' "Usage: ./dedupe.sh [path]"
    exit 1
fi

## Configuration
# Directory to scan, taken from the first command-line argument.
BASE=$1
# Size bound handed to `find -size -N`: files at or above it are not scanned.
MAX_SIZE="25M"
# Extended regex; paths matching it are excluded from the scan.
FILTER='\.(mca|log|gz)$'

# The file scan below runs relative to the current directory, so a failed cd
# must abort: carrying on would scan (and report on) whatever directory the
# script was started from.
if [[ ! -d "$BASE" ]] || ! cd -- "$BASE"; then
    echo "dedupe: cannot change directory to '$BASE'" >&2
    exit 1
fi

## Get list of files
echo -n "Finding files... "

#######################################
# Print, one path per line, the regular files to be hashed.
# Only entries of the current directory named server<digits> are searched.
# Globals:   MAX_SIZE (read) - bound passed to `find -size -N`
#            FILTER   (read) - extended regex of paths to exclude
# Outputs:   matching paths on stdout (nothing if no serverN entry exists)
#######################################
list_candidate_files() {
    local entry
    local name_re='^server[0-9]+$'
    local -a roots=()

    # Glob instead of parsing `ls`; an unmatched glob stays literal and is
    # rejected by the -e test.
    for entry in server*; do
        if [[ -e "$entry" && "$entry" =~ $name_re ]]; then
            roots+=("$entry")
        fi
    done

    # Without a start path GNU find falls back to "." and would list every
    # file in the current directory, so stop here when nothing matched.
    if (( ${#roots[@]} == 0 )); then
        return 0
    fi

    # Paths containing a backslash are dropped, exactly as before.
    # NOTE(review): presumably because sha1sum escapes such names in its
    # output, which the later parsing steps do not undo - confirm.
    find "${roots[@]}" -type f -size "-$MAX_SIZE" \
        | grep -vF '\' \
        | grep -vE -- "$FILTER"
}

files=$(list_candidate_files)

# An empty list still yields one (empty) line when piped to `wc -l`, so the
# zero case has to be handled explicitly.
if [[ -n "$files" ]]; then
    file_count=$(grep -c '' <<< "$files")
else
    file_count=0
fi

echo "Done! Found $file_count."

## Compute hashes
echo "Computing hashes... "

hash_count=0
# Fresh, private temp file instead of a fixed name in /tmp: a predictable
# path can be pre-created by another user or clobbered by a concurrent run.
hash_output=$(mktemp /tmp/dedupe.XXXXXX) || {
    echo "dedupe: cannot create temporary file" >&2
    exit 1
}

#######################################
# Redraw the 30-column progress bar on the current terminal line.
# Arguments: $1 - number of files processed so far
#            $2 - total number of files
# Outputs:   "\r[====    ] DONE / TOTAL" on stdout, without a newline;
#            nothing when the total is zero (avoids dividing by zero)
#######################################
draw_progress() {
    local done_count=$1 total=$2
    local width=30 filled bar pad

    (( total > 0 )) || return 0

    filled=$(( done_count * width / total ))
    if (( filled > width )); then
        filled=$width
    fi

    # Build both runs with printf padding rather than one echo per character.
    printf -v bar '%*s' "$filled" ''
    printf -v pad '%*s' "$(( width - filled ))" ''
    printf '\r[%s%s] %d / %d' "${bar// /=}" "$pad" "$done_count" "$total"
}

#######################################
# Hash every existing file named on stdin (one path per line).
# Reading line by line keeps paths containing whitespace or glob characters
# intact, which an unquoted `for file in $files` does not.
# Globals:   hash_output (read)    - file the sha1sum lines are appended to
#            hash_count  (written) - incremented once per processed file
#            file_count  (read)    - total used for the progress bar
# Outputs:   progress bar on stdout after every 10th file
#######################################
hash_files() {
    local file

    while IFS= read -r file; do
        [[ -e "$file" ]] || continue

        # Unreadable files are skipped silently but still counted, so the
        # progress counter keeps tracking the file list.
        sha1sum -- "$file" 2>/dev/null >> "$hash_output"
        hash_count=$((hash_count + 1))

        if (( hash_count % 10 == 0 )); then
            draw_progress "$hash_count" "$file_count"
        fi
    done
}

hash_files <<< "$files"

echo

## Find duplicates
echo -n "Finding duplicates... "

#######################################
# List the checksums that occur more than once in a sha1sum-format file.
# Arguments: $1 - file with one "CHECKSUM  PATH" line per hashed file
# Outputs:   "COUNT CHECKSUM" lines on stdout, ordered by ascending count
#######################################
list_duplicate_hashes() {
    # Filtering on the numeric count ($1 > 1) replaces a regex that depended
    # on the exact padding `uniq -c` puts in front of the count.
    awk '{ print $1 }' < "$1" \
        | sort \
        | uniq -c \
        | sort -n \
        | awk '$1 > 1 { print $1" "$2 }'
}

# Fresh, private temp file instead of a fixed name in /tmp: a predictable
# path can be pre-created by another user or clobbered by a concurrent run.
dupe_output=$(mktemp /tmp/dedupe-dupes.XXXXXX) || {
    echo "dedupe: cannot create temporary file" >&2
    exit 1
}
list_duplicate_hashes "$hash_output" > "$dupe_output"

# Reading from stdin keeps the file name out of wc's output; the arithmetic
# expansion strips any padding wc adds.
dupe_count=$(( $(wc -l < "$dupe_output") ))

echo "Done! Found $dupe_count."

## Calculate storage savings
echo -n "Performing calculations... "

#######################################
# Sum the bytes taken up by redundant copies.
# For each duplicated checksum, the size of one file carrying it is
# multiplied by (COUNT - 1), i.e. every copy except the one that stays.
# Arguments: $1 - file of "COUNT CHECKSUM" lines
#            $2 - sha1sum-format file ("CHECKSUM  PATH" per line); paths are
#                 resolved relative to the current directory
# Outputs:   total number of reclaimable bytes on stdout
#######################################
reclaimable_bytes() {
    local dupes=$1 hashes=$2
    local count checksum line file size
    local total=0

    while read -r count checksum; do
        [[ -n "$checksum" ]] || continue

        line=$(grep -m 1 -- "^$checksum " "$hashes") || continue
        # Drop "CHECKSUM" plus the two separator characters and keep the rest
        # of the line, so paths containing spaces survive intact.
        file=${line#* ?}

        # The file may have vanished since it was hashed.
        [[ -f "$file" ]] || continue
        size=$(wc -c < "$file") || continue

        total=$(( total + (count - 1) * size ))
    done < "$dupes"

    echo "$total"
}

#######################################
# Express a byte count as a whole percentage of a size given in KiB.
# Arguments: $1 - number of bytes
#            $2 - reference size in 1024-byte units (as printed by `du -k`)
# Outputs:   integer percentage on stdout; 0 when the reference size is 0
#######################################
savings_percent() {
    local bytes=$1 total_kb=$2

    if (( total_kb > 0 )); then
        echo $(( bytes * 100 / (total_kb * 1024) ))
    else
        echo 0
    fi
}

savings_bytes=$(reclaimable_bytes "$dupe_output" "$hash_output")
savings=$((savings_bytes / 1024 / 1024))

# The script has already changed into BASE and the hashed paths are relative
# to it, so "." is the scanned tree; "$BASE" itself would resolve wrongly
# here whenever it was given as a relative path. -k pins du's unit to KiB.
total_dir_kb=$(du -sk . | awk '{ print $1 }')

# Compare bytes with bytes; dividing the MB figure by du's block count would
# understate the percentage by roughly a factor of 1024.
perc=$(savings_percent "$savings_bytes" "${total_dir_kb:-0}")
echo "Deduplicated savings: $savings MB ($perc%)"

## Clean up
# Remove both scratch files. Quoting keeps each path a single argument and
# `--` stops rm from reading a path as an option.
rm -f -- "$hash_output" "$dupe_output"