Last active
November 23, 2020 21:03
-
-
Save milkersarac/6366400 to your computer and use it in GitHub Desktop.
Near-duplicate image detection and deletion script. It relies on a second script, findimagedupes.pl.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Near-duplicate image detection and deletion.
#
# Compares every pair of files matched by $FILES using the Perl helper
# 'findimagedupes.pl', which is looked up in the current working directory.
# When the score extracted from the helper's output for a pair is greater
# than $THRESHOLD (80), the second file of the pair is deleted.
#
# It is recommended to run the Perl script by hand first to see that it works:
#   $ perl findimagedupes.pl image1.jpg image2.jpg
#
# Side effects in the current directory:
#   tobedeleted.txt - one line per deleted file (its path)
#   tobelogged.txt  - one line per hit with the score and both file names
# Previous copies of both files are removed at start-up; they are only
# re-created when at least one duplicate is found.
#
# WARNING: this script deletes files. Be careful on your first run and take
# backups as needed. No kind of warranty is provided.
#
# author: @milkers 28.08.2013

DELETED=tobedeleted.txt
LOGGED=tobelogged.txt

# Please correct the directory below before running the script.
# This is a single glob pattern; the directory part may contain spaces.
FILES="/home/user/Desktop/folder/*"

# A pair counts as duplicate when its score is strictly greater than this.
THRESHOLD=80

#######################################
# Extract the similarity score from the helper's output.
# Newlines are folded into spaces first, then the run of digits following
# the last "be " is printed.
# NOTE(review): the exact wording of findimagedupes.pl's output is not
# visible here; the "be <number>" pattern is taken from the original sed
# expression - confirm against the helper.
# Arguments: $1 - raw text printed by findimagedupes.pl
# Outputs:   the digits on stdout; nothing if "be " does not occur or is
#            not followed by a digit
#######################################
extract_similarity() {
  printf '%s' "$1" \
    | tr '\n' ' ' \
    | sed -n 's/.*be \([0-9]*\).*/\1/p'
}

#######################################
# Fill the global ARRAY (1-based) with every path matched by $FILES.
# Globals: FILES (read), ARRAY (written)
#######################################
collect_files() {
  # An empty IFS switches off word splitting while keeping pathname
  # expansion, so a directory name containing spaces is not torn apart.
  local IFS=
  local candidate
  local count=0

  ARRAY=()
  for candidate in $FILES; do
    # An unmatched glob expands to the literal pattern; do not count it.
    [[ -e "$candidate" || -L "$candidate" ]] || continue
    count=$((count + 1))
    ARRAY[count]=$candidate
  done
}

#######################################
# Compare all pairs (i, j) with i < j and delete file j on a hit.
# Globals: ARRAY, THRESHOLD, DELETED, LOGGED (read)
# Outputs: progress messages on stdout, warnings on stderr
#######################################
main() {
  local i j total out result

  # -f: a missing file from a previous run is not an error.
  rm -f -- "$DELETED" "$LOGGED"

  collect_files
  total=${#ARRAY[@]}
  echo "Array size is $total"

  for ((i = 1; i < total; i++)); do
    # Entries may be directories, or files already deleted as duplicates of
    # an earlier entry.
    if [[ ! -f "${ARRAY[i]}" ]]; then
      echo "File not found"
      continue
    fi
    echo "i: $i -- ${ARRAY[i]}"

    for ((j = i + 1; j <= total; j++)); do
      if [[ ! -f "${ARRAY[j]}" ]]; then
        echo "Inner file not found"
        continue
      fi

      out=$(perl findimagedupes.pl "${ARRAY[i]}" "${ARRAY[j]}")
      result=$(extract_similarity "$out")

      # Never feed unvalidated helper output to arithmetic evaluation.
      [[ "$result" =~ ^[0-9]+$ ]] || continue

      # 10# forces base 10 so a score such as "08" is not parsed as octal.
      if ((10#$result > THRESHOLD)); then
        printf '%s\n' "${ARRAY[j]}" >> "$DELETED"
        echo "$result > $THRESHOLD - i: $i=${ARRAY[i]} or j: $j=${ARRAY[j]} will be deleted" >> "$LOGGED"
        rm -- "${ARRAY[j]}" \
          || echo "Could not delete ${ARRAY[j]}" >&2
        echo "-HIT! $result > $THRESHOLD i: $i=${ARRAY[i]} and j: $j=${ARRAY[j]}"
      fi
    done
  done
}

main "$@"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment