Skip to content

Instantly share code, notes, and snippets.

@milkersarac
Created August 23, 2013 12:41
Show Gist options
  • Save milkersarac/6318909 to your computer and use it in GitHub Desktop.
Save milkersarac/6318909 to your computer and use it in GitHub Desktop.
#!/usr/bin/perl -w
# findimagedupes (name to be changed later)
# copyright 2001 rob kudla
# licensed under the GNU Public License version 2.0 or later
#
# strengths: recognizes similar pictures with 98% accuracy when
# the pictures actually have unique features; generates
# collection files for easy managing of dupes with
# gqview (hopefully pixie someday)
# weaknesses: lots of false positives on contact sheets and
# shots of things like ocean horizons, which all
# reduce to basically this:
#
# 1111111111111111
# 1111111111111111
# 1111111111111111
# 1111111111111111
# 1111111111111111
# 1111111111111111
# 1111111111111111
# 1111111111111111
# 0000000000000000
# 0000000000000000
# 0000000000000000
# 0000000000000000
# 0000000000000000
# 0000000000000000
# 0000000000000000
# 0000000000000000; slows down geometrically with
# larger image collections (>18 hours for 25144
# images, ~15 minutes for 2500)
#
# i've tried moving to an 8x8x8 bit array (twice the size but allows
# greyscale comparisons) and it actually causes more false positives,
# probably because 8x8 pixels just provides too little detail.
use Image::Magick;
#use strict;
use warnings;
use File::Basename;
my $VERSION = "0.1.3";
my $prog = $0 . '';
$prog = substr($prog,rindex($prog,'/') + 1) if rindex($prog,'/') >= 0;
my $image;
sub handleSEGV { die "caught segfault in getfingerprint()\n"; };
# check args
my %OPT;
while (my $arg = shift) {
if ($arg eq '-rescan') {
$OPT{'rescan'} = 1;
} elsif ($arg eq '-update') {
$OPT{'update'} = 1;
} elsif ($arg eq '-t') {
$OPT{'threshold'} = shift;
} elsif ($arg eq '-f') {
$OPT{'dbfile'} = shift;
} elsif ($arg eq '-?' || $arg eq '-h' || $arg eq '--help') {
$OPT{'help'} = 1;
} elsif ($arg eq '-d') {
$OPT{'scandir'} = shift;
} elsif ($arg eq '-v') {
$OPT{'viewpgm'} = shift;
} elsif ($arg eq '-c') {
$OPT{'gqvfile'} = shift;
} elsif ($arg eq '-p') {
$OPT{'printfp'} = 1;
} elsif ($arg eq '-verbose') {
$OPT{'verbose'} = 1;
} elsif ($arg eq '-include-hidden') {
$OPT{'include-hidden'} = 1;
} elsif ($arg eq '-g') {
$OPT{'guimode'} = 1;
} else { # assume it's a filename
if (!defined($OPT{'file1'})) {
$OPT{'file1'} = $arg;
} else {
$OPT{'file2'} = $arg;
}
}
}
# slap user if only one filename specified
$OPT{'help'} = 1 if (defined $OPT{'file1'} && !defined $OPT{'file2'});
# print help message if needed
if ($OPT{'help'}) {
print "$prog - Copyright 2001 Rob Kudla - http://www.kudla.org/raindog
This program is distributed under the terms of the GNU Public License;
see the file COPYING for details.
Usage: $prog [options] [<file1> <file2>]
Options:
-rescan = rescan fingerprints of all files in directory
-update = like rescan, but do not read Files already in DB File
and erase Entries not found any longer in Filesystem
-f <file> = use <file> as image fingerprint database
-d <dir> = scan <dir> instead of current directory
-t <num> = use <num> as threshold% of similarity (default 90)
-v <program> = launch <program> (in bg) to view each set of dupes
-c <file> = create GQView collection <file>.gqv of duplicates
<file1> <file2> = diff just those two files, using -v if present
(other options ignored if files are specified)
-p = only valid when files specified; prints the
hex of the actual fingerprint of each file.
-g = GUI mode: produce only machine-friendly output.
-verbose = Verbose Output\n";
exit 0;
}
# set up defaults
$OPT{'scandir'} = '.' unless defined $OPT{'scandir'};
$OPT{'dbfile'} = 'imagedupes-db.txt' unless defined $OPT{'dbfile'};
$OPT{'dbfile'} = "$OPT{'scandir'}/$OPT{'dbfile'}" unless $OPT{'dbfile'} =~ "/";
$OPT{'threshold'} = 90 unless defined $OPT{'threshold'} && $OPT{'threshold'} > 0;
$OPT{'curdir'} = `pwd`;
chop $OPT{'curdir'};
$OPT{'update'} ||=0;
$OPT{'rescan'} ||=0;
$OPT{'include-hidden'} ||= 0;
# set up gqvfile if needed
if (defined $OPT{'gqvfile'} && defined $OPT{'file1'}) {
$OPT{'gqvfile'} = "$OPT{'gqvfile'}.gqv" if $OPT{'gqvfile'} !~ /\.gqv$/;
open GQV, ">$OPT{'gqvfile'}";
print GQV "#GQView collection\n#Created with $prog version $VERSION\n";
close GQV;
}
# set up countbits array
my @countbits_arr;
for (my $i = 0; $i < 256; $i++) {
$countbits_arr[$i] = _countbits (chr($i));
}
# get columns if we can
my $cols;
if (!defined $OPT{'guimode'}) {
$cols = `tput cols`;
$cols += 0;
$cols = 80 if $cols == 0;
}
# scan files in if the user wants or there's no database now
if ( $OPT{'rescan'}>0 || $OPT{'update'}>0 ||
( (!-e $OPT{'dbfile'}) || (-s $OPT{'dbfile'} == 0)) &&
!defined $OPT{'file1'} ) {
print "Scanning fingerprints from $OPT{'scandir'} into $OPT{'dbfile'}.\n"
if !(defined $OPT{'guimode'});
# get whole tree
my $list = `find $OPT{'scandir'} -type f | sort`;
# TODO: We should use File::Find here
# get imagemagick object
$image = Image::Magick->new;
my $existing_summs={};
# create dbfile
if ( $OPT{'update'} ) {
open IMGFP, "<$OPT{'dbfile'}";
open IMGFPO, ">$OPT{'dbfile'}.sik";
while (my $line = <IMGFP> ) {
my ($file,$sum)=split(":",$line);
$existing_summs->{$file}=$sum;
if ( -s $file ) {
print IMGFPO $line;
} else {
print "Ignoring $line";
}
}
close IMGFP;
close IMGFPO;
unlink "$OPT{'dbfile'}" if -e "$OPT{'dbfile'}";
rename "$OPT{'dbfile'}.sik","$OPT{'dbfile'}";
open IMGFP, ">>$OPT{'dbfile'}";
} else {
open IMGFP, ">$OPT{'dbfile'}";
}
# put tree into array
my @list;
if ( $OPT{'include-hidden'} ) {
@list = split "\n", $list;
} else {
@list =
grep { $_ !~ m,/\., }
split "\n", $list;
}
# max value for our lame little statusbar
my $numfiles = $#list + 1;
# traverse the array.
foreach $file (@list) {
$curfile++;
next if $existing_summs->{$file};
# erase current screen line
system("tput el") if (!defined $OPT{'guimode'});
# build lame little status bar
if (defined $OPT{'guimode'}) {
$outputline = "Status::" . sprintf("%04d", $curfile) . "::" .
sprintf("%04d", $numfiles) . "::" .
sprintf("%03.2f", ($curfile/$numfiles) * 100);
print "$outputline\n";
} else {
$outputline = "[" . sprintf("%04d", $curfile) . "/" . sprintf("%04d", $numfiles)
. "] " . sprintf("%2.0f%%", ($curfile/$numfiles) * 100)
. statusbar($curfile, $numfiles) . "100% ";
$outputline .= substr($file, 0, $cols - length($outputline) - 1);
print "$outputline\n";
}
# move cursor back up a line
# I couldn't just use \r - it only updated like every 15 iterations
# apparently the linux console only refreshes on a \n
system("tput cuu1") unless defined $OPT{'guimode'} || defined $OPT{'verbose'};
# check what file thinks the file is
$format = '';
$filetype = `file "$file"`;
# # check for file types imagemagick is stupid about, which is
# # basically anything but a bitmap image
#
# unless ($file =~ /\.txt$/i || $file =~ /\.html$/i ||
# $filetype =~ /zip/i || $filetype =~ /mp3/i ||
# $filetype =~ /link/i || $filetype =~ /rpm/i ||
# $filetype =~ /execut/i || $filetype =~ /socket/i ||
# $filetype =~ /pipe/i || $filetype =~ /postscript/i ||
# $filetype =~ /pdf/i || $filetype =~ /mpeg/i ||
# $file =~ /\.man$/i ||
# $filetype =~ /text/i || $file =~ /\.htm$/i ) {
# or we could just assume the user has a good magic file.
# Of course imagemagick will still crap out on some animated GIFs.
# imagedups-db.txt =~ /image/ ..., so we want to avoid that.
if ($filetype =~ /\:.*image/i || $filetype =~ /\:.*bitmap/i) {
# Adjusted script to work with both old and new perlmagick
# Old perlmagick returns a CSV string, new returns an array
my @pingstring = $image->Ping($file);
if(@pingstring) {
if ($pingstring[0] =~ /,/) {
@pingstring = split (',', $pingstring[0]);
}
$format = $pingstring[3];
} # else leave $format == ''
}
# oh yeah, and just in case a text file slips through (crash!!)
if ($format ne '' && $format ne 'TXT') {
$img = &getfingerprint($image, $file);
# quote percents and colons in our db file.
$filename = $file;
$filename =~ s/\%/\%25/g;
$filename =~ s/\:/\%3A/g;
# only save if the image made a valid pbm.
if (defined($img) && length($img) > 0) {
print IMGFP "$filename:";
for (my $i = 0; $i < length($img); $i++) {
# convert each byte of pbm to a hex pair.
print IMGFP sprintf("%02x", ord(substr($img,$i,1)));
}
print IMGFP "\n";
} elsif (!defined($img)) {
warn "warning: unable to get fingerprint of $file\n";
}
}
}
close IMGFP;
print "\n" if !(defined $OPT{'guimode'});
}
# find dupes
if (defined $OPT{'file1'}) {
# do file1 and file2
$image = Image::Magick->new;
my $fp1 = getfingerprint($image, $OPT{'file1'})
or die "fatal: unable to get fingerprint of $OPT{'file1'}\n";
my $fp2 = getfingerprint($image, $OPT{'file2'})
or die "fatal: unable to get fingerprint of $OPT{'file2'}\n";
# xor the two binary strings to find differences
$fpdiff = $fp1 ^ $fp2;
# print fingerprints if -p specified.
if ($OPT{'printfp'}) {
my $i;
print "$OPT{'file1'}:";
for (my $i = 0; $i < length($fp1); $i++) {
# convert each byte of pbm to a hex pair.
print sprintf("%02x", ord(substr($fp1,$i,1)));
}
print "\n";
print "$OPT{'file2'}:";
for (my $i = 0; $i < length($fp2); $i++) {
# convert each byte of pbm to a hex pair.
print sprintf("%02x", ord(substr($fp2,$i,1)));
}
print "\n";
print "Difference:";
for (my $i = 0; $i < length($fpdiff); $i++) {
# convert each byte of pbm to a hex pair.
print sprintf("%02x", ord(substr($fpdiff,$i,1)));
}
print "\n";
}
# how many bits are different? number and %
$diffbits = countbits($fpdiff);
$diffpct = sprintf("%0.2f",(1-($diffbits/256))*100);
if (defined $OPT{'guimode'}) {
print "Dupe::$OPT{'file1'}::$OPT{'file2'}::$diffpct\n";
} else {
print "$OPT{'file1'} $OPT{'file2'}: seem to be $diffpct\% similar.\n";
}
# launch the viewer if the user wanted us to
if (defined($OPT{'viewpgm'})) {
system("$OPT{'viewpgm'} $key &");
system("$OPT{'viewpgm'} $keys[$j] &");
print "Press enter when done viewing. " if !(defined $OPT{'guimode'});
<STDIN>;
}
} else {
# do whole tree
open IMGFP, "<$OPT{'dbfile'}";
print "Finding duplicates in $OPT{'scandir'}, threshold $OPT{'threshold'}%.\n" if !(defined $OPT{'guimode'});
# load db into hash
while ($line = <IMGFP>) {
chop $line;
($key,$fp) = split(":",$line);
# remember, : and % are escaped
$key =~ s/\%3A/\:/g;
$key =~ s/\%25/\%/g;
$PFP{$key} = pack("H*", $fp);
}
@keys = keys %PFP;
my $i = 0;
my $bits_that_can_differ = 256 * (1 - $OPT{'threshold'} / 100 );
# traverse the hash
foreach $key (@keys) {
# generate lame little status bar
if (defined $OPT{'guimode'}) {
$outputline = "Status::" . sprintf("%04d", $i) . "::" .
sprintf("%04d", $#keys) . "::" .
sprintf("%03.2f", ($i/$#keys) * 100);
print "$outputline\n";
} else {
print "[" . sprintf("%04d", $i) . "/" . sprintf("%04d", $#keys) .
"] 0%" . statusbar($i, $#keys) . "100%\n";
}
# move the cursor up a line, see -rescan section
system("tput cuu1") if (!defined $OPT{'guimode'});
# check remainder of hash for close matches
for ( $j = $i + 1; $j <= $#keys; $j++) {
# read pbm data for both entries and unhex
my $fp1 = $PFP{$key};
my $fp2 = $PFP{$keys[$j]};
# xor the two binary strings to find differences
$fpdiff = $fp1 ^ $fp2;
# how many bits are different? number and %
$diffbits = countbits($fpdiff);
if ($diffbits <= $bits_that_can_differ) {
$diffpct = sprintf("%0.2f",(1-($diffbits/256))*100);
# blank line, we're going to tell the user something
if (defined $OPT{'guimode'}) {
print "Dupe:\:$key:\:$keys[$j]:\:$diffpct\n";
} else {
system("tput el");
print "$key $keys[$j]: seem to be $diffpct\% similar.\n";
}
# originally this was a log, now it writes out the gqvfile
&difflog($key) if not defined($ALREADYDIFF{$key});
&difflog($keys[$j]) if not defined($ALREADYDIFF{$keys[$j]});
# and makes sure to only write out each file once
$ALREADYDIFF{$key} = 1;
$ALREADYDIFF{$keys[$j]} = 1;
# launch the viewer if the user wanted us to
if (defined($OPT{'viewpgm'})) {
system("$OPT{'viewpgm'} $key &");
system("$OPT{'viewpgm'} $keys[$j] &");
print "Press enter when done viewing. ";
<STDIN>;
}
}
}
$i++;
}
# write out end of gqvfile, dunno if it's required but GQView does it
if (defined $OPT{'gqvfile'}) {
open GQV, ">>$OPT{'gqvfile'}";
print GQV "#end\n";
close GQV;
}
# close db file
close IMGFP;
print "\n";
}
undef $image;
sub base2 {
# base2: converts binary string to list of 1's and 0's not unlike
# pbm used to provide in text mode
my $inval = shift;
my $outval;
for (my $i = 0; $i < length($inval); $i++) {
for (my $j = 7; $j >= 0; $j--) {
$outval .= (ord(substr($inval,$i,1)) and (2 ** $j) ? 1 : 0);
}
}
$outval;
}
sub _countbits {
# countbits: counts the 1 bits in a binary string (doesn't use base2)
my $inval = shift;
my $outval = 0;
for (my $i = 0; $i < length($inval); $i++) {
for (my $j = 7; $j >= 0; $j--) {
my $bit = (ord(substr($inval,$i,1)) & (2 ** $j) ? 1 : 0);
$outval += $bit;
}
}
$outval;
}
sub countbits {
my $inval = shift;
my $outval = 0;
for (my $i = 0; $i < length($inval); $i++) {
$outval += $countbits_arr[ord(substr($inval,$i,1))];
}
$outval;
}
sub statusbar {
# statusbar: prints 0 to 50 dots based on $cur/$fin (arg0/arg1)
my $cur = shift;
my $fin = shift;
my $dots = int(($cur/$fin)*50);
my $blks = 50 - $dots;
my $outline = ("." x ($dots)) . (" " x $blks);
$outline;
}
sub debuglog {
# debuglog: writes to debug log.
my $arg = shift;
open DEBUGLOG, ">>findimagedupes-debug.txt";
print DEBUGLOG "$arg\n";
close DEBUGLOG;
}
sub difflog {
# difflog: used to be a debug thing, now handles the gqvfile output.
return undef if not defined($OPT{'gqvfile'});
my $arg = shift;
$arg =~ s/^\.\//$OPT{'curdir'}\//;
open DIFFLOG, ">>$OPT{'gqvfile'}";
print DIFFLOG qq^"$arg"\n^;
close DIFFLOG;
}
sub getfingerprint {
# here's a good a place as any to document the algorithm. it's not
# so much an algorithm as a philosophy, it's kind of too lame to be
# an algorithm. suggestions for improvement are very welcome.
# 1. read file.
# 2. standardize size by resampling to 160x160.
# 3. grayscale it. (reducing saturation seems faster than quantize.)
# 4. blur it a lot. (gets rid of noise. we're going down 10x more anyway)
# adding this nudges down false dupes about 10% and makes marginal
# dupes (e.g. big gamma difference) show up about 10% higher.
# 5. spread the intensity out as much as possible (normalize.)
# 6. make it as contrasty as possible (equalize.)
# this is for those real dark pictures that someone has slapped
# a pure white logo on. yes, i tested this thoroughly on pr0n!
# 7. resample again down to 16x16. I wanted to use a mosaic/pixelate
# kind of thing but hopefully imagemagick's resample function works
# roughly the same way.
# 8. reduce to 1bpp (threshold using defaults)
# 9. convert to pbm, er, um, raw mono
# 10. save out to database as hex string containing raw image data
# 11. when comparing, convert each file pair's thumbprints back to
# binary and xor them.
# 12. count the 1 bits in the result to approximate similarity.
my $image = shift;
my $file = shift;
my (@blobs, $img);
$SIG{SEGV} = \&handleSEGV;
my $result = eval {
my $error = $image->Read($file);
if ( $error ) {
warn "Could not read Image $file: '$error'\n";
return undef;
};
$#$image = 0;
$image->Sample("160x160!");
$image->Modulate(saturation=>-100);
$image->Blur(radius=>5);
$image->Normalize();
$image->Equalize();
$image->Sample("16x16");
$image->Threshold();
$image->Set(magick=>'mono');
@blobs = $image->ImageToBlob();
if(not defined ($blobs[0])) {
warn("got undefined blobs for $file\n");
} else {
# we used to discard the pbm header, but now we use raw mono
# so we'll discard all but the first 32 bytes
$img = substr($blobs[0],0,32);
}
};
$SIG{SEGV} = 'DEFAULT';
# free image but don't delete object.
undef @$image;
(defined $result) ? $img: undef;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment