milkersarac · August 23, 2013 12:41
diff --git a/findimagedupes.pl b/findimagedupes.pl
 #!/usr/bin/perl -w

 # findimagedupes (name to be changed later)
 # copyright 2001 rob kudla
 # licensed under the GNU Public License version 2.0 or later
 # 
 # strengths: recognizes similar pictures with 98% accuracy when
 #            the pictures actually have unique features; generates
 #            collection files for easy managing of dupes with
 #            gqview (hopefully pixie someday)
 # weaknesses: lots of false positives on contact sheets and
 #             shots of things like ocean horizons, which all 
 #             reduce to basically this:
 #
 #             1111111111111111
 #             1111111111111111
 #             1111111111111111
 #             1111111111111111
 #             1111111111111111
 #             1111111111111111
 #             1111111111111111
 #             1111111111111111
 #             0000000000000000
 #             0000000000000000
 #             0000000000000000
 #             0000000000000000
 #             0000000000000000
 #             0000000000000000
 #             0000000000000000
 #             0000000000000000; slows down geometrically with
 #             larger image collections (>18 hours for 25144
 #             images, ~15 minutes for 2500)
 #
 # i've tried moving to an 8x8x8 bit array (twice the size but allows
 # greyscale comparisons) and it actually causes more false positives,
 # probably because 8x8 pixels just provides too little detail.

 use Image::Magick;
 #use strict;
 use warnings;
 use File::Basename;

 my $VERSION = "0.1.3";
 my $prog = $0 . '';
 $prog = substr($prog,rindex($prog,'/') + 1) if rindex($prog,'/') >= 0;

 my $image;

 sub handleSEGV { die "caught segfault in getfingerprint()\n"; };

 # check args
 my %OPT;
 while (my $arg = shift) {
    if ($arg eq '-rescan') {
 	$OPT{'rescan'} = 1;
    } elsif ($arg eq '-update') {
 	$OPT{'update'} = 1;
    } elsif ($arg eq '-t') {
 	$OPT{'threshold'} = shift;
    } elsif ($arg eq '-f') {
 	$OPT{'dbfile'} = shift;
    } elsif ($arg eq '-?' || $arg eq '-h' || $arg eq '--help') {
 	$OPT{'help'} = 1;
    } elsif ($arg eq '-d') {
 	$OPT{'scandir'} = shift;
    } elsif ($arg eq '-v') {
 	$OPT{'viewpgm'} = shift;
    } elsif ($arg eq '-c') {
 	$OPT{'gqvfile'} = shift;
    } elsif ($arg eq '-p') {
 	$OPT{'printfp'} = 1;
    } elsif ($arg eq '-verbose') {
 	$OPT{'verbose'} = 1;
    } elsif ($arg eq '-include-hidden') {
 	$OPT{'include-hidden'} = 1;
    } elsif ($arg eq '-g') {
 	$OPT{'guimode'} = 1;
    } else { # assume it's a filename
 	if (!defined($OPT{'file1'})) {
 	    $OPT{'file1'} = $arg;
 	} else {
 	    $OPT{'file2'} = $arg;
 	}
    }
 }

 # slap user if only one filename specified
 $OPT{'help'} = 1 if (defined $OPT{'file1'} && !defined $OPT{'file2'});

 # print help message if needed

 if ($OPT{'help'}) {
    print "$prog - Copyright 2001 Rob Kudla - http://www.kudla.org/raindog
 This program is distributed under the terms of the GNU Public License;
 see the file COPYING for details.

 Usage: $prog [options] [<file1> <file2>]
 Options:
       -rescan         = rescan fingerprints of all files in directory
       -update         = like rescan, but do not read Files already in DB File
                         and erase Entries not found any longer in Filesystem
       -f <file>       = use <file> as image fingerprint database
       -d <dir>        = scan <dir> instead of current directory
       -t <num>        = use <num> as threshold% of similarity (default 90)
       -v <program>    = launch <program> (in bg) to view each set of dupes
       -c <file>       = create GQView collection <file>.gqv of duplicates
       <file1> <file2> = diff just those two files, using -v if present
                         (other options ignored if files are specified)
       -p              = only valid when files specified; prints the
                         hex of the actual fingerprint of each file.
       -g              = GUI mode: produce only machine-friendly output.
       -verbose        = Verbose Output\n";
    exit 0;
 }

 # set up defaults

 $OPT{'scandir'} = '.' unless defined $OPT{'scandir'};
 $OPT{'dbfile'} = 'imagedupes-db.txt' unless defined $OPT{'dbfile'};
 $OPT{'dbfile'} = "$OPT{'scandir'}/$OPT{'dbfile'}" unless $OPT{'dbfile'} =~ "/";
 $OPT{'threshold'} = 90 unless defined $OPT{'threshold'} && $OPT{'threshold'} > 0;
 $OPT{'curdir'} = `pwd`;
 chop $OPT{'curdir'};
 $OPT{'update'} ||=0;
 $OPT{'rescan'} ||=0;
 $OPT{'include-hidden'} ||= 0;

 # set up gqvfile if needed

 if (defined $OPT{'gqvfile'} && defined $OPT{'file1'}) {
    $OPT{'gqvfile'} = "$OPT{'gqvfile'}.gqv" if $OPT{'gqvfile'} !~ /\.gqv$/;
    open GQV, ">$OPT{'gqvfile'}";
    print GQV "#GQView collection\n#Created with $prog version $VERSION\n";
    close GQV;
 }

 # set up countbits array

 my @countbits_arr;
 for (my $i = 0; $i < 256; $i++) {
    $countbits_arr[$i] = _countbits (chr($i));
 }

 # get columns if we can

 my $cols;
 if (!defined $OPT{'guimode'}) {
    $cols = `tput cols`;
    $cols += 0;
    $cols = 80 if $cols == 0;
 }

 # scan files in if the user wants or there's no database now

 if ( $OPT{'rescan'}>0 || $OPT{'update'}>0 || 
     ( (!-e $OPT{'dbfile'}) || (-s $OPT{'dbfile'} == 0)) && 
     !defined $OPT{'file1'} ) {

    print "Scanning fingerprints from $OPT{'scandir'} into $OPT{'dbfile'}.\n"
 	if !(defined $OPT{'guimode'});

    # get whole tree
    my $list = `find $OPT{'scandir'} -type f | sort`;
    # TODO: We should use File::Find here

    # get imagemagick object
    $image = Image::Magick->new;

    my $existing_summs={};
    # create dbfile
    if ( $OPT{'update'} ) {
 	open IMGFP, "<$OPT{'dbfile'}";
 	open IMGFPO, ">$OPT{'dbfile'}.sik";
 	while (my $line = <IMGFP> ) {
 	    my ($file,$sum)=split(":",$line);
 	    $existing_summs->{$file}=$sum;
 	    if ( -s $file ) {
 		print IMGFPO $line;
 	    } else {
 		print "Ignoring $line";
 	    }
 	}
 	close IMGFP;
 	close IMGFPO;
 	unlink "$OPT{'dbfile'}" if -e "$OPT{'dbfile'}";
 	rename "$OPT{'dbfile'}.sik","$OPT{'dbfile'}";
 	open IMGFP, ">>$OPT{'dbfile'}";
    } else {
 	open IMGFP, ">$OPT{'dbfile'}";
    }

    # put tree into array
    my @list;
    if ( $OPT{'include-hidden'} ) {
 	@list = split "\n", $list;
    } else {
 	@list = 
 	    grep { $_ !~ m,/\., }
 	    split "\n", $list;
    }

    # max value for our lame little statusbar
    my $numfiles = $#list + 1;

    # traverse the array.
    foreach $file (@list) {

 	$curfile++;
 	next if $existing_summs->{$file};

 	# erase current screen line
 	system("tput el") if (!defined $OPT{'guimode'}); 

 	# build lame little status bar
 	if (defined $OPT{'guimode'}) {
 	    $outputline = "Status::" . sprintf("%04d", $curfile) . "::" . 
 		sprintf("%04d", $numfiles) . "::" . 
 		    sprintf("%03.2f", ($curfile/$numfiles) * 100);
 	    print "$outputline\n";
 	} else {
 	    $outputline = "[" . sprintf("%04d", $curfile) . "/" . sprintf("%04d", $numfiles) 
 		. "] " . sprintf("%2.0f%%", ($curfile/$numfiles) * 100)
 		. statusbar($curfile, $numfiles) . "100% ";
 	    $outputline .= substr($file, 0, $cols - length($outputline) - 1);
 	    print "$outputline\n";
 	}

 	# move cursor back up a line
 	# I couldn't just use \r - it only updated like every 15 iterations
 	# apparently the linux console only refreshes on a \n
 	system("tput cuu1") unless defined $OPT{'guimode'} || defined $OPT{'verbose'}; 

 	# check what file thinks the file is
 	$format = '';
 	$filetype = `file "$file"`;

 #	# check for file types imagemagick is stupid about, which is
 #	# basically anything but a bitmap image
 #
 #  	unless ($file =~ /\.txt$/i || $file =~ /\.html$/i ||
 #  		$filetype =~ /zip/i || $filetype =~ /mp3/i ||
 #  		$filetype =~ /link/i || $filetype =~ /rpm/i ||
 #  		$filetype =~ /execut/i || $filetype =~ /socket/i ||
 #  		$filetype =~ /pipe/i || $filetype =~ /postscript/i ||
 #  		$filetype =~ /pdf/i || $filetype =~ /mpeg/i ||
 #  		$file =~ /\.man$/i ||
 #  		$filetype =~ /text/i || $file =~ /\.htm$/i ) {

 	# or we could just assume the user has a good magic file.
 	# Of course imagemagick will still crap out on some animated GIFs.

 	# imagedups-db.txt =~ /image/ ..., so we want to avoid that.
 	if ($filetype =~ /\:.*image/i || $filetype =~ /\:.*bitmap/i) {

 	    # Adjusted script to work with both old and new perlmagick
 	    # Old perlmagick returns a CSV string, new returns an array

 		my @pingstring = $image->Ping($file);
 		if(@pingstring) {
 		    if ($pingstring[0] =~ /,/) {
 			@pingstring = split (',', $pingstring[0]);
 		    }
                    $format = $pingstring[3];
 		} # else leave $format == ''
 	}
 	
 	# oh yeah, and just in case a text file slips through (crash!!)
 	if ($format ne '' && $format ne 'TXT') {

 	    $img = &getfingerprint($image, $file);

 	    # quote percents and colons in our db file.
 	    $filename = $file;
 	    $filename =~ s/\%/\%25/g;
 	    $filename =~ s/\:/\%3A/g;

 	    # only save if the image made a valid pbm.
 	    if (defined($img) && length($img) > 0) {
 		print IMGFP "$filename:";
 		for (my $i = 0; $i < length($img); $i++) {
 		    # convert each byte of pbm to a hex pair.
 		    print IMGFP sprintf("%02x", ord(substr($img,$i,1)));
 		}
 		print IMGFP "\n";
 	  } elsif (!defined($img)) {
 		warn "warning: unable to get fingerprint of $file\n";
 	  }

 	}
    }

    close IMGFP;
    print "\n" if !(defined $OPT{'guimode'});

 }

 # find dupes

 if (defined $OPT{'file1'}) {

 # do file1 and file2

    $image = Image::Magick->new;

    my $fp1 = getfingerprint($image, $OPT{'file1'})
 	or die "fatal: unable to get fingerprint of $OPT{'file1'}\n";
    my $fp2 = getfingerprint($image, $OPT{'file2'})
 	or die "fatal: unable to get fingerprint of $OPT{'file2'}\n";
    # xor the two binary strings to find differences
    $fpdiff = $fp1 ^ $fp2;

    # print fingerprints if -p specified.
    if ($OPT{'printfp'}) {
 	my $i;
 	print "$OPT{'file1'}:";
 	for (my $i = 0; $i < length($fp1); $i++) {
 	    # convert each byte of pbm to a hex pair.
 	    print sprintf("%02x", ord(substr($fp1,$i,1)));
 	}
 	print "\n";
 	print "$OPT{'file2'}:";
 	for (my $i = 0; $i < length($fp2); $i++) {
 	    # convert each byte of pbm to a hex pair.
 	    print sprintf("%02x", ord(substr($fp2,$i,1)));
 	}
 	print "\n";

 	print "Difference:";
 	for (my $i = 0; $i < length($fpdiff); $i++) {
 	    # convert each byte of pbm to a hex pair.
 	    print sprintf("%02x", ord(substr($fpdiff,$i,1)));
 	}
 	print "\n";

    }

    # how many bits are different?  number and %
    $diffbits = countbits($fpdiff);
    $diffpct = sprintf("%0.2f",(1-($diffbits/256))*100);
    if (defined $OPT{'guimode'}) {
 	print "Dupe::$OPT{'file1'}::$OPT{'file2'}::$diffpct\n";
    } else {
 	print "$OPT{'file1'} $OPT{'file2'}: seem to be $diffpct\% similar.\n";
    }

    # launch the viewer if the user wanted us to
    if (defined($OPT{'viewpgm'})) {
 	system("$OPT{'viewpgm'} $key &");
 	system("$OPT{'viewpgm'} $keys[$j] &");
 	print "Press enter when done viewing. " if !(defined $OPT{'guimode'});
 	<STDIN>;
    }


 } else {

 # do whole tree

    open IMGFP, "<$OPT{'dbfile'}";
    print "Finding duplicates in $OPT{'scandir'}, threshold $OPT{'threshold'}%.\n" if !(defined $OPT{'guimode'});

 # load db into hash
    while ($line = <IMGFP>) {
 	chop $line;
 	($key,$fp) = split(":",$line);

 	# remember, : and % are escaped
 	$key =~ s/\%3A/\:/g;
 	$key =~ s/\%25/\%/g;

 	$PFP{$key} = pack("H*", $fp);
    }

    @keys = keys %PFP;

    my $i = 0;

    my $bits_that_can_differ = 256 * (1 - $OPT{'threshold'} / 100 );

 # traverse the hash
    foreach $key (@keys) { 
 	# generate lame little status bar
 	if (defined $OPT{'guimode'}) {
 	    $outputline = "Status::" . sprintf("%04d", $i) . "::" . 
 		sprintf("%04d", $#keys) . "::" . 
 		    sprintf("%03.2f", ($i/$#keys) * 100);
 	    print "$outputline\n";
 	} else {
 	    print "[" . sprintf("%04d", $i) . "/" . sprintf("%04d", $#keys) . 
 		"] 0%" . statusbar($i, $#keys) . "100%\n";
 	}

 	# move the cursor up a line, see -rescan section
 	system("tput cuu1") if (!defined $OPT{'guimode'});

 	# check remainder of hash for close matches
 	for ( $j = $i + 1; $j <= $#keys; $j++) {
 	    # read pbm data for both entries and unhex
 	    my $fp1 = $PFP{$key};
 	    my $fp2 = $PFP{$keys[$j]};

 	    # xor the two binary strings to find differences
 	    $fpdiff = $fp1 ^ $fp2;

 	    # how many bits are different?  number and %
 	    $diffbits = countbits($fpdiff);

 	    if ($diffbits <= $bits_that_can_differ) {
 		$diffpct = sprintf("%0.2f",(1-($diffbits/256))*100);

 		# blank line, we're going to tell the user something
 		if (defined $OPT{'guimode'}) {
 		    print "Dupe:\:$key:\:$keys[$j]:\:$diffpct\n";
 		} else {
 		    system("tput el");
 		    print "$key $keys[$j]: seem to be $diffpct\% similar.\n";
 		}

 		# originally this was a log, now it writes out the gqvfile
 		&difflog($key) if not defined($ALREADYDIFF{$key});
 		&difflog($keys[$j]) if not defined($ALREADYDIFF{$keys[$j]});

 		# and makes sure to only write out each file once
 		$ALREADYDIFF{$key} = 1;
 		$ALREADYDIFF{$keys[$j]} = 1;

 		# launch the viewer if the user wanted us to
 		if (defined($OPT{'viewpgm'})) {
 		    system("$OPT{'viewpgm'} $key &");
 		    system("$OPT{'viewpgm'} $keys[$j] &");
 		    print "Press enter when done viewing. ";
 		    <STDIN>;
 		}
 	    }
 	}

 	$i++;
    }

 # write out end of gqvfile, dunno if it's required but GQView does it
    if (defined $OPT{'gqvfile'}) {
 	open GQV, ">>$OPT{'gqvfile'}";
 	print GQV "#end\n";
 	close GQV;
    }

 # close db file
    close IMGFP;	
    print "\n";
 }

 undef $image;

 sub base2 {

    # base2: converts binary string to list of 1's and 0's not unlike 
    #        pbm used to provide in text mode

    my $inval = shift;
    my $outval;
    for (my $i = 0; $i < length($inval); $i++) {
 	for (my $j = 7; $j >= 0; $j--) {
 	    $outval .= (ord(substr($inval,$i,1)) and (2 ** $j) ? 1 : 0);
 	}
    }
    $outval;
 }

 sub _countbits {

    # countbits: counts the 1 bits in a binary string (doesn't use base2)

    my $inval = shift;
    my $outval = 0;    

    for (my $i = 0; $i < length($inval); $i++) {
 	for (my $j = 7; $j >= 0; $j--) {
 	    my $bit = (ord(substr($inval,$i,1)) & (2 ** $j) ? 1 : 0);
 	    $outval += $bit;
 	}
    }
    $outval;
 }

 sub countbits {
  my $inval = shift;
  my $outval = 0;

  for (my $i = 0; $i < length($inval); $i++) {
 	$outval += $countbits_arr[ord(substr($inval,$i,1))];
  }
  $outval;
 }

 sub statusbar {

    # statusbar: prints 0 to 50 dots based on $cur/$fin (arg0/arg1)

    my $cur = shift;
    my $fin = shift;
    my $dots = int(($cur/$fin)*50);
    my $blks = 50 - $dots;
    my $outline = ("." x ($dots)) . (" " x $blks);
    $outline;
 }

 sub debuglog {

    # debuglog: writes to debug log.

    my $arg = shift;
    open DEBUGLOG, ">>findimagedupes-debug.txt";
    print DEBUGLOG "$arg\n";
    close DEBUGLOG;
 }

 sub difflog {

    # difflog: used to be a debug thing, now handles the gqvfile output.

    return undef if not defined($OPT{'gqvfile'});
    my $arg = shift;
    $arg =~ s/^\.\//$OPT{'curdir'}\//;
    open DIFFLOG, ">>$OPT{'gqvfile'}";
    print DIFFLOG qq^"$arg"\n^;
    close DIFFLOG;
 }

 sub getfingerprint {

 #  here's a good a place as any to document the algorithm.  it's not
 #  so much an algorithm as a philosophy, it's kind of too lame to be
 #  an algorithm.  suggestions for improvement are very welcome.

 #  1. read file.
 #  2. standardize size by resampling to 160x160.
 #  3. grayscale it. (reducing saturation seems faster than quantize.)
 #  4. blur it a lot. (gets rid of noise.  we're going down 10x more anyway)
 #     adding this nudges down false dupes about 10% and makes marginal
 #     dupes (e.g. big gamma difference) show up about 10% higher.
 #  5. spread the intensity out as much as possible (normalize.)
 #  6. make it as contrasty as possible (equalize.)
 #     this is for those real dark pictures that someone has slapped
 #     a pure white logo on.  yes, i tested this thoroughly on pr0n!
 #  7. resample again down to 16x16.  I wanted to use a mosaic/pixelate
 #     kind of thing but hopefully imagemagick's resample function works
 #     roughly the same way.
 #  8. reduce to 1bpp (threshold using defaults)
 #  9. convert to pbm, er, um, raw mono
 #  10. save out to database as hex string containing raw image data
 #  11. when comparing, convert each file pair's thumbprints back to
 #      binary and xor them.
 #  12. count the 1 bits in the result to approximate similarity.

    my $image = shift;
    my $file = shift;
    my (@blobs, $img);

    $SIG{SEGV} = \&handleSEGV;
    my $result = eval {
 	my $error = $image->Read($file);
 	if ( $error ) {
 	    warn "Could not read Image $file: '$error'\n";
 	    return undef;
 	};

    $#$image = 0;
    $image->Sample("160x160!");
    $image->Modulate(saturation=>-100);
    $image->Blur(radius=>5);
    $image->Normalize();
    $image->Equalize();
    $image->Sample("16x16");
    $image->Threshold();
    $image->Set(magick=>'mono');
    @blobs = $image->ImageToBlob();
    if(not defined ($blobs[0])) {
 	warn("got undefined blobs for $file\n");
    } else {
 	# we used to discard the pbm header, but now we use raw mono
 	# so we'll discard all but the first 32 bytes
 	$img = substr($blobs[0],0,32);
    }
    };
    $SIG{SEGV} = 'DEFAULT';

    # free image but don't delete object.
    undef @$image;

    (defined $result) ? $img: undef;

 }
	#!/usr/bin/perl -w

	# findimagedupes (name to be changed later)
	# copyright 2001 rob kudla
	# licensed under the GNU Public License version 2.0 or later
	#
	# strengths: recognizes similar pictures with 98% accuracy when
	# the pictures actually have unique features; generates
	# collection files for easy managing of dupes with
	# gqview (hopefully pixie someday)
	# weaknesses: lots of false positives on contact sheets and
	# shots of things like ocean horizons, which all
	# reduce to basically this:
	#
	# 1111111111111111
	# 1111111111111111
	# 1111111111111111
	# 1111111111111111
	# 1111111111111111
	# 1111111111111111
	# 1111111111111111
	# 1111111111111111
	# 0000000000000000
	# 0000000000000000
	# 0000000000000000
	# 0000000000000000
	# 0000000000000000
	# 0000000000000000
	# 0000000000000000
	# 0000000000000000; slows down geometrically with
	# larger image collections (>18 hours for 25144
	# images, ~15 minutes for 2500)
	#
	# i've tried moving to an 8x8x8 bit array (twice the size but allows
	# greyscale comparisons) and it actually causes more false positives,
	# probably because 8x8 pixels just provides too little detail.

	use Image::Magick;
	#use strict;
	use warnings;
	use File::Basename;

	my $VERSION = "0.1.3";
	my $prog = $0 . '';
	$prog = substr($prog,rindex($prog,'/') + 1) if rindex($prog,'/') >= 0;

	my $image;

	sub handleSEGV { die "caught segfault in getfingerprint()\n"; };

	# check args
	my %OPT;
	while (my $arg = shift) {
	if ($arg eq '-rescan') {
	$OPT{'rescan'} = 1;
	} elsif ($arg eq '-update') {
	$OPT{'update'} = 1;
	} elsif ($arg eq '-t') {
	$OPT{'threshold'} = shift;
	} elsif ($arg eq '-f') {
	$OPT{'dbfile'} = shift;
	} elsif ($arg eq '-?' \|\| $arg eq '-h' \|\| $arg eq '--help') {
	$OPT{'help'} = 1;
	} elsif ($arg eq '-d') {
	$OPT{'scandir'} = shift;
	} elsif ($arg eq '-v') {
	$OPT{'viewpgm'} = shift;
	} elsif ($arg eq '-c') {
	$OPT{'gqvfile'} = shift;
	} elsif ($arg eq '-p') {
	$OPT{'printfp'} = 1;
	} elsif ($arg eq '-verbose') {
	$OPT{'verbose'} = 1;
	} elsif ($arg eq '-include-hidden') {
	$OPT{'include-hidden'} = 1;
	} elsif ($arg eq '-g') {
	$OPT{'guimode'} = 1;
	} else { # assume it's a filename
	if (!defined($OPT{'file1'})) {
	$OPT{'file1'} = $arg;
	} else {
	$OPT{'file2'} = $arg;
	}
	}
	}

	# slap user if only one filename specified
	$OPT{'help'} = 1 if (defined $OPT{'file1'} && !defined $OPT{'file2'});

	# print help message if needed

	if ($OPT{'help'}) {
	print "$prog - Copyright 2001 Rob Kudla - http://www.kudla.org/raindog
	This program is distributed under the terms of the GNU Public License;
	see the file COPYING for details.

	Usage: $prog [options] [<file1> <file2>]
	Options:
	-rescan = rescan fingerprints of all files in directory
	-update = like rescan, but do not read Files already in DB File
	and erase Entries not found any longer in Filesystem
	-f <file> = use <file> as image fingerprint database
	-d <dir> = scan <dir> instead of current directory
	-t <num> = use <num> as threshold% of similarity (default 90)
	-v <program> = launch <program> (in bg) to view each set of dupes
	-c <file> = create GQView collection <file>.gqv of duplicates
	<file1> <file2> = diff just those two files, using -v if present
	(other options ignored if files are specified)
	-p = only valid when files specified; prints the
	hex of the actual fingerprint of each file.
	-g = GUI mode: produce only machine-friendly output.
	-verbose = Verbose Output\n";
	exit 0;
	}

	# set up defaults

	$OPT{'scandir'} = '.' unless defined $OPT{'scandir'};
	$OPT{'dbfile'} = 'imagedupes-db.txt' unless defined $OPT{'dbfile'};
	$OPT{'dbfile'} = "$OPT{'scandir'}/$OPT{'dbfile'}" unless $OPT{'dbfile'} =~ "/";
	$OPT{'threshold'} = 90 unless defined $OPT{'threshold'} && $OPT{'threshold'} > 0;
	$OPT{'curdir'} = `pwd`;
	chop $OPT{'curdir'};
	$OPT{'update'} \|\|=0;
	$OPT{'rescan'} \|\|=0;
	$OPT{'include-hidden'} \|\|= 0;

	# set up gqvfile if needed

	if (defined $OPT{'gqvfile'} && defined $OPT{'file1'}) {
	$OPT{'gqvfile'} = "$OPT{'gqvfile'}.gqv" if $OPT{'gqvfile'} !~ /\.gqv$/;
	open GQV, ">$OPT{'gqvfile'}";
	print GQV "#GQView collection\n#Created with $prog version $VERSION\n";
	close GQV;
	}

	# set up countbits array

	my @countbits_arr;
	for (my $i = 0; $i < 256; $i++) {
	$countbits_arr[$i] = _countbits (chr($i));
	}

	# get columns if we can

	my $cols;
	if (!defined $OPT{'guimode'}) {
	$cols = `tput cols`;
	$cols += 0;
	$cols = 80 if $cols == 0;
	}

	# scan files in if the user wants or there's no database now

	if ( $OPT{'rescan'}>0 \|\| $OPT{'update'}>0 \|\|
	( (!-e $OPT{'dbfile'}) \|\| (-s $OPT{'dbfile'} == 0)) &&
	!defined $OPT{'file1'} ) {

	print "Scanning fingerprints from $OPT{'scandir'} into $OPT{'dbfile'}.\n"
	if !(defined $OPT{'guimode'});

	# get whole tree
	my $list = `find $OPT{'scandir'} -type f \| sort`;
	# TODO: We should use File::Find here

	# get imagemagick object
	$image = Image::Magick->new;

	my $existing_summs={};
	# create dbfile
	if ( $OPT{'update'} ) {
	open IMGFP, "<$OPT{'dbfile'}";
	open IMGFPO, ">$OPT{'dbfile'}.sik";
	while (my $line = <IMGFP> ) {
	my ($file,$sum)=split(":",$line);
	$existing_summs->{$file}=$sum;
	if ( -s $file ) {
	print IMGFPO $line;
	} else {
	print "Ignoring $line";
	}
	}
	close IMGFP;
	close IMGFPO;
	unlink "$OPT{'dbfile'}" if -e "$OPT{'dbfile'}";
	rename "$OPT{'dbfile'}.sik","$OPT{'dbfile'}";
	open IMGFP, ">>$OPT{'dbfile'}";
	} else {
	open IMGFP, ">$OPT{'dbfile'}";
	}

	# put tree into array
	my @list;
	if ( $OPT{'include-hidden'} ) {
	@list = split "\n", $list;
	} else {
	@list =
	grep { $_ !~ m,/\., }
	split "\n", $list;
	}

	# max value for our lame little statusbar
	my $numfiles = $#list + 1;

	# traverse the array.
	foreach $file (@list) {

	$curfile++;
	next if $existing_summs->{$file};

	# erase current screen line
	system("tput el") if (!defined $OPT{'guimode'});

	# build lame little status bar
	if (defined $OPT{'guimode'}) {
	$outputline = "Status::" . sprintf("%04d", $curfile) . "::" .
	sprintf("%04d", $numfiles) . "::" .
	sprintf("%03.2f", ($curfile/$numfiles) * 100);
	print "$outputline\n";
	} else {
	$outputline = "[" . sprintf("%04d", $curfile) . "/" . sprintf("%04d", $numfiles)
	. "] " . sprintf("%2.0f%%", ($curfile/$numfiles) * 100)
	. statusbar($curfile, $numfiles) . "100% ";
	$outputline .= substr($file, 0, $cols - length($outputline) - 1);
	print "$outputline\n";
	}

	# move cursor back up a line
	# I couldn't just use \r - it only updated like every 15 iterations
	# apparently the linux console only refreshes on a \n
	system("tput cuu1") unless defined $OPT{'guimode'} \|\| defined $OPT{'verbose'};

	# check what file thinks the file is
	$format = '';
	$filetype = `file "$file"`;

	# # check for file types imagemagick is stupid about, which is
	# # basically anything but a bitmap image
	#
	# unless ($file =~ /\.txt$/i \|\| $file =~ /\.html$/i \|\|
	# $filetype =~ /zip/i \|\| $filetype =~ /mp3/i \|\|
	# $filetype =~ /link/i \|\| $filetype =~ /rpm/i \|\|
	# $filetype =~ /execut/i \|\| $filetype =~ /socket/i \|\|
	# $filetype =~ /pipe/i \|\| $filetype =~ /postscript/i \|\|
	# $filetype =~ /pdf/i \|\| $filetype =~ /mpeg/i \|\|
	# $file =~ /\.man$/i \|\|
	# $filetype =~ /text/i \|\| $file =~ /\.htm$/i ) {

	# or we could just assume the user has a good magic file.
	# Of course imagemagick will still crap out on some animated GIFs.

	# imagedups-db.txt =~ /image/ ..., so we want to avoid that.
	if ($filetype =~ /\:.image/i \|\| $filetype =~ /\:.bitmap/i) {

	# Adjusted script to work with both old and new perlmagick
	# Old perlmagick returns a CSV string, new returns an array

	my @pingstring = $image->Ping($file);
	if(@pingstring) {
	if ($pingstring[0] =~ /,/) {
	@pingstring = split (',', $pingstring[0]);
	}
	$format = $pingstring[3];
	} # else leave $format == ''
	}

	# oh yeah, and just in case a text file slips through (crash!!)
	if ($format ne '' && $format ne 'TXT') {

	$img = &getfingerprint($image, $file);

	# quote percents and colons in our db file.
	$filename = $file;
	$filename =~ s/\%/\%25/g;
	$filename =~ s/\:/\%3A/g;

	# only save if the image made a valid pbm.
	if (defined($img) && length($img) > 0) {
	print IMGFP "$filename:";
	for (my $i = 0; $i < length($img); $i++) {
	# convert each byte of pbm to a hex pair.
	print IMGFP sprintf("%02x", ord(substr($img,$i,1)));
	}
	print IMGFP "\n";
	} elsif (!defined($img)) {
	warn "warning: unable to get fingerprint of $file\n";
	}

	}
	}

	close IMGFP;
	print "\n" if !(defined $OPT{'guimode'});

	}

	# find dupes

	if (defined $OPT{'file1'}) {

	# do file1 and file2

	$image = Image::Magick->new;

	my $fp1 = getfingerprint($image, $OPT{'file1'})
	or die "fatal: unable to get fingerprint of $OPT{'file1'}\n";
	my $fp2 = getfingerprint($image, $OPT{'file2'})
	or die "fatal: unable to get fingerprint of $OPT{'file2'}\n";
	# xor the two binary strings to find differences
	$fpdiff = $fp1 ^ $fp2;

	# print fingerprints if -p specified.
	if ($OPT{'printfp'}) {
	my $i;
	print "$OPT{'file1'}:";
	for (my $i = 0; $i < length($fp1); $i++) {
	# convert each byte of pbm to a hex pair.
	print sprintf("%02x", ord(substr($fp1,$i,1)));
	}
	print "\n";
	print "$OPT{'file2'}:";
	for (my $i = 0; $i < length($fp2); $i++) {
	# convert each byte of pbm to a hex pair.
	print sprintf("%02x", ord(substr($fp2,$i,1)));
	}
	print "\n";

	print "Difference:";
	for (my $i = 0; $i < length($fpdiff); $i++) {
	# convert each byte of pbm to a hex pair.
	print sprintf("%02x", ord(substr($fpdiff,$i,1)));
	}
	print "\n";

	}

	# how many bits are different? number and %
	$diffbits = countbits($fpdiff);
	$diffpct = sprintf("%0.2f",(1-($diffbits/256))*100);
	if (defined $OPT{'guimode'}) {
	print "Dupe::$OPT{'file1'}::$OPT{'file2'}::$diffpct\n";
	} else {
	print "$OPT{'file1'} $OPT{'file2'}: seem to be $diffpct\% similar.\n";
	}

	# launch the viewer if the user wanted us to
	if (defined($OPT{'viewpgm'})) {
	system("$OPT{'viewpgm'} $key &");
	system("$OPT{'viewpgm'} $keys[$j] &");
	print "Press enter when done viewing. " if !(defined $OPT{'guimode'});
	<STDIN>;
	}


	} else {

	# do whole tree

	open IMGFP, "<$OPT{'dbfile'}";
	print "Finding duplicates in $OPT{'scandir'}, threshold $OPT{'threshold'}%.\n" if !(defined $OPT{'guimode'});

	# load db into hash
	while ($line = <IMGFP>) {
	chop $line;
	($key,$fp) = split(":",$line);

	# remember, : and % are escaped
	$key =~ s/\%3A/\:/g;
	$key =~ s/\%25/\%/g;

	$PFP{$key} = pack("H*", $fp);
	}

	@keys = keys %PFP;

	my $i = 0;

	my $bits_that_can_differ = 256 * (1 - $OPT{'threshold'} / 100 );

	# traverse the hash
	foreach $key (@keys) {
	# generate lame little status bar
	if (defined $OPT{'guimode'}) {
	$outputline = "Status::" . sprintf("%04d", $i) . "::" .
	sprintf("%04d", $#keys) . "::" .
	sprintf("%03.2f", ($i/$#keys) * 100);
	print "$outputline\n";
	} else {
	print "[" . sprintf("%04d", $i) . "/" . sprintf("%04d", $#keys) .
	"] 0%" . statusbar($i, $#keys) . "100%\n";
	}

	# move the cursor up a line, see -rescan section
	system("tput cuu1") if (!defined $OPT{'guimode'});

	# check remainder of hash for close matches
	for ( $j = $i + 1; $j <= $#keys; $j++) {
	# read pbm data for both entries and unhex
	my $fp1 = $PFP{$key};
	my $fp2 = $PFP{$keys[$j]};

	# xor the two binary strings to find differences
	$fpdiff = $fp1 ^ $fp2;

	# how many bits are different? number and %
	$diffbits = countbits($fpdiff);

	if ($diffbits <= $bits_that_can_differ) {
	$diffpct = sprintf("%0.2f",(1-($diffbits/256))*100);

	# blank line, we're going to tell the user something
	if (defined $OPT{'guimode'}) {
	print "Dupe:\:$key:\:$keys[$j]:\:$diffpct\n";
	} else {
	system("tput el");
	print "$key $keys[$j]: seem to be $diffpct\% similar.\n";
	}

	# originally this was a log, now it writes out the gqvfile
	&difflog($key) if not defined($ALREADYDIFF{$key});
	&difflog($keys[$j]) if not defined($ALREADYDIFF{$keys[$j]});

	# and makes sure to only write out each file once
	$ALREADYDIFF{$key} = 1;
	$ALREADYDIFF{$keys[$j]} = 1;

	# launch the viewer if the user wanted us to
	if (defined($OPT{'viewpgm'})) {
	system("$OPT{'viewpgm'} $key &");
	system("$OPT{'viewpgm'} $keys[$j] &");
	print "Press enter when done viewing. ";
	<STDIN>;
	}
	}
	}

	$i++;
	}

	# write out end of gqvfile, dunno if it's required but GQView does it
	if (defined $OPT{'gqvfile'}) {
	open GQV, ">>$OPT{'gqvfile'}";
	print GQV "#end\n";
	close GQV;
	}

	# close db file
	close IMGFP;
	print "\n";
	}

	undef $image;

	sub base2 {

	# base2: converts binary string to list of 1's and 0's not unlike
	# pbm used to provide in text mode

	my $inval = shift;
	my $outval;
	for (my $i = 0; $i < length($inval); $i++) {
	for (my $j = 7; $j >= 0; $j--) {
	$outval .= (ord(substr($inval,$i,1)) and (2 ** $j) ? 1 : 0);
	}
	}
	$outval;
	}

	sub _countbits {

	# countbits: counts the 1 bits in a binary string (doesn't use base2)

	my $inval = shift;
	my $outval = 0;

	for (my $i = 0; $i < length($inval); $i++) {
	for (my $j = 7; $j >= 0; $j--) {
	my $bit = (ord(substr($inval,$i,1)) & (2 ** $j) ? 1 : 0);
	$outval += $bit;
	}
	}
	$outval;
	}

	sub countbits {
	my $inval = shift;
	my $outval = 0;

	for (my $i = 0; $i < length($inval); $i++) {
	$outval += $countbits_arr[ord(substr($inval,$i,1))];
	}
	$outval;
	}

	sub statusbar {

	# statusbar: prints 0 to 50 dots based on $cur/$fin (arg0/arg1)

	my $cur = shift;
	my $fin = shift;
	my $dots = int(($cur/$fin)*50);
	my $blks = 50 - $dots;
	my $outline = ("." x ($dots)) . (" " x $blks);
	$outline;
	}

	sub debuglog {

	# debuglog: writes to debug log.

	my $arg = shift;
	open DEBUGLOG, ">>findimagedupes-debug.txt";
	print DEBUGLOG "$arg\n";
	close DEBUGLOG;
	}

	sub difflog {

	# difflog: used to be a debug thing, now handles the gqvfile output.

	return undef if not defined($OPT{'gqvfile'});
	my $arg = shift;
	$arg =~ s/^\.\//$OPT{'curdir'}\//;
	open DIFFLOG, ">>$OPT{'gqvfile'}";
	print DIFFLOG qq^"$arg"\n^;
	close DIFFLOG;
	}

	sub getfingerprint {

	# here's a good a place as any to document the algorithm. it's not
	# so much an algorithm as a philosophy, it's kind of too lame to be
	# an algorithm. suggestions for improvement are very welcome.

	# 1. read file.
	# 2. standardize size by resampling to 160x160.
	# 3. grayscale it. (reducing saturation seems faster than quantize.)
	# 4. blur it a lot. (gets rid of noise. we're going down 10x more anyway)
	# adding this nudges down false dupes about 10% and makes marginal
	# dupes (e.g. big gamma difference) show up about 10% higher.
	# 5. spread the intensity out as much as possible (normalize.)
	# 6. make it as contrasty as possible (equalize.)
	# this is for those real dark pictures that someone has slapped
	# a pure white logo on. yes, i tested this thoroughly on pr0n!
	# 7. resample again down to 16x16. I wanted to use a mosaic/pixelate
	# kind of thing but hopefully imagemagick's resample function works
	# roughly the same way.
	# 8. reduce to 1bpp (threshold using defaults)
	# 9. convert to pbm, er, um, raw mono
	# 10. save out to database as hex string containing raw image data
	# 11. when comparing, convert each file pair's thumbprints back to
	# binary and xor them.
	# 12. count the 1 bits in the result to approximate similarity.

	my $image = shift;
	my $file = shift;
	my (@blobs, $img);

	$SIG{SEGV} = \&handleSEGV;
	my $result = eval {
	my $error = $image->Read($file);
	if ( $error ) {
	warn "Could not read Image $file: '$error'\n";
	return undef;
	};

	$#$image = 0;
	$image->Sample("160x160!");
	$image->Modulate(saturation=>-100);
	$image->Blur(radius=>5);
	$image->Normalize();
	$image->Equalize();
	$image->Sample("16x16");
	$image->Threshold();
	$image->Set(magick=>'mono');
	@blobs = $image->ImageToBlob();
	if(not defined ($blobs[0])) {
	warn("got undefined blobs for $file\n");
	} else {
	# we used to discard the pbm header, but now we use raw mono
	# so we'll discard all but the first 32 bytes
	$img = substr($blobs[0],0,32);
	}
	};
	$SIG{SEGV} = 'DEFAULT';

	# free image but don't delete object.
	undef @$image;

	(defined $result) ? $img: undef;

	}