Last active
June 19, 2021 14:16
-
-
Save putnamhill/8740809 to your computer and use it in GitHub Desktop.
Print groups of files that are duplicates.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl | |
use strict; | |
use warnings; | |
use Getopt::Long; | |
use Digest::MD5; | |
#use diagnostics; | |
# ---------------------------------------------------------------------------
# Main script: parse options, digest every candidate file via group_dupe(),
# then print each group of files that share an MD5 digest, either as plain
# text or (with --xml) as an XML document.
# ---------------------------------------------------------------------------

# A group is printed when it holds at least this many duplicates, i.e. files
# beyond the first one that share the digest. The default of 1 matches the
# usage text below.
my $minimum = 1;

# Output decorations. These are the plain-text defaults; --xml replaces them.
my $header       = '';
my $footer       = '';
my $group_open_a = 'duplicate files (md5: ';
my $group_open_b = ')';
my $group_close  = '';
my $file_open    = '';
my $file_close   = '';

# md5 hex digest => array ref of paths with that digest; filled by group_dupe().
my %md5_hash = ();

my $help;
my $xml;

my $usage = <<'EOT';
Usage: group-dupes.pl [options] file1 file2 ...
Print groups of files that are duplicates.
If no files are passed on the command line, files are read from stdin (tip: feed with find).
Anything that is not a regular file is ignored.
Options:
-m, --minimum minimum number of duplicates to print, default is 1
-x, --xml print as xml
-h, --help print this message
EOT

# GetOptions() returns false after warning about a bad option; stop there
# instead of carrying on with a half-understood command line.
GetOptions(
    'help|h'      => \$help,
    'minimum|m=i' => \$minimum,
    'xml|x'       => \$xml,
) or do {
    print STDERR $usage;
    exit 2;
};

if (defined $help) {
    print $usage;
    exit;
}

if (defined $xml) {
    $header       = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<duplicates>\n";
    $footer       = "</duplicates>\n";
    $group_open_a = '<group md5="';
    $group_open_b = '">';
    $group_close  = "</group>\n";
    $file_open    = '<file>';
    $file_close   = '</file>';
}

if (@ARGV) {
    # process every file on the command line
    for my $path (@ARGV) {
        group_dupe($path);
    }
}
else {
    # read one path per line from standard in if there's nothing on the command line
    while (my $line = <STDIN>) {
        chomp $line;
        group_dupe($line);
    }
}

print $header;

# Sort the digests so the report order is stable from run to run.
for my $md5 (sort keys %md5_hash) {
    my @files = @{ $md5_hash{$md5} };

    # $#files is the number of duplicates beyond the first file in the group.
    next if $#files < $minimum;

    print "$group_open_a$md5$group_open_b\n";
    for my $file (@files) {
        # File names may contain XML metacharacters; escape them in XML mode only.
        my $shown = defined $xml ? _xml_escape($file) : $file;
        print "\t$file_open$shown$file_close\n";
    }
    print $group_close;
}

print $footer;

# Return a copy of $text with the XML metacharacters & < > " replaced by
# their predefined entities, so it is safe as element content.
sub _xml_escape {
    my ($text) = @_;
    $text =~ s/&/&amp;/g;    # must run first so later entities are not re-escaped
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    $text =~ s/"/&quot;/g;
    return $text;
}
# Record $path in the file-level %md5_hash under the MD5 hex digest of the
# file's contents, so that files with identical contents end up in one list.
#
# Parameters:
#   $path - path of the file to examine.
#
# Anything that is not a regular file is silently ignored. Unreadable files
# and digest failures are reported on STDERR and skipped. Dies if a file that
# tested as readable cannot be opened.
sub group_dupe {
    my ($path) = @_;

    return unless -f $path;    # skip anything that's not a regular file

    unless (-r $path) {
        print STDERR "Can't read file: $path ... skipping\n";
        return;
    }

    # Lexical handle: it cannot clash with other code and is closed below.
    open(my $fh, '<', $path) or die "Can't open $path: $!";
    binmode($fh);

    # addfile() croaks on a read error, so trap that here in order to report
    # the file and carry on with the remaining ones.
    my $digest = eval { Digest::MD5->new->addfile($fh)->hexdigest };
    my $error  = $@;
    close($fh);

    # Test the digest itself; a path such as "0" is false but perfectly valid.
    if (defined $digest) {
        push(@{ $md5_hash{$digest} }, $path);
    }
    else {
        chomp $error;
        print STDERR "Can't make md5 digest of file $path: $error\n";
    }

    return;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment