Skip to content

Instantly share code, notes, and snippets.

@goerz
Created August 1, 2010 18:45
Show Gist options
  • Save goerz/503626 to your computer and use it in GitHub Desktop.
Save goerz/503626 to your computer and use it in GitHub Desktop.
deletedoubles.pl: Find duplicate files (same size and same MD5 checksum) and interactively delete them.
#!/usr/bin/perl -w
use strict;
use Digest::MD5 ();

# Candidate files grouped by size: size in bytes => [ paths ].
# Filled by get_files(); only files of equal size can be duplicates.
my %files;
# Regular expressions; a path matching any of them is skipped by get_files().
my @ignorepatterns = ();
# Menu entry used when the user just presses return (0 means "keep all").
my $defaultselection = 0;
# The string 'true' makes get_files() descend into directories.
my $recursive = 'true';

# The progress dots are printed without a newline, so flush after every print.
$| = 1;

print "Please wait while we're collecting files";
get_files(\@ARGV);
print "\n";

foreach my $filelist (values(%files)){
    # A size that occurs only once cannot have a duplicate.
    next unless @{$filelist} > 1;

    # Group the equally sized files by checksum: md5 hex digest => [ paths ].
    my %md5files = ();
    foreach my $file (@{$filelist}){
        next unless -f $file;
        print ".";
        my $md5 = md5_of_file($file);
        next unless defined $md5;
        push(@{$md5files{$md5}}, $file);
    }

    # Ask only after the whole size group has been hashed, so that every set
    # of duplicates is presented exactly once and in full.
    foreach my $md5filelist (values(%md5files)){
        if (@{$md5filelist} > 1){
            ask_and_delete(checked_filelist($md5filelist), $defaultselection);
        }
    }
}
print "\n";
exit;

# Return the MD5 hex digest of the file at the given path, or nothing if the
# file cannot be opened or read (a warning is printed in that case).
# The file is read directly instead of going through a shell command, so
# file names containing quotes, '$' or backticks are handled safely.
sub md5_of_file{
    my $path = shift;
    my $fh;
    unless (open($fh, '<', $path)){
        warn "\ncannot open $path: $!\n";
        return;
    }
    binmode($fh);
    # addfile() dies on read errors; turn that into a skipped file.
    my $digest = eval { Digest::MD5->new->addfile($fh)->hexdigest };
    warn "\ncannot read $path: $@" unless defined $digest;
    close($fh);
    return $digest;
}
sub get_files{
    # Record every non-empty, non-ignored regular file from the given list in
    # the file-level %files hash (keyed by file size), printing one dot per
    # recorded file.  Directories are descended into when $recursive is 'true'.
    #
    # Parameter: reference to an array of paths (files and/or directories).
    # Returns nothing useful; the result is the content of %files.
    my $filelist = shift;
    foreach my $file (@{$filelist}){
        if ( (-f $file) and not (inignorelist($file))){
            my $filesize = -s $file;
            if (defined $filesize and $filesize > 0){
                push(@{$files{$filesize}}, $file);
                print '.';
            }
        }
        if ( (-d $file) && ($recursive eq 'true') ){
            # Read the directory directly instead of building a glob pattern
            # from its name: names containing glob metacharacters or quotes
            # would otherwise be expanded wrongly.
            my $dh;
            unless (opendir($dh, $file)){
                warn "\ncannot read directory $file: $!\n";
                next;
            }
            # Hidden entries are skipped, as the former '*' pattern did.
            # Symlinks are skipped as well: following them can reach the same
            # file under several paths (or loop), and such aliases would then
            # be offered for deletion as "duplicates" of the real file.
            my @entries = grep { $_ !~ /^\./ && !-l join('/', $file, $_) }
                          readdir($dh);
            closedir($dh);
            my @recfilelist = map { join('/', $file, $_) } sort @entries;
            get_files(\@recfilelist);
        }
    }
}
sub checked_filelist{
    # Normalize the file names in a list and filter it.
    #
    # Parameter: reference to an array of file names.
    # Returns:   reference to a new array containing, in their original order,
    #            the normalized names that refer to existing regular files,
    #            each name at most once.
    #
    # The caller's array is left untouched.
    my $filelist = shift;
    my @result = ();
    my %seen;
    foreach my $item (@{$filelist}){
        # Work on a copy: the loop variable aliases the caller's element.
        # Collapse any run of slashes (not just pairs) into a single one.
        (my $path = $item) =~ s{/{2,}}{/}g;
        next unless -f $path;
        next if $seen{$path}++;
        push(@result, $path);
    }
    return \@result;
}
sub inignorelist{
    # Tell whether a file name matches one of the patterns in the file-level
    # @ignorepatterns list.  Returns 1 on the first match, 0 if none matches.
    my ($path) = @_;
    for my $ignore (@ignorepatterns){
        if ($path =~ $ignore){
            return 1;
        }
    }
    return 0;
}
sub ask_and_delete{
    # Show a numbered menu of duplicate files, read the user's choice from
    # STDIN and delete every file except the chosen one.
    #
    # Parameters:
    #   $filelist         - reference to an array of file names
    #   $defaultselection - menu entry used when the user enters an empty
    #                       line or STDIN is at end of file; 0 = keep all
    #
    # Nothing happens unless the list holds at least two files.  Entry 0
    # keeps all files; entry N (1-based) keeps file N and unlinks the others.
    my $filelist = shift;
    my $defaultselection = shift;
    my $count = @{$filelist};
    return unless $count > 1;

    # A default outside the menu must never be accepted: with no entry to
    # keep, every file of the group would be deleted.  Fall back to keep-all.
    unless (defined $defaultselection
            and $defaultselection =~ /^[0-9]+$/
            and $defaultselection <= $count){
        $defaultselection = 0;
    }

    print "\nChoose which file to keep\n";
    print "[0]\tkeep all\n";
    for my $i (1 .. $count){
        print "[$i]\t", $filelist->[$i-1], "\n";
    }

    my $selection = $defaultselection;
    my $accepted = 0;
    while (not $accepted){
        print "> [$defaultselection] \t";
        my $answer = <STDIN>;
        if (not defined $answer){
            # End of input: behave as if the default had been confirmed.
            $selection = $defaultselection;
            last;
        }
        chomp $answer;
        if ($answer eq ''){
            $selection = $defaultselection;
            $accepted = 1;
        } elsif ($answer =~ /^[0-9]+$/ and $answer <= $count){
            $selection = $answer;
            $accepted = 1;
        }
        # Anything else is rejected and the prompt is shown again.
    }

    if ($selection > 0){
        for my $i (1 .. $count){
            my $file_to_delete = $filelist->[$i-1];
            if ($i != $selection){
                print "unlinking $file_to_delete\n";
                unlink($file_to_delete)
                    or warn "could not unlink $file_to_delete: $!\n";
            } else {
                print "keeping $file_to_delete\n";
            }
        }
    } else {
        print "keeping all\n";
    }
    return;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment