Created
August 1, 2010 18:45
-
-
Save goerz/503626 to your computer and use it in GitHub Desktop.
deletedoubles.pl: Delete duplicate files, based on their md5 sum.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl -w
use strict;

# Candidate files grouped by size in bytes: size => [paths].
# Filled by get_files(); only files of equal size can be duplicates, so
# only groups with more than one member are ever checksummed.
my %files;
# Regular expressions; a file whose path matches any of them is skipped
# (see inignorelist()).
my @ignorepatterns = ();
# Menu entry used when the user just presses Enter (0 means "keep all").
my $defaultselection = 0;
# Directories are descended into only while this is the string 'true'.
my $recursive = 'true';

# Unbuffered output, so the progress dots show up while the work happens.
$| = 1;

print "Please wait while we're collecting files";
get_files(\@ARGV);
print "\n";

foreach my $filelist (values(%files)){
    next unless @{$filelist} > 1;

    # Pass 1: checksum every file of this size (md5 => [paths]).
    my %md5files = ();
    foreach my $file (@{$filelist}){
        next unless -f $file;
        print ".";

        # List-form pipe open: the file name is handed to md5sum as a
        # single argument and never passes through a shell, so quotes,
        # '$', backticks etc. in a name are harmless.  '--' keeps a name
        # that starts with '-' from being read as an option.
        my $md5line;
        if (open(my $md5pipe, '-|', 'md5sum', '--', $file)){
            $md5line = <$md5pipe>;
            close($md5pipe);
        } else {
            warn "cannot run md5sum on $file: $!\n";
        }

        # md5sum prefixes the line with a backslash when it had to escape
        # the file name.  Only the checksum is taken from the output; the
        # path we already hold is stored as-is instead of being re-parsed.
        if (defined $md5line and $md5line =~ /^\\?([0-9a-fA-F]{32})[ ]/){
            push(@{$md5files{lc $1}}, $file);
        }
    }

    # Pass 2: once the whole size group is hashed, ask exactly once per
    # set of identical files.
    foreach my $md5filelist (values(%md5files)){
        if (@{$md5filelist} > 1){
            ask_and_delete(checked_filelist($md5filelist), $defaultselection);
        }
    }
}
print "\n";
exit;
# Collect candidate files into the file-level %files hash.
#
# Takes a reference to a list of paths.  Every non-empty regular file that
# is not matched by inignorelist() is appended to $files{<size in bytes>},
# and a progress dot is printed for it.  Directories are descended into
# while $recursive is the string 'true'.  Returns nothing useful.
sub get_files{
    my $filelist = shift;
    foreach my $file (@{$filelist}){
        if ( (-f $file) and not (inignorelist($file))){
            my $filesize = -s $file;
            if ($filesize > 0){
                push(@{$files{$filesize}}, $file);
                print '.';
            }
        }
        if ( (-d $file) && ($recursive eq 'true') ){
            # Read the directory directly instead of building a glob
            # pattern: glob() would interpret '*', '?', '[', '{', '~' and
            # quotes that occur in the directory name itself.
            my $dirhandle;
            unless (opendir($dirhandle, $file)){
                warn "cannot read directory $file: $!\n";
                next;
            }
            # Like the shell pattern "*", leave out names starting with a
            # dot; sort to get a stable processing order.
            my @names = sort grep { !/^\./ } readdir($dirhandle);
            closedir($dirhandle);

            my @recfilelist;
            foreach my $name (@names){
                my $path = "$file/$name";
                # Do not descend through symbolic links to directories: a
                # link cycle, or a link back into the scanned tree, would
                # list the same file under several names and offer the
                # only real copy for deletion.
                next if (-l $path) && (-d $path);
                push(@recfilelist, $path);
            }
            get_files(\@recfilelist);
        }
    }
}
# Clean up a list of paths before it is shown to the user.
#
# Takes a reference to a list of paths and returns a reference to a NEW
# list in which
#   - every run of slashes is collapsed to a single '/',
#   - paths that are not (or no longer) regular files are dropped,
#   - a path that occurs more than once is kept only at its first position.
# The caller's list is left untouched.
sub checked_filelist{
    my $filelist = shift;
    my @result = ();
    my %seen;
    foreach my $item (@{$filelist}){
        # Normalize a copy: the loop variable aliases the caller's array
        # element, so substituting on $item would rewrite the caller's data.
        # '/{2,}' also handles three or more slashes in a row.
        (my $path = $item) =~ s{/{2,}}{/}g;
        next unless -f $path;
        next if $seen{$path}++;
        push(@result, $path);
    }
    return \@result;
}
# Tell whether a path is excluded from the scan.
#
# Returns 1 as soon as the given path matches one of the patterns in the
# file-level @ignorepatterns list, and 0 when none of them matches (which
# includes the case of an empty pattern list).
sub inignorelist{
    my ($candidate) = @_;
    # Patterns are tried in list order; the first hit ends the search.
    $candidate =~ $_ and return 1 for @ignorepatterns;
    return 0;
}
# Let the user pick which one of several identical files to keep.
#
# Parameters:
#   $filelist         - reference to the list of paths to choose from
#   $defaultselection - menu entry used when the user enters an empty line
#
# Prints a numbered menu ([0] = keep all, [1]..[n] = keep that file and
# unlink all others), reads the choice from STDIN and acts on it.  Does
# nothing when the list holds fewer than two files.
#
# The default is only honoured when it is a valid menu entry.  A default
# beyond the end of the list would match none of the files in the deletion
# loop, and so every file would be unlinked.
# End of input on STDIN is treated like an empty line; if there is no
# valid default to fall back on, all files are kept.
sub ask_and_delete{
    my $filelist = shift;
    my $defaultselection = shift;
    my $count = @{$filelist};
    return if $count < 2;

    print "\nChoose which file to keep\n";
    print "[0]\tkeep all\n";
    foreach my $i (1 .. $count){
        print "[$i]\t", $filelist->[$i-1], "\n";
    }

    my $default_ok = (defined $defaultselection
                      and $defaultselection =~ /^[0-9]+$/
                      and $defaultselection <= $count);
    my $prompt = $default_ok ? "> [$defaultselection] \t" : "> \t";

    my $selection;
    until (defined $selection){
        print $prompt;
        my $answer = <STDIN>;
        my $eof = not defined $answer;
        $answer = '' if $eof;
        chomp $answer;
        if ($answer eq ''){
            if ($default_ok){
                $selection = $defaultselection;
            } elsif ($eof){
                # Nobody is left to ask: take the harmless choice.
                $selection = 0;
            }
        } elsif ($answer =~ /^[0-9]+$/ and $answer <= $count){
            $selection = $answer;
        }
    }

    if ($selection > 0){
        foreach my $i (1 .. $count){
            my $file = $filelist->[$i-1];
            if ($i != $selection){
                print "unlinking $file\n";
                unlink($file) or warn "could not unlink $file: $!\n";
            } else {
                print "keeping $file\n";
            }
        }
    } else {
        print "keeping all\n";
    }
    return;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment