Skip to content

Instantly share code, notes, and snippets.

@s1037989
Created April 20, 2018 12:48
Show Gist options
  • Save s1037989/e31b12e3d36d09bf58a6944fb9a1f799 to your computer and use it in GitHub Desktop.
Save s1037989/e31b12e3d36d09bf58a6944fb9a1f799 to your computer and use it in GitHub Desktop.
This script deduplicates files based on md5sum. Wherever there are copies of a file, this script removes the copies and replaces them with hard links.
use String::Truncate qw(elide);
# Replace duplicate files with hard links to a single kept copy.
#
# Input:  'md5sum-sort.txt' in the current directory, one "<md5> <path>"
#         record per line. Records with the same checksum must be adjacent,
#         because only the previous record's checksum is remembered.
# Usage:  an optional first argument N stops the run after N input lines.
# Output: a progress log on STDOUT; failures are reported as "!! ..." lines.
#
# TODO: what Linux command generates the "md5\tpath" file
#       (presumably something like
#       `find DIR -type f -exec md5sum {} + | sort > md5sum-sort.txt`
#       -- confirm against how the list was actually produced)
use strict;
use warnings;

my $keeper_md5 = '';    # checksum of the group currently being processed
my $keeper;             # path of the copy that is kept for that group
my $c     = 0;          # number of input lines consumed so far
my $limit = $ARGV[0];   # optional cap on the number of lines to process

open my $md5_fh, '<', 'md5sum-sort.txt'
    or die "Cannot open md5sum-sort.txt: $!\n";

LINE:
while ( my $line = <$md5_fh> ) {
    chomp $line;
    my ( $md5sum, $file ) = split /\s+/, $line, 2;

    # Blank or truncated records carry nothing to act on.
    unless ( defined $md5sum && length $md5sum && defined $file && length $file ) {
        warn "Should never get here\n";
        next LINE;
    }

    if ( $md5sum ne $keeper_md5 ) {

        # First record of a new checksum group: it becomes the kept copy,
        # but only if it still exists as a plain file. Otherwise the next
        # record with this checksum gets the chance to become the keeper.
        next LINE unless -f $file;

        $keeper_md5 = $md5sum;
        $keeper     = $file;

        my $nlink = ( stat $keeper )[3];
        my $pad   = 100 - length $keeper;
        $pad = 0 if $pad < 0;
        printf "> %s (%s) %s %s\n",
            elide( $keeper, 100, { truncate => 'middle' } ),
            $nlink, ' ' x $pad, $keeper_md5;
        next LINE;
    }

    # Same checksum as the keeper: candidate for replacement. Only plain
    # files that are not already hard-linked elsewhere are touched.
    my @dup_stat = stat $file;
    next LINE unless @dup_stat && -f _ && $dup_stat[3] <= 1;

    # The keeper must still be there, or there is nothing to link to.
    my @keep_stat = stat $keeper;
    unless ( @keep_stat && -f _ ) {
        print "!! keeper missing, skipping:\n!! $keeper\n";
        next LINE;
    }

    # Two names for one inode (same path listed twice, or reached through a
    # symlinked directory): already deduplicated, and replacing it would
    # destroy the only copy.
    next LINE
        if $keep_stat[0] == $dup_stat[0] && $keep_stat[1] == $dup_stat[1];

    printf " (%s) unlink %s\n", $keep_stat[3],
        elide( $file, 100, { truncate => 'middle' } );
    printf " (%s) link %s, %s\n", $keep_stat[3],
        elide( $keeper, 50, { truncate => 'middle' } ),
        elide( $file,   50, { truncate => 'middle' } );

    # Create the new link under a temporary name first and then rename it
    # over the duplicate. If link() fails (permissions, different file
    # system, link-count limit, temp name already taken) the duplicate is
    # left untouched instead of being deleted with no replacement.
    my $tmp = "$file.dedup-tmp.$$";
    unless ( link $keeper, $tmp ) {
        print "!! link error $!:\n!! $keeper => $file\n";
        next LINE;
    }
    unless ( rename $tmp, $file ) {
        print "!! rename error $!:\n!! $tmp => $file\n";
        unlink $tmp;    # best effort: do not leave the stray link behind
        next LINE;
    }
}
continue {
    # Runs for processed and skipped lines alike, so every input line is
    # counted exactly once and the limit is honoured even on a skip.
    last LINE if $limit && ++$c >= $limit;
}

close $md5_fh;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment