Skip to content

Instantly share code, notes, and snippets.

@STrRedWolf
Last active June 9, 2019 22:11
Show Gist options
  • Save STrRedWolf/bc909da9a72789c6137bddb2d1715442 to your computer and use it in GitHub Desktop.
Save STrRedWolf/bc909da9a72789c6137bddb2d1715442 to your computer and use it in GitHub Desktop.
Linux-side deduplication
#!/usr/bin/perl
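# Duplicate finder. Reads a list of "hash  path" lines and moves every later file whose
# content matches an earlier one into a dedicated "0.Trash" directory for you to review and remove.
# MD5 matches are double-checked with a byte-for-byte cmp before anything is moved.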
#
# Example of usage:
# find 00.Master -type f -print0 | xargs -0 md5sum > master.hash.txt
# ...etc... (repeat above line for other directories to dedupe against)
# cat *.hash.txt | sort | uniq -D -w 32 > dupes.txt
# perl cleardupes.pl dupes.txt | tee log.txt
# Your dupes are in 0.Trash
use strict;
use warnings;
use File::Basename;

# Autoflush STDOUT so progress shows up immediately (e.g. when piped to tee).
$| = 1;

# Directory (relative to the current directory) that receives the duplicates.
my $trash_dir = '0.Trash';

# parse_hash_line($line)
#
# Splits one chomped line of the dupes list into (hash, path).
# Returns an empty list when the line is not recognisable.
#
# The native md5sum layout is tried first: an optional leading "\" (which
# md5sum emits when the file name had to be escaped), the hex digest, ONE
# space, a mode character (" " for text, "*" for binary) and then the file
# name verbatim.  Parsing it exactly keeps leading blanks and one-character
# names intact and strips the "*" binary marker.  Anything else falls back
# to the looser "hash <whitespace> name" form.
sub parse_hash_line {
    my ($line) = @_;

    if ($line =~ /\A(\\?)([0-9A-Fa-f]+) [ *](.+)\z/) {
        my ($escaped, $hash, $name) = ($1, $2, $3);
        if ($escaped) {
            # Undo md5sum's escaping of backslash, newline and carriage return.
            my %plain_for = ('\\' => '\\', 'n' => "\n", 'r' => "\r");
            $name =~ s/\\([\\nr])/$plain_for{$1}/g;
        }
        return ($hash, $name);
    }

    if ($line =~ /\A(\S+)\s+(\S.*)\z/) {
        return ($1, $2);
    }

    return;
}

# is_same_entry($first, $second)
#
# True when both paths certainly name the very same directory entry, i.e.
# moving $second would also take $first away.  That is the case when the
# strings are equal, or when both resolve to the same device/inode and that
# inode has only a single link (so the two spellings cannot be distinct
# hard links).
sub is_same_entry {
    my ($first, $second) = @_;

    return 1 if $first eq $second;

    my @first_stat  = stat($first);
    my @second_stat = stat($second);
    return 0 unless @first_stat && @second_stat;

    return 0 unless $first_stat[0] == $second_stat[0]
                 && $first_stat[1] == $second_stat[1];

    return $first_stat[3] < 2 ? 1 : 0;
}

unless (-d $trash_dir) {
    mkdir $trash_dir or die "Cannot create $trash_dir: $!";
}

# hash => first path seen with that hash; this is the copy that is kept.
my %hv;

LINE: while (my $line = <>) {
    chomp $line;
    next LINE if $line eq '' or $line =~ /^#/;

    my ($h, $f) = parse_hash_line($line);
    next LINE unless defined $f;

    unless (exists $hv{$h}) {
        $hv{$h} = $f;
        print "# First: $f\n";
        next LINE;
    }

    unless (-e $f) {
        print "# Taken already: $f\n";
        next LINE;
    }

    # Likely:
    print "# Likely: $f\n";
    my $s = $hv{$h};

    # A file always compares equal to itself; moving it would remove the
    # only copy from its place, so keep it.
    if (is_same_entry($s, $f)) {
        print "# Same file as first copy, kept: $f\n";
        next LINE;
    }

    # Byte-for-byte check; "--" protects against paths starting with "-".
    system('cmp', '-s', '--', $s, $f);
    if ($? == -1) {
        die "cmp failed: $!";
    }
    elsif ($? & 127) {
        die sprintf("child died w/signal %d, %s coredump",
            $? & 127, ($? & 128) ? 'with' : 'without');
    }

    # cmp exit status: 0 = identical, 1 = different, >1 = trouble.
    my $rv = $? >> 8;
    if ($rv == 1) {
        print "! Total compare fail (false positive):\n";
        print "! Src: $s\n";
        print "! Dst: $f\n";
        next LINE;
    }
    if ($rv) {
        print "! Could not compare (cmp exit status $rv), left in place:\n";
        print "! Src: $s\n";
        print "! Dst: $f\n";
        next LINE;
    }

    # Move it, recreating the file's directory path below the trash.
    my $p    = dirname($f);
    my $dest = "$trash_dir/$p";
    print "> Extra: $f\n";

    if (system('mkdir', '-p', '--', $dest) != 0) {
        print "! Could not create $dest, left in place: $f\n";
        next LINE;
    }
    if (system('mv', '--', $f, $dest) != 0) {
        print "! Move failed, left in place: $f\n";
    }
}
#!/bin/bash
# Remove every empty directory below the current directory, including
# directories that only become empty once their empty children are gone.
#
# -depth visits a directory's contents before the directory itself, so the
# whole cascade is handled in a single pass.  -mindepth 1 keeps "." out of
# the result: it can never be removed with rmdir, and repeatedly retrying it
# (or any other directory that cannot be removed) would loop forever.
find . -mindepth 1 -depth -type d -empty -delete
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment