Last active
June 9, 2019 22:11
-
-
Save STrRedWolf/bc909da9a72789c6137bddb2d1715442 to your computer and use it in GitHub Desktop.
Linux-side deduplication
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl
# Duplicate finder. Moves dupes in what directories you've specified to a
# dedicated "Trash" directory for you to remove.
# Double-checks those MD5 hashes with a byte-for-byte cmp before moving.
#
# Example of usage:
#   find 00.Master -type f -print0 | xargs -0 md5sum > master.hash.txt
#   ...etc... (repeat above line for other directories to dedupe against)
#   cat *.hash.txt | sort | uniq -D -w 32 > dupes.txt
#   perl cleardupes.pl dupes.txt | tee log.txt
#   Your dupes are in 0.Trash
#
# Input: lines of "<hash><whitespace><path>".  Blank lines and lines starting
# with '#' are ignored.  The first path seen for a hash is kept; every later
# path with the same hash is compared against it and, if identical, moved to
# 0.Trash/<dirname of path>.
#
# Log prefixes on stdout:
#   "# " informational, "> " file moved, "! " something needs attention.
use strict;
use warnings;
use Cwd qw(abs_path);
use File::Basename;

$| = 1;    # Unbuffered output so the log stays in order when piped through tee.

my $TRASH = "0.Trash";

# Maps each hash to the first path seen with that hash (the copy we keep).
my %first_path_for;

# True when both names refer to the same path on disk (identical strings, or
# identical after resolving symlinks and "./" / "../" components).  Hard links
# deliberately do NOT count: trashing one link leaves the other intact.
sub is_same_path {
    my ($first, $second) = @_;
    return 1 if $first eq $second;
    my $real_first  = abs_path($first);
    my $real_second = abs_path($second);
    return 0 unless defined $real_first && defined $real_second;
    return $real_first eq $real_second ? 1 : 0;
}

# Runs "cmp -s" on two files and returns its exit status:
# 0 = identical, 1 = different, >1 = cmp ran into trouble.
# Dies if cmp cannot be started or is killed by a signal.
sub cmp_exit_status {
    my ($first, $second) = @_;
    system('cmp', '-s', '--', $first, $second);
    if ($? == -1) {
        die "cmp failed: $!";
    }
    elsif ($? & 127) {
        die sprintf("child died w/signal %d, %s coredump",
            $? & 127, ($? & 128) ? 'with' : 'without');
    }
    return $? >> 8;
}

# Runs an external command (list form, no shell); true when it exits 0.
sub run_ok {
    my @command = @_;
    return system(@command) == 0;
}

unless (-d $TRASH) {
    mkdir $TRASH or die "cannot create $TRASH: $!";
}

LINE: while (my $line = <>) {
    chomp $line;
    next LINE if $line =~ /^$/;
    next LINE if $line =~ /^#/;

    # (\S.*) rather than (\S.+): a one-character file name is still a file.
    next LINE unless $line =~ /^(\S+)(\s+)(\S.*)$/;
    my ($hash, $separator, $file) = ($1, $2, $3);

    # md5sum binary mode writes "<hash> *<name>"; the '*' is a mode flag,
    # not part of the name.  Text mode uses two spaces, so a literal leading
    # '*' in a file name is left alone.
    $file =~ s/^\*(?=.)// if $separator eq ' ';

    unless (exists $first_path_for{$hash}) {
        $first_path_for{$hash} = $file;
        print "# First: $file\n";
        next LINE;
    }

    unless (-e $file) {
        print "# Taken already: $file\n";
        next LINE;
    }

    my $keeper = $first_path_for{$hash};

    # A path listed twice would compare equal to itself and the only copy
    # would end up in the trash.  Never move the keeper.
    if (is_same_path($keeper, $file)) {
        print "# Same file as first, skipped: $file\n";
        next LINE;
    }

    # Likely:
    print "# Likely: $file\n";
    my $status = cmp_exit_status($keeper, $file);
    if ($status == 1) {
        print "! Total compare fail (false positive):\n";
        print "! Src: $keeper\n";
        print "! Dst: $file\n";
        next LINE;
    }
    elsif ($status > 1) {
        # cmp could not compare (e.g. the first copy is missing or
        # unreadable); that is not evidence the files differ, but it is
        # certainly not proof they match, so leave the file where it is.
        print "! Compare error (cmp exit status $status):\n";
        print "! Src: $keeper\n";
        print "! Dst: $file\n";
        next LINE;
    }

    # Move it, recreating the file's directory layout under the trash.
    my $trash_dir = "$TRASH/" . dirname($file);
    print "> Extra: $file\n";
    unless (run_ok('mkdir', '-p', '--', $trash_dir)
        && run_ok('mv', '--', $file, $trash_dir))
    {
        print "! Move failed: $file\n";
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Remove every empty directory below the current one, including directories
# that only become empty once their empty subdirectories are gone.
#
# -depth visits a directory after its contents, so one pass is enough and no
# retry loop is needed.  The previous loop re-ran until find reported no empty
# directories, which never terminates if some empty directory cannot be
# removed (permissions, or "." itself once everything under it is gone).
# -mindepth 1 keeps "." out of consideration.
find . -mindepth 1 -depth -type d -empty -delete
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment