Created
October 1, 2017 12:33
-
-
Save STrRedWolf/77e7bb2cdcab35766e89917f4b18352e to your computer and use it in GitHub Desktop.
Deduplication tool
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#/usr/bin/perl | |
#use Digest::FarmHash qw(farmhash64); | |
use File::Find; | |
use DBI; | |
use Digest::SHA qw(sha256_hex); | |
#use bigint; | |
$|=1; | |
# Connect... | |
my $dbh = DBI->connect("DBI:mysql:database=tygris:host=localhost", | |
"tygris","lunataur") || die "$!"; | |
$dbh->{RaiseError} = 1; | |
my $prefix=`pwd`; | |
chomp $prefix; | |
# Find what's in this directory. | |
#find(\&hashthis, "1.Import"); | |
find(\&hashthis, "00.Master"); | |
# main routine for each file... | |
sub hashthis | |
{ | |
return unless -f; | |
my $file = $_; | |
my $path = $File::Find::dir; | |
my @s = stat; | |
my $mtime = $s[9]; | |
my $fsize = $s[7]; | |
# Which file are we on... | |
my $filename=$File::Find::name; | |
# print $filename, "\n"; | |
# Is it already registered? | |
my $id; | |
my @row=$dbh->selectrow_array("SELECT fileid FROM dedupe_files WHERE name=? AND path=?", | |
undef, $file, $path); | |
if($row[0]) { | |
$dbh->do("UPDATE dedupe_files SET mtime=? WHERE fileid=?",undef,$row[0],$mtime); | |
print "Skip: $filename\n"; | |
return; | |
} | |
print $filename,"\n"; | |
# Now hash it. Do an overall hash. | |
my $bighash = Digest::SHA->new(256); | |
open(IN,"<",$file) || die "Can't open < $file: $!"; | |
my $buff; | |
while(read IN,$buff,4096*1024) # 4M chunks | |
{ | |
$bighash->add($buff); | |
} | |
# Now check if we hashed an exact dupe. | |
# Match it by file size and hash | |
@row=$dbh->selectrow_array("SELECT MIN(fileid) FROM dedupe_files WHERE size=? AND hash=?", | |
undef, $fsize, $bighash); | |
unless($row[0]) { # No collision... | |
$dbh->do("INSERT INTO dedupe_files (name,path,mtime,size,hash) VALUES (?,?,?,?,?)", | |
undef, $file, $path, $mtime, $fsize, $bighash); | |
# print "Reg'd: $filename\n"; | |
return; | |
} | |
# at this point, the file size and hash collided, so verify byte-by-byte. | |
my $srcid=$row[0]; | |
@row=$dbh->selectrow_array("SELECT fileid, name, path FROM dedupe_files WHERE fileid=?", | |
undef, $srcid); | |
my $srcfile=$prefix."/".$row[2]."/".$row[1]; | |
print "Likey dupe w/$srcfile\n"; | |
system('cmp','-s',$srcfile,$file); | |
if($? == -1) { | |
die "cmp failed: $!"; | |
} elsif($? & 127) { | |
die sprintf("child died w/signal %d, %s coredump", | |
$? & 127, ($? & 128) ? 'with' : 'without'); | |
} else { | |
my $rv=$? >> 8; | |
if($rv) { # cmp exit(1)'s when different, so skip. | |
print "### cmp says no match ($rv)\n"; | |
$dbh->do("INSERT INTO dedupe_files (name,path,mtime,size,hash) VALUES (?,?,?,?,?)", | |
undef, $file, $path, $mtime, $fsize, $bighash); | |
return; | |
} | |
} | |
# print "Exact match.\n"; | |
# It's an exact match. Ugh. Toss it. | |
# toss($id,$filename,$path); | |
print "DUPE: ",$filename,"\n"; | |
} | |
### Toss files away (into a dedicated trash folder. We'll nuke 'em later) | |
sub toss { | |
my $id=shift; | |
my $filename=shift; | |
my $path=shift; | |
my $dest='zz.Trash/'.$path; | |
system('mkdir','-p',$dest); | |
system('mv',$filename,$dest); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment