Last active
December 31, 2015 13:39
-
-
Save waffle2k/7994491 to your computer and use it in GitHub Desktop.
Rsync a remote dir locally, compare it against a previous backup, and replace all identical files with hardlinks to reduce the space used.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl | |
use strict; | |
package MAIN; | |
use Digest::MD5; | |
use File::Find; | |
use Data::Dumper; | |
# Verbosity switch: the progress messages in this script are printed only
# when the VERBOSE environment variable holds a true value (defaults to 0).
my $VERBOSE = $ENV{'VERBOSE'} || 0; | |
# Run `rsync -cav SOURCE TARGET` through the shell.
#
# Arguments:
#   $source - rsync source; may carry leading options (see below)
#   $target - rsync destination
#
# Returns the captured rsync output: the list of lines in list context, one
# joined string in scalar context (the same as the bare backticks it wraps).
#
# Dies when either argument is missing or false.  A failed rsync run is
# reported with warn() rather than silently ignored; it is deliberately not
# fatal so that callers keep their previous control flow.
#
# Both arguments are interpolated unquoted on purpose: the main script passes
# "--delete user@host:/path/" as $source and relies on the shell splitting it
# into words.  NOTE(review): this means paths containing whitespace or shell
# metacharacters are not safe here.
sub rsync {
    my $source = shift || die "rsync: missing source argument\n";
    my $target = shift || die "rsync: missing target argument\n";

    my $command = "rsync -cav $source $target";
    print "[I] [$command]\n" if $VERBOSE;

    my @output = `$command`;
    if ( $? == -1 ) {
        warn "[E] could not run [$command]: $!\n";
    }
    elsif ( $? != 0 ) {
        warn sprintf "[E] [%s] failed (exit code %d, signal %d)\n",
            $command, $? >> 8, $? & 127;
    }

    return wantarray ? @output : join( '', @output );
}
# Create directory $d together with any missing parent directories.
#
# Uses File::Path::make_path instead of shelling out to `mkdir -p`, so paths
# containing spaces or shell metacharacters are handled literally.
#
# Returns 1 when the directory exists afterwards, 0 otherwise (including when
# $d is undefined or empty).  Creation errors are reported with warn().
sub prep_dir {
    my $d = shift;
    return 0 unless defined $d;

    # Callers may hand over un-chomped lines of command output; the old shell
    # invocation tolerated the trailing newline, so keep tolerating it.
    $d =~ s/\n+\z//;
    return 0 unless length $d;
    return 1 if -d $d;

    require File::Path;
    File::Path::make_path( $d, { error => \my $errors } );
    for my $diag ( @{ $errors || [] } ) {
        my ( $path, $message ) = %$diag;
        warn "prep_dir: cannot create [" . ( length $path ? $path : $d ) . "]: $message\n";
    }

    return -d $d ? 1 : 0;
}
# Return today's local date formatted as YYYY-MM-DD.
#
# Uses the core POSIX::strftime instead of forking the external `date`
# command; the format string is the same one the command was given.
sub date_format {
    require POSIX;
    return POSIX::strftime( '%Y-%m-%d', localtime(time) );
}
# Walk the tree below $d and call prep_dir() on every directory found in it
# ($d itself included).
#
# The walk uses File::Find rather than a shell `find`, so $d is never
# interpolated into a command line and the collected paths carry no trailing
# newline.  Symlinks are skipped, matching what `find -type d` reported.
#
# NOTE(review): every collected path already exists when prep_dir() is called
# on it, so this looks like a no-op in practice; nothing visible in this file
# calls it - confirm before relying on it.
sub prep_target_dir {
    my $d = shift;
    return unless defined $d && -d $d;

    my @dirs;
    find(
        {
            no_chdir => 1,    # keep $File::Find::name valid for relative $d
            wanted   => sub {
                my $path = $File::Find::name;
                push @dirs, $path if -d $path && !-l $path;
            },
        },
        $d
    );

    prep_dir($_) for @dirs;
    return;
}
# Strip the leading path $extract from $from and return what is left.
#
# Arguments:
#   $extract - the prefix to remove
#   $from    - the path to remove it from
#
# $from is returned unchanged when it does not start with $extract.  The
# prefix is matched literally (\Q..\E): directory names routinely contain
# regex metacharacters such as "." or "+", which previously either matched
# the wrong text or failed to match at all.
sub extract_leading_dir {
    my ( $extract, $from ) = @_;
    $from =~ s/\A\Q$extract\E//;
    return $from;
}
# Return the hex MD5 digest of the contents of $filename.
#
# Returns undef when $filename is undefined or is not a regular file
# (directories, missing paths, ...).  Dies when a regular file cannot be
# opened; read errors raised by Digest::MD5 propagate as well.
#
# The file is opened with three-argument open so that names with leading or
# trailing whitespace, or starting with characters such as ">" or "|", are
# treated as plain file names.
sub md5digest {
    my $filename = shift;

    # Explicit undef (not a bare return) keeps the one-element result that
    # existing list-context callers may depend on.
    return undef unless defined $filename && -f $filename;

    open my $fd, '<', $filename
        or die "md5digest: cannot open [$filename]: $!\n";
    binmode($fd);

    my $digest = Digest::MD5->new->addfile($fd)->hexdigest;
    close $fd;

    return $digest;
}
# Build one "<digest><TAB><filename>" record for $filename, where the digest
# is whatever md5digest() returns for it.  When md5digest() returns undef the
# record starts directly with the tab.
sub map_filename_to_md5digest {
    my ($filename) = @_;
    my $digest = md5digest($filename);
    return "$digest\t$filename";
}
# Produce "<digest><TAB><path>" records (see map_filename_to_md5digest) for
# every non-directory entry found below the given directories.
#
# Arguments that are not existing directories are ignored.  Returns the list
# of records (its element count in scalar context).
#
# File::Find is run with no_chdir: by default it chdir()s into each directory
# it visits, which makes $File::Find::name unusable for file tests and open()
# whenever a start directory was given as a relative path - every digest then
# silently came back undefined.
sub md5sum {
    my @dirs    = grep { defined $_ && -d $_ } @_;
    my @hashres = ();
    return @hashres unless @dirs;

    find(
        {
            no_chdir => 1,
            wanted   => sub {
                my $path = $File::Find::name;
                return if -d $path;
                push @hashres, map_filename_to_md5digest($path);
            },
        },
        @dirs
    );

    return @hashres;
}
# ---------------------------------------------------------------------------
# Main program
#
# Usage:  RSYNC=user@host: [VERBOSE=1] <script> SOURCE_DIR TARGET_DIR
#
#   1. Find the newest dated backup (TARGET_DIR/YYYY-MM-DD), if any.
#   2. Create TARGET_DIR/<today>; exit quietly if it already exists.
#   3. Seed the new directory from the previous backup, then rsync the
#      remote source over it with --delete.
#   4. Compare the two newest dated backups by MD5 and replace every
#      identical file in the newer one with a hardlink to the older one.
# ---------------------------------------------------------------------------
my $source = shift || die "Please provide a source directory";
my $target = shift || die "Please provide a target directory";
my $rsync_cred = $ENV{'RSYNC'} or die 'Please provide the environment variable RSYNC with something like "user@host:"';

# Append slashes to the paths..
$source .= "/" unless ( $source =~ /\/$/ );
$target .= "/" unless ( $target =~ /\/$/ );

my $base_target = $target;

# Sorted list of the dated backup directories directly below $base_target.
# Read with opendir instead of `find | sort | tail`, so the path never goes
# through a shell and neither $base_target itself nor unrelated directories
# can be mistaken for a backup.  Symlinks are skipped, as `find -type d` did.
my $list_backups = sub {
    opendir( my $dh, $base_target ) or return;
    my @dated = grep {
        /\A\d{4}-\d{2}-\d{2}\z/
            && -d $base_target . $_
            && !-l $base_target . $_
    } readdir $dh;
    closedir $dh;
    return map { $base_target . $_ } sort @dated;
};

# Check if there's an existing dir that we'll
# seed from
my @existing        = $list_backups->();
my $previous_backup = @existing ? $existing[-1] : '';

$target .= date_format();

# Don't clobber with duplicates
exit(0) if ( -d $target );

# Remove duplicate slashes
$source =~ s/\/+/\//g;
$target =~ s/\/+/\//g;

prep_dir($target);

if ( length $previous_backup && -d $previous_backup ) {
    print "previous [$previous_backup] [$target]\n" if $VERBOSE;
    if ( $previous_backup ne $target ) {
        print "Seeding from $previous_backup ..\n" if $VERBOSE;
        rsync( $previous_backup . '/' => $target );
    }
}

# "--delete" rides along in the source argument; rsync() hands the whole
# string to the shell, which splits it into option and path.
rsync( "--delete " . $rsync_cred . $source . "/" => $target );

# Map "path relative to the backup root" => "md5 hex digest" for one backup.
my $index_backup = sub {
    my $root = shift;
    my %digest_for;
    for my $record ( md5sum($root) ) {

        # Records are "<digest>\t<path>".  Split on the first tab only, so
        # that file names containing whitespace stay intact.
        my ( $hash, $filename ) = split( /\t/, $record, 2 );

        # Skip entries without a digest: two "unknown" digests must never
        # compare as equal, or changed files would be linked to old content.
        next unless defined $filename && defined $hash && length $hash;

        $digest_for{ extract_leading_dir( $root => $filename ) } = $hash;
    }
    return \%digest_for;
};

# Now compare the last two backups, creating hardlinks between them
# for every file that's the same
my @results = $list_backups->();
@results = @results[ -2, -1 ] if @results > 2;

if ( scalar @results == 2 ) {

    # Absolute roots keep every path valid regardless of the current
    # working directory while the trees are being walked.
    require File::Spec;
    my ( $older, $newer ) = map { File::Spec->rel2abs($_) } @results;

    if ( -d $older and -d $newer ) {
        my $s = $index_backup->($older);
        my $t = $index_backup->($newer);

        # Go through every file in $t, and check if it also
        # exists in $s, and if the hashes match. If so, then create
        # a hardlink
        for my $file ( keys %$t ) {
            next unless defined $s->{$file} and $s->{$file} eq $t->{$file};

            my ( $sfn, $tfn ) = ( "$older/$file", "$newer/$file" );
            s{/+}{/}g for $sfn, $tfn;
            next unless -f $sfn and -f $tfn;

            # Nothing to do when both names already refer to the same inode.
            my @old_id = lstat $sfn;
            my @new_id = lstat $tfn;
            next
                if @old_id
                && @new_id
                && $old_id[0] == $new_id[0]
                && $old_id[1] == $new_id[1];

            print "Linking [$sfn] => [$tfn]\n" if $VERBOSE;

            # Link under a temporary name and rename it into place, so the
            # file in the new backup survives if link() fails.
            my $tmp = "$tfn.hardlink-tmp.$$";
            unlink $tmp;    # clear a stale leftover, if any
            unless ( link( $sfn, $tmp ) ) {
                warn "Cannot link [$sfn] => [$tfn]: $!\n";
                next;
            }
            unless ( rename( $tmp, $tfn ) ) {
                warn "Cannot replace [$tfn] with hardlink: $!\n";
                unlink $tmp;
            }
        }
    }
}

exit(0);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment