Skip to content

Instantly share code, notes, and snippets.

@waffle2k
Last active December 31, 2015 13:39
Show Gist options
  • Save waffle2k/7994491 to your computer and use it in GitHub Desktop.
Rsync a remote dir locally, compare it against a previous backup, and replace all identical files with hardlinks to reduce the space used.
#!/usr/bin/perl
use strict;
use warnings;

package MAIN;

use Digest::MD5;
use File::Find;
use File::Path qw(make_path);
use POSIX qw(strftime);
use Data::Dumper;
# Verbosity flag: set the VERBOSE environment variable to any true value
# to get progress/diagnostic output on STDOUT.
my $VERBOSE = $ENV{'VERBOSE'} || 0;
sub rsync {
    # Copy $source into $target with `rsync -cav` (checksum compare,
    # archive mode, verbose).
    # NOTE: the arguments are interpolated into a shell command line on
    # purpose -- the caller passes the compound argument
    # "--delete <remote><path>/" and relies on shell word-splitting --
    # so they must never contain untrusted or shell-special characters.
    my $source = shift || die "rsync: source argument required";
    my $target = shift || die "rsync: target argument required";
    print "[I] [rsync -cav $source $target]\n" if $VERBOSE;
    `rsync -cav $source $target`;
    # Backticks discard rsync's output; at least surface a failed transfer
    # instead of silently ignoring the exit status.
    warn "rsync -cav $source $target exited with status $?" if $? != 0;
    return;
}
sub prep_dir {
    # Create directory $d and any missing parents, like `mkdir -p`.
    # File::Path::make_path (core) avoids a shell fork and handles paths
    # containing whitespace or shell metacharacters, which the previous
    # `mkdir -p $d` backtick call silently mangled.
    my $d = shift;
    return unless defined $d && length $d;
    make_path($d) unless -d $d;
    return;
}
sub date_format {
    # Today's date as "YYYY-MM-DD".  Computed in-process with
    # POSIX::strftime instead of forking `date +%Y-%m-%d`, which is
    # faster and independent of PATH.
    return strftime( '%Y-%m-%d', localtime );
}
sub prep_target_dir {
    # Ensure every directory found below $d exists (via prep_dir).
    # Fix: the output of `find` keeps its trailing newlines; they must be
    # chomped, otherwise each created path carries a stray "\n".
    my $d = shift;
    my @dirs = `find $d -type d`;
    chomp @dirs;
    prep_dir($_) for @dirs;
    return;
}
sub extract_leading_dir {
    # Strip the leading directory prefix $extract from $from and return the
    # remainder, e.g. ('/backups/2015-12-31', '/backups/2015-12-31/a.txt')
    # -> '/a.txt'.  $from is returned unchanged when it does not start
    # with $extract.
    my $extract = shift;
    my $from    = shift;
    # \Q...\E quotes regex metacharacters so prefixes such as "a+b" or
    # "2015.12.31" are matched literally instead of as patterns.
    $from =~ s/^\Q$extract\E(.*)/$1/;
    return ($from);
}
sub md5digest {
    # Hex MD5 digest of a regular file's contents, or undef when $filename
    # is missing, not a plain file, or cannot be opened.
    my $filename = shift;
    return undef unless defined $filename && -f $filename;
    # 3-arg open: the filename can never be misread as a mode or pipe.
    open my $fd, '<', $filename or return undef;
    binmode($fd);    # digest raw bytes, not line-ending-translated text
    my $digest = Digest::MD5->new->addfile($fd)->hexdigest;
    close $fd;
    return $digest;
}
sub map_filename_to_md5digest {
    # Render one "digest<TAB>filename" record for the given path.
    my ($filename) = @_;
    my $digest = md5digest($filename);
    return $digest . "\t$filename";
}
sub md5sum {
    # Walk every directory argument and collect a "digest\tfilename"
    # record for each non-directory entry found beneath them.
    my @roots = grep { -d } @_;
    my @records;
    my $collect = sub {
        my $path = $File::Find::name;
        return if -d $path;
        push @records, map_filename_to_md5digest($path);
    };
    find( $collect, @roots );
    return @records;
}
# ---------------------------------------------------------------------------
# Main program.
# Usage:  RSYNC='user@host:' script.pl <remote-source-dir> <local-backup-root>
# Creates <root>/YYYY-MM-DD, seeds it from the newest previous backup, rsyncs
# the remote tree into it, then hardlinks files unchanged since the previous
# backup to reclaim the duplicated disk space.
# ---------------------------------------------------------------------------
my $source = shift || die "Please provide a source directory";
my $target = shift || die "Please provide a target directory";
# RSYNC carries the remote half of the rsync source (e.g. "user@host:").
my $rsync_cred = $ENV{'RSYNC'} or die 'Please provide the environment variable RSYNC with something like "[email protected]:"';
# Append slashes to the paths..
$source .= "/" unless ( $source =~ /\/$/ );
$target .= "/" unless ( $target =~ /\/$/ );
# Check if there's an existing dir that we'll
# seed from
# Date-named directories sort lexically == chronologically, so `sort|tail -1`
# yields the most recent backup (or $target itself when none exist yet).
my $previous_backup = `find $target -maxdepth 1 -type d | sort | tail -1`;
chomp ($previous_backup);
# Keep the candidate only if its name ends in a YYYY-MM-DD date.
# NOTE(review): the trailing \\? also tolerates one literal backslash before
# end-of-string -- presumably a quoting artifact; confirm it is intentional.
$previous_backup = '' unless ( $previous_backup =~ /\d{4}-\d{2}-\d{2}\\?$/ );
my $base_target = $target;
$target .= date_format();
# Don't clobber with duplicates
exit (0) if ( -d "$target" );
# Remove duplicate slashes
$source =~ s/\/+/\//g;
$target =~ s/\/+/\//g;
prep_dir ($target);
# Seed today's directory from the newest previous backup so the remote rsync
# below only has to transfer files that actually changed.
if ( -d "$previous_backup" ){
print "previous [$previous_backup] [$target]\n" if $VERBOSE;
if ($previous_backup ne $target){
print "Seeding from $previous_backup ..\n" if $VERBOSE;
rsync ($previous_backup . '/' => $target);
}
}
# Pull the remote tree; --delete drops files that vanished upstream.  The
# flag rides inside the first argument and relies on shell word-splitting
# inside rsync().
rsync( "--delete " . $rsync_cred . $source . "/" => $target );
# Now compare the last two backups, creating hardlinks between them
# for every file that's the same
my @results = `find $base_target -maxdepth 1 -type d | sort | tail -2`;
chomp( $_ ) for (@results);
if (scalar @results == 2){
if ( -d $results[0] and -d $results[1] ){
# $s maps the older backup's relative path => md5; $t the newer backup's.
my $s = {};
my $t = {};
my @hashres = md5sum( $results[0] );
for (@hashres){
# Records are "digest\tfilename".  NOTE(review): splitting on /\s+/
# truncates filenames containing whitespace -- confirm that is acceptable.
my ($hash,$filename) = split( /\s+/, $_ );
# Reduce to a path relative to the backup root so the two trees compare.
$filename = extract_leading_dir( $results[0] => $filename );
$s->{"$filename"} = $hash;
}
@hashres = md5sum( $results[1] );
for (@hashres){
my ($hash,$filename) = split( /\s+/, $_ );
$filename = extract_leading_dir( $results[1] => $filename );
$t->{"$filename"} = $hash;
}
# Go through every file in $t, and check if it also
# exists in $s, and if the hashes match. If so, then create
# a hardlink
for my $file (keys %$t){
if (defined $s->{"$file"} and ( $s->{"$file"} eq $t->{"$file"} ) ){
my ($sfn, $tfn) = ( $results[0] . "/$file", $results[1] . "/$file" );
# Collapse doubled slashes introduced by the concatenation above.
$sfn =~ s,//,/,g;
$tfn =~ s,//,/,g;
if ( -f $sfn and -f $tfn ){
# Replace the fresh copy with a hardlink to the identical older file.
unlink $tfn;
print "Linking [$sfn] => [$tfn]\n" if $VERBOSE;
link( $sfn, $tfn ) or warn( $! );
}
}
}
}
}
exit (0);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment