allenday · September 3, 2018 10:07 · erandagan1000 · Sep 3, 2018
diff --git a/make_TsvHttpData.pl b/make_TsvHttpData.pl
 #!/usr/bin/perl
 #
 # Generate a 'TsvHttpData' URL list (plus a trailing mtime column) for every
 # regular file below a directory that is published over HTTP.  Run with no
 # arguments for the full usage text.
 #
 # Rows from a previous run's output are reused when a file's size and
 # modification time are unchanged, so unchanged files are not re-hashed.
 use strict;
 use warnings;

 use Digest::MD5 ();
 use File::Find ();
 use MIME::Base64 ();

 my $http_base = shift;
 my $path_base = shift;
 my $prev_file = shift;

 if ( ! $http_base || ! $path_base ) {
   print STDERR <<"HERE";
 USAGE:
 $0 <HTTP base URL> <Filesystem base path> [<Previous output file>]

 EXAMPLE:
 $0 http://my.hostname.org/ ~/public_html/sync/ ~/TsvHttpData.pre

 SYNOPSIS:
 This script is used to generate a 'TsvHttpData' URL list for Google Cloud
 Storage Transfer Service, see:

  https://cloud.google.com/storage/transfer/create-url-list

 TsvHttpData file is tab-delimited and contains 3 columns:

 * URL
 * Object size, in bytes
 * Base64 encoded MD5 checksum of object

 MD5 checksumming is I/O intense, and to improve efficience for repeated runs
 of this script, we take advantage of each file's modification time to determine
 if the previously calculated MD5 checksum can be reused, or needs to be
 recalculated.  As such, output format contains 4 columns:

 * URL
 * Object size, in bytes
 * Base64 encoded MD5 checksum of object
 * Object modification time, in seconds since Unix epoch

 TsvHttpData format can be created from this script's output like:

  cat ~/TsvHttpData.pre | cut -f 1,2,3 > ~/public_html/TsvHttpData.tsv
 HERE

   exit(1);
 }
 if ( ! -d $path_base ) {
   die "Not a directory: $path_base";
 }

 #recover previously cached sizes, MD5 checksums and modification times.
 #the cache is keyed by URL, exactly as written to the output, so no
 #URL-to-path reverse mapping (and no un-escaping) is needed on lookup.
 my %ent = ();
 if ( ! defined $prev_file || ! -f $prev_file ) {
   #the previous file is optional; without it every file is hashed
   print STDERR "no previous file\n";
 }
 elsif ( open( my $prev, '<', $prev_file ) ) {
   while ( my $line = <$prev> ) {
     next if $. == 1; #skip header
     chomp $line;
     my ( $url, $size, $md5, $mod ) = split /\t/, $line;

     #ignore truncated or hand-edited rows instead of trusting them
     next unless defined $mod && length $md5;
     next unless $size =~ /\A\d+\z/ && $mod =~ /\A\d+\z/;

     $ent{ $url } = [ $size, $md5, $mod ];
   }
   close $prev;
 }
 else {
   #an unreadable cache only costs time, so carry on without it
   warn "Cannot read $prev_file: $!\n";
 }

 #collect regular files below the base path.  symlinks are tested with
 #lstat and left out, matching what `find -type f` reports.
 my @paths;
 File::Find::find(
   {
     no_chdir => 1,
     wanted   => sub { push @paths, $_ if ! -l $_ && -f _ },
   },
   $path_base
 );

 print qq(TsvHttpData-1.0\n);

 #iterate over files currently in path to sync.
 #reuse cached data if it exists and isn't stale.
 #sorted so that repeated runs list files in the same order.
 foreach my $path ( sort @paths ) {
   #a tab or line break in the name would corrupt the tab-delimited output
   if ( $path =~ /[\t\r\n]/ ) {
     warn "Skipping file with tab or line break in name: $path\n";
     next;
   }

   #size in bytes and modification time from one stat call.
   #stat reports 0 for empty files, where -s would give an empty string.
   my ( $s, $m ) = ( stat $path )[ 7, 9 ];
   if ( ! defined $m ) {
     warn "Cannot stat $path: $!\n";
     next;
   }

   #path relative to the base; \Q..\E keeps regex metacharacters literal
   my $o = $path;
   $o =~ s/\A\Q$path_base\E//;

   #TODO make escaping more robust.
   #only the file-derived part is escaped; the base is already a URL
   $o =~ s/%/%25/g;
   $o = $http_base . $o;

   #base64 encoded md5 checksum, reused only if size and mtime both match
   my $cached = $ent{ $o };
   my $h;
   if ( $cached && $cached->[0] == $s && $cached->[2] == $m ) {
     $h = $cached->[1];
   }
   else {
     $h = md5_base64( $path );
     next unless defined $h; #unreadable file, already reported
   }

   print "$o\t$s\t$h\t$m\n";
 }

 #Return the padded Base64 encoding of the MD5 digest of a file's contents.
 #Warns and returns nothing when the file cannot be opened or read.
 sub md5_base64 {
   my ( $file ) = @_;

   my $fh;
   if ( ! open( $fh, '<:raw', $file ) ) {
     warn "Cannot read $file: $!\n";
     return;
   }

   #addfile croaks on a read error; report it and let the caller skip
   my $digest = eval { Digest::MD5->new->addfile( $fh )->digest };
   if ( ! defined $digest ) {
     warn "Cannot checksum $file: $@";
     return;
   }
   close $fh;

   #the empty second argument suppresses the trailing newline
   return MIME::Base64::encode_base64( $digest, '' );
 }
	#!/usr/bin/perl
	#
	# Generate a 'TsvHttpData' URL list (plus a trailing mtime column) for every
	# regular file below a directory that is published over HTTP.  Run with no
	# arguments for the full usage text.
	#
	# Rows from a previous run's output are reused when a file's size and
	# modification time are unchanged, so unchanged files are not re-hashed.
	use strict;
	use warnings;

	use Digest::MD5 ();
	use File::Find ();
	use MIME::Base64 ();

	my $http_base = shift;
	my $path_base = shift;
	my $prev_file = shift;

	if ( ! $http_base || ! $path_base ) {
	  print STDERR <<"HERE";
	USAGE:
	$0 <HTTP base URL> <Filesystem base path> [<Previous output file>]

	EXAMPLE:
	$0 http://my.hostname.org/ ~/public_html/sync/ ~/TsvHttpData.pre

	SYNOPSIS:
	This script is used to generate a 'TsvHttpData' URL list for Google Cloud
	Storage Transfer Service, see:

	https://cloud.google.com/storage/transfer/create-url-list

	TsvHttpData file is tab-delimited and contains 3 columns:

	* URL
	* Object size, in bytes
	* Base64 encoded MD5 checksum of object

	MD5 checksumming is I/O intense, and to improve efficience for repeated runs
	of this script, we take advantage of each file's modification time to determine
	if the previously calculated MD5 checksum can be reused, or needs to be
	recalculated. As such, output format contains 4 columns:

	* URL
	* Object size, in bytes
	* Base64 encoded MD5 checksum of object
	* Object modification time, in seconds since Unix epoch

	TsvHttpData format can be created from this script's output like:

	cat ~/TsvHttpData.pre | cut -f 1,2,3 > ~/public_html/TsvHttpData.tsv
	HERE

	  exit(1);
	}
	if ( ! -d $path_base ) {
	  die "Not a directory: $path_base";
	}

	#recover previously cached sizes, MD5 checksums and modification times.
	#the cache is keyed by URL, exactly as written to the output, so no
	#URL-to-path reverse mapping (and no un-escaping) is needed on lookup.
	my %ent = ();
	if ( ! defined $prev_file || ! -f $prev_file ) {
	  #the previous file is optional; without it every file is hashed
	  print STDERR "no previous file\n";
	}
	elsif ( open( my $prev, '<', $prev_file ) ) {
	  while ( my $line = <$prev> ) {
	    next if $. == 1; #skip header
	    chomp $line;
	    my ( $url, $size, $md5, $mod ) = split /\t/, $line;

	    #ignore truncated or hand-edited rows instead of trusting them
	    next unless defined $mod && length $md5;
	    next unless $size =~ /\A\d+\z/ && $mod =~ /\A\d+\z/;

	    $ent{ $url } = [ $size, $md5, $mod ];
	  }
	  close $prev;
	}
	else {
	  #an unreadable cache only costs time, so carry on without it
	  warn "Cannot read $prev_file: $!\n";
	}

	#collect regular files below the base path.  symlinks are tested with
	#lstat and left out, matching what `find -type f` reports.
	my @paths;
	File::Find::find(
	  {
	    no_chdir => 1,
	    wanted   => sub { push @paths, $_ if ! -l $_ && -f _ },
	  },
	  $path_base
	);

	print qq(TsvHttpData-1.0\n);

	#iterate over files currently in path to sync.
	#reuse cached data if it exists and isn't stale.
	#sorted so that repeated runs list files in the same order.
	foreach my $path ( sort @paths ) {
	  #a tab or line break in the name would corrupt the tab-delimited output
	  if ( $path =~ /[\t\r\n]/ ) {
	    warn "Skipping file with tab or line break in name: $path\n";
	    next;
	  }

	  #size in bytes and modification time from one stat call.
	  #stat reports 0 for empty files, where -s would give an empty string.
	  my ( $s, $m ) = ( stat $path )[ 7, 9 ];
	  if ( ! defined $m ) {
	    warn "Cannot stat $path: $!\n";
	    next;
	  }

	  #path relative to the base; \Q..\E keeps regex metacharacters literal
	  my $o = $path;
	  $o =~ s/\A\Q$path_base\E//;

	  #TODO make escaping more robust.
	  #only the file-derived part is escaped; the base is already a URL
	  $o =~ s/%/%25/g;
	  $o = $http_base . $o;

	  #base64 encoded md5 checksum, reused only if size and mtime both match
	  my $cached = $ent{ $o };
	  my $h;
	  if ( $cached && $cached->[0] == $s && $cached->[2] == $m ) {
	    $h = $cached->[1];
	  }
	  else {
	    $h = md5_base64( $path );
	    next unless defined $h; #unreadable file, already reported
	  }

	  print "$o\t$s\t$h\t$m\n";
	}

	#Return the padded Base64 encoding of the MD5 digest of a file's contents.
	#Warns and returns nothing when the file cannot be opened or read.
	sub md5_base64 {
	  my ( $file ) = @_;

	  my $fh;
	  if ( ! open( $fh, '<:raw', $file ) ) {
	    warn "Cannot read $file: $!\n";
	    return;
	  }

	  #addfile croaks on a read error; report it and let the caller skip
	  my $digest = eval { Digest::MD5->new->addfile( $fh )->digest };
	  if ( ! defined $digest ) {
	    warn "Cannot checksum $file: $@";
	    return;
	  }
	  close $fh;

	  #the empty second argument suppresses the trailing newline
	  return MIME::Base64::encode_base64( $digest, '' );
	}