Created
April 12, 2018 17:56
-
-
Save dginev/4350548637f11648d5b930730faa39a1 to your computer and use it in GitHub Desktop.
update_arxiv.pl
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl
# Incrementally mirror the arXiv source archives from the requester-pays S3
# bucket s3://arxiv/src/ into the current directory: list the remote .tar
# files, skip those already present locally, and fetch the rest.
#
# expected: s3cmd version 1.6.1 , properly configured
# Run this script from the directory with your existing arXiv download, e.g. /data/arXiv/
use strict;
use warnings;

my $bucket_prefix = 's3://arxiv/src/';

# Note: you need s3cmd properly setup on your machine first!
# List-form pipe open runs s3cmd without a shell and lets us detect a missing
# or failing s3cmd instead of silently continuing with an empty listing.
open(my $listing, '-|', 's3cmd', 'ls', '--requester-pays', $bucket_prefix)
  or die "Cannot run 's3cmd ls': $!\n";
my @available_tars;
while (my $line = <$listing>) {
    chomp $line;
    # Keep only the trailing s3:// URL of lines that name a .tar archive;
    # pushing only on a successful match avoids ever reading a stale $1.
    push @available_tars, $1 if $line =~ /(s3:.+\.tar)$/;
}
# close() on a pipe returns false when the child exited non-zero ($! is 0 then).
close($listing)
  or die($! ? "Error closing 's3cmd ls' pipe: $!\n"
            : "'s3cmd ls' exited with status " . ($? >> 8) . "\n");

# Obtain already downloaded URLs:
opendir(my $cdir, '.') or die "Cannot read current directory: $!\n";
my %downloaded_tars = map { ($bucket_prefix . $_ => 1) } grep { /\.tar$/ } readdir($cdir);
closedir $cdir;

my @new_tars = grep { !$downloaded_tars{$_} } @available_tars;

my @failed_tars;
foreach my $new_tar (sort @new_tars) {
    print "Fetching: $new_tar\n";
    # List-form system() bypasses the shell, so the remote name is never
    # subject to shell interpretation. Unlike backticks, s3cmd's own output
    # is now shown on the terminal rather than captured and discarded.
    my $status = system('s3cmd', 'get', '--requester-pays', $new_tar);
    next if $status == 0;

    warn $status == -1
      ? "Could not start s3cmd for $new_tar: $!\n"
      : "s3cmd get failed for $new_tar (exit status " . ($status >> 8) . ")\n";
    push @failed_tars, $new_tar;

    # A partially written .tar would be mistaken for a completed download on
    # the next run (presence in the directory is the only check), so remove
    # it. It cannot be a pre-existing archive: those were filtered out above.
    my ($local_name) = $new_tar =~ m{([^/]+)\z};
    if (defined $local_name && -e $local_name) {
        unlink $local_name
          or warn "Could not remove partial download $local_name: $!\n";
    }
}

# Signal incomplete updates to callers (cron jobs, wrapper scripts).
exit 1 if @failed_tars;
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment