This Perl script fetches image and video files from the given URL.
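For reference, a minimal example invocation might look like this (the URL and directory name are hypothetical placeholders, not values from the script):

    perl fetch_all_files.pl -url http://example.com/pics/ -save fetched -interval 5
    perl fetch_all_files.pl -url http://example.com/pics/ -test

The -test flag only lists the files that would be fetched; nothing is created or downloaded.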
use strict;
use warnings;
use autodie;
use WWW::Mechanize;
use Getopt::Long;
use URI;
use Math::Random qw/random_exponential/;

# defaults for the command-line options
my $url      = '';
my $interval = 5;
my $save     = 'fetched';
my $test     = '';
# parse arguments
GetOptions(
    "url=s"      => \$url,
    "interval=i" => \$interval,
    "save=s"     => \$save,
    "test"       => \$test,
) or usage();

# check if the given URL is valid
unless ( ( URI->new( $url, "http" ) )->scheme() ) {
    print STDERR "ERROR! Invalid URL: ", $url, "\n\n";
    usage();
}
# let's get started
$url .= '/' unless $url =~ m{/$};
print "Given URL: ", $url, "\n";
print "Interval : ", $interval, "\n";
print "Save Dir : ", $save, "\n";

mkdir $save unless $test or -d $save;

my $mech = WWW::Mechanize->new;
$mech->show_progress(1);
$mech->agent_alias('Windows Mozilla');
$mech->get($url);
my @links = $mech->links;

my @dir_queue  = ();    # sub-directories still to crawl
my @file_queue = ();    # media files found on the current page
while (1) {

    # classify the links on the current page
    for my $x (@links) {
        if ( defined $x->url ) {

            # relative links ending with '/' are sub-directories to crawl
            push @dir_queue, $x->base . $x->url
                if $x->url !~ /^\//
                && $x->url !~ /^http/
                && $x->url =~ /\/$/;

            # links ending with a known media extension are download targets
            push @file_queue, $x->base . $x->url
                if $x->url
                =~ /\.(avi|bmp|gif|jpg|jpeg|mov|mp4|mpg|mpeg|png|zip)$/i;
        }
    }
    # one exponentially distributed delay (mean: $interval seconds) per file;
    # declare first, since "my @x = ... unless ..." has undefined behavior
    my @delays;
    @delays = Math::Random::random_exponential( scalar @file_queue, $interval )
        unless $test;
    # download every file found on this page
    for my $target (@file_queue) {
        my $save_to = $save . '/' . substr( $target, length($url) );
        print "Downloading: $target => $save_to\n";
        unless ($test) {
            $mech->get($target);
            $mech->save_content($save_to);
            my $sleep_time = int( shift @delays );
            if ( $sleep_time > 0 ) {
                print "Sleeping: ${sleep_time}s\n";
                sleep($sleep_time);
            }
        }
    }
    # done when no directories are left to visit
    last unless @dir_queue;

    my $next = shift @dir_queue;
    unless ($test) {
        my $local_dir = $save . '/' . substr( $next, length($url) );
        mkdir $local_dir unless -d $local_dir;
    }
    print "The next directory: ", $next, "\n";
    $mech->get($next);
    @links      = $mech->links;
    @file_queue = ();
}
sub usage {
    print <<HELLO;
Usage: fetch_all_files.pl -url [URL] -interval [Interval] -save [Save Directory] -test
    -url      : the target URL (REQUIRED)
    -interval : the average sleep interval in seconds (default: 5)
    -save     : the directory for storing downloaded files (default: fetched)
    -test     : do not download anything; just print the target files
HELLO
    exit;
}

__END__