Last active
December 21, 2015 05:08
-
-
Save AwwCookies/6254402 to your computer and use it in GitHub Desktop.
Download all files on a website perl script
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl -w
use strict;
use Getopt::Std;
use HTML::Entities;
use IO::Uncompress::Gunzip qw(gunzip $GunzipError);
# ---------------------------------------------------------------------------
# State shared with the subroutines defined further down in this file.
# ---------------------------------------------------------------------------
my @download_files = ();    # one hash per matching link ('file', 'name', 'url'); filled by read_files
my $error_log      = '';    # newline-separated error messages; appended to by print_error
my %opts;                   # command-line switches parsed by getopts
my @extensions;             # file extensions requested with -f

# Forward declarations: the subs are defined after the main flow, so their
# prototypes must be known before the calls below are compiled.
sub attempt_download($$$);
sub read_files($$);
sub print_error($);
sub add_slash($);

getopts("vhf:", \%opts);

if ($opts{'h'}) {
    print <<END;
Usage: $0 -f extensions site1 [site2] [site3] ...
-h Print this message
-f File extensions to search for (comma separated list)
-v Causes wget output to be displayed
-r Read from a file /* not added yet */
END
    exit(0);
}

unless (defined $opts{'f'}) {
    print("At least one file extension is required (-f). See help (-h) for more information.\n");
    exit(1);
}

# The usage line lists the first site as mandatory; say so instead of
# silently running both stages over nothing.
unless (@ARGV) {
    print("At least one site is required. See help (-h) for more information.\n");
    exit(1);
}

@extensions = split(/\s*,\s*/, $opts{'f'});

# ---------------------------------------------------------------------------
# Stage 1: fetch every page given on the command line and collect its links.
# ---------------------------------------------------------------------------
print("STAGE 1: Finding files on the page.\n\n");
foreach my $site (@ARGV) {
    print("Reading files from \"$site\"...\n");

    my $page    = '';
    my $fetched = 0;

    # List-form pipe open runs wget without a shell, so characters such as
    # "&", ";" or spaces in the URL reach wget untouched instead of being
    # interpreted by the shell.
    if (open(my $wget, '-|', 'wget', '-q', '-t', '10', '-O', '-', '--', $site)) {
        local $/;               # slurp the whole page in one read
        $page = <$wget>;
        $page = '' unless defined $page;
        $fetched = close($wget);    # false when wget exited with a non-zero status
    }

    if ($fetched) {
        my $found_count = read_files($page, $site);
        print("Found ".$found_count." files.\n");
    }
    else {
        print("Retrieving $site failed. Skipping.\n");
    }
}

# ---------------------------------------------------------------------------
# Stage 2: download every collected file that is not already present.
# ---------------------------------------------------------------------------
print( "STAGE 2: Fetching files with the extensions provided.\n\n" );
foreach my $file (@download_files) {
    if (-e $file->{'file'}) {
        print($file->{'file'}." (".$file->{'name'}.") already exists. Skipping.\n");
        next;
    }
    print("Downloading ".$file->{'file'}." (".$file->{'name'}.").\n");
    attempt_download($file->{'url'}, $file->{'file'}, $file->{'name'});
}

# Final summary: replay everything print_error collected during the run.
if (length($error_log)) {
    print("Completed with errors.\n\nERRORS:\n");
    print($error_log."\n");
}
else {
    print("Completed with no errors.\n");
}
# Download one URL and store it in the current directory.
#
# Arguments:
#   $url       - address handed to wget
#   $file_name - local name the finished download is stored under
#   $link_name - link text of the anchor; only used in error messages
#
# The data is first fetched into a fixed temporary file ("download_temp").
# If it starts with the gzip magic bytes it is decompressed into $file_name,
# otherwise the temporary file is renamed to $file_name.  wget runs quietly
# unless the -v switch ($opts{'v'}) was given.  Failures are reported through
# print_error; nothing useful is returned.
sub attempt_download($$$) {
    my ($url, $file_name, $link_name) = @_;
    my $download_temp_name = "download_temp";

    # List-form system() bypasses the shell: URLs come from downloaded pages
    # and must never be interpreted as shell syntax.  "--" stops wget from
    # treating a URL that begins with "-" as an option.
    my @wget = ('wget', ($opts{'v'} ? () : '-q'), '-t', '10',
                '-O', $download_temp_name, '--', $url);
    system(@wget);

    # "wget -O" creates the output file even when the transfer fails, so the
    # exit status - not the mere existence of the file - decides success.
    if ($? != 0 or !-e $download_temp_name) {
        unlink($download_temp_name);
        print_error("Unable to get file ".$file_name." (".$link_name.")");
        return;
    }

    # Peek at the first two bytes to recognise gzip data (magic 0x1f 0x8b).
    my $magic = '';
    if (open(my $fh, '<', $download_temp_name)) {
        binmode($fh);
        read($fh, $magic, 2);
        close($fh);
    }
    $magic = '' unless defined $magic;

    if ($magic eq "\x1f\x8b") {
        # MultiStream makes gunzip read concatenated gzip members, as zcat does.
        unless (gunzip($download_temp_name => $file_name, MultiStream => 1)) {
            print_error("Unable to unzip ".$file_name." (".$link_name.")");
            # Remove the partial output so a later run does not skip the file
            # as "already exists".
            unlink($file_name);
        }
        unlink($download_temp_name);
    }
    else {
        unless (rename($download_temp_name, $file_name)) {
            print_error("Unable to save ".$file_name." (".$link_name."): ".$!);
            unlink($download_temp_name);
        }
    }
    return;
}
# Scan an HTML page for <a href="..."> links whose file extension is one of
# @extensions and queue them in @download_files.
#
# Arguments:
#   $page     - HTML source of the page
#   $base_url - address the page was fetched from; relative links are
#               resolved against it
#
# Returns the number of links queued by this call.  Every queued entry is a
# hash with the keys 'file' (local file name), 'name' (link text) and 'url'
# (absolute address to download).
sub read_files($$) {
    my ($page, $base_url) = @_;
    my $count = 0;

    # Scheme + host part of the base (e.g. "http://site.com"); root-relative
    # links hang off this, not off the page's own path.
    my ($site_root) = $base_url =~ m{\A((?:\w+://)?[^/]+)};
    $site_root = $base_url unless defined $site_root;

    # Scheme of the base, used for protocol-relative ("//host/...") links.
    my ($base_scheme) = $base_url =~ m{\A(\w+)://};
    $base_scheme = 'http' unless defined $base_scheme;

    while ($page =~ /<a [^>]*?href=\"([^\"]+)\".*?>(.*?)<\/a>/gi) {
        my ($url_found, $title_found) = ($1, $2);

        # Skip script pseudo-links
        next if $url_found =~ /^javascript:/;
        # Weed out anchors
        next if index($url_found, "#") != -1;

        if ($url_found !~ /^\w+:\/\//) {
            # No scheme: resolve the link against the page it came from.
            if ($url_found =~ m{\A//}) {
                # Protocol-relative: the host is given, only the scheme is inherited
                $url_found = $base_scheme.":".$url_found;
            }
            elsif ($url_found =~ m{\A/}) {
                # Root-relative: the path starts at the top of the site
                $url_found = $site_root.$url_found;
            }
            else {
                # Plain relative path: the base URL is treated as a directory
                $url_found = add_slash($base_url).$url_found;
            }
        }
        else {
            # A bare site address ("http://host/") is a page, not a file
            next if $url_found =~ /^(\w+):\/\/[^\/]+\/?$/;

            # Only follow schemes wget is expected to handle
            my ($protocol) = $url_found =~ /^(\w+):/;
            $protocol = lc($protocol);
            next unless ($protocol eq "http" or $protocol eq "https" or $protocol eq "ftp");
        }

        # Turn HTML entities in the address (e.g. &amp;) into real characters
        decode_entities($url_found);

        # The last path segment must look like "name.ext"
        next unless $url_found =~ /\/([\w\-_\%\.]+\.([\w\-_\%]+))$/;
        my ($file_name, $extension) = ($1, $2);

        # Keep only the extensions asked for with -f
        next unless grep { $_ eq $extension } @extensions;

        # Build the local file name: undo %XX escapes and entities
        $file_name =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg;
        decode_entities($file_name);

        # An escaped "/" (%2F), "\" or NUL must not survive into the local
        # name, otherwise a hostile page could make the download land outside
        # the current directory.
        $file_name =~ s{[/\\\0]}{_}g;
        next if $file_name eq '' or $file_name =~ /\A\.+\z/;

        $count++;
        push(@download_files, {'file'=>$file_name, 'name'=>$title_found, 'url'=>$url_found});
    }

    return($count);
}
# Report an error immediately and remember it for the end-of-run summary.
#
# Argument:
#   the error message, without a trailing newline
#
# The message is printed on its own line prefixed with "ERROR: " and appended
# to $error_log, with a newline separating it from earlier entries.
sub print_error($) {
    my ($message) = @_;

    # Terminate the line so consecutive messages do not run together
    print("ERROR: ".$message."\n");

    $error_log .= "\n" if length($error_log);
    $error_log .= $message;
    return;
}
# Return the argument with a "/" appended, unless it already ends in one.
# (The prototype is kept because the sub is forward-declared with it.)
sub add_slash($) {
    my ($text) = @_;
    return $text =~ m{/\z} ? $text : $text."/";
}
################################
# Original by Mike A. Leonetti #
# http://bit.ly/19yz7uz        #
################################
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Usage
Examples
Running the script with `-f rar http://thissite.com` would download all RAR files to the current directory from http://thissite.com.
Running the script with `-f zip,iso http://distrosite.com` would download both ZIP and ISO files to the current directory from http://distrosite.com.
How it Works
The script detects files linked from `<a>` tags. For example, a link such as `<a href="http://thissite.com/archive.rar">Archive</a>`
is found by the regex pattern and downloaded with wget. The script then checks whether the downloaded file is gzip-compressed and decompresses it if necessary. Because only `<a>` tags are scanned, the script will not detect image tags or download images. It also attempts to clean up improper characters that are sometimes found in URLs and makes sure the URLs are well formed.
Original
by Mike A. Leonetti http://bit.ly/19yz7uz