Skip to content

Instantly share code, notes, and snippets.

@AwwCookies
Last active December 21, 2015 05:08
Show Gist options
  • Save AwwCookies/6254402 to your computer and use it in GitHub Desktop.
Save AwwCookies/6254402 to your computer and use it in GitHub Desktop.
Perl script that downloads all files with given extensions linked from a website
#!/usr/bin/perl -w
# reap.pl — scan web pages for <a> links with chosen file extensions and
# download each matching file into the current directory via wget.
use strict;
use Getopt::Std;
use HTML::Entities;
# Queue of files to fetch; each entry is a hashref with keys
# 'file' (local filename), 'name' (link text), 'url' (absolute URL).
my @download_files = ();
# Accumulated newline-separated error messages, printed at exit.
my $error_log = '';
# Command-line switches parsed by getopts below.
my %opts;
# Extensions requested via -f (comma-separated), split in the driver code.
my @extensions;
# Forward declarations so the driver can call the subs defined below it.
sub attempt_download($$$);
sub read_files($$);
sub print_error($);
sub add_slash($);
# -v verbose wget, -h help, -f <ext,ext,...> required extension list.
getopts("vhf:", \%opts);
# Print usage and exit when -h is given.
if($opts{'h'}) {
    print <<END;
Usage: $0 -f extensions site1 [site2] [site3] ...
-h Print this message
-f File extensions to search for (comma separated list)
-v Causes wget output to be displayed
-r Read from a file /* not added yet */
END
    exit(0);
}
# -f is mandatory: without an extension list there is nothing to match.
unless(defined $opts{'f'}) {
    print("At least one file extension is required (-f). See help (-h) for more information.\n");
    exit(1);
}
@extensions = split(/\s*,\s*/, $opts{'f'});

# STAGE 1: fetch each page named on the command line and collect links
# whose extensions match the -f list.
print("STAGE 1: Finding files on the page.\n\n");
foreach my $site (@ARGV) {
    print("Reading files from \"$site\"...\n");
    # \Q...\E backslash-quotes shell metacharacters in the user-supplied
    # URL so it cannot inject extra shell commands into the wget call.
    my $page = qx/wget -q -t 10 -O - \Q$site\E/;
    if($?) {
        # Original message lacked a trailing newline; fixed.
        print("Retrieving $site failed. Skipping.\n");
    } else {
        my $found_count = read_files($page, $site);
        print("Found ".$found_count." files.\n");
    }
}

# STAGE 2: download every collected file, skipping ones already on disk.
print( "STAGE 2: Fetching files with the extensions provided.\n\n" );
foreach my $file (@download_files) {
    if(-e $file->{'file'}) {
        print($file->{'file'}." (".$file->{'name'}.") already exists. Skipping.\n");
        next;
    }
    print("Downloading ".$file->{'file'}." (".$file->{'name'}.").\n");
    attempt_download($file->{'url'}, $file->{'file'}, $file->{'name'});
}

# Final report: dump any errors accumulated by print_error().
if(length($error_log)) {
    print("Completed with errors.\n\nERRORS:\n");
    print($error_log."\n");
} else {
    print("Completed with no errors.\n");
}
# attempt_download($url, $file_name, $link_name)
# Fetch $url with wget into a temp file, then install it as $file_name
# ($link_name is the human-readable link text used in messages).
# Gzip-compressed payloads are decompressed into place; otherwise the temp
# file is renamed. Failures are reported through print_error(); no return
# value is meaningful to callers.
sub attempt_download($$$){
    my $url = shift;
    my $file_name = shift;
    my $link_name = shift;
    my $download_temp_name = "download_temp";
    # List-form system() passes the URL as a literal argument, so shell
    # metacharacters in untrusted URLs cannot inject commands (the original
    # interpolated $url into a single shell string).
    my @wget_cmd = ('wget');
    push(@wget_cmd, '-q') unless $opts{'v'};
    push(@wget_cmd, '-t', '10', '-O', $download_temp_name, $url);
    system(@wget_cmd);
    # wget -O creates the output file even when the transfer fails, so the
    # original "-e" test could mistake a failed download for a success;
    # check the exit status too, and remove any partial temp file.
    if( $? or not -e $download_temp_name ) {
        unlink($download_temp_name) if -e $download_temp_name;
        print_error("Unable to get file ".$file_name." (".$link_name.")");
        return();
    }
    if( `file $download_temp_name` =~ /gzip/ ) {
        # Some servers hand back gzip-compressed bodies; decompress them.
        system( "zcat ".$download_temp_name." > ".$file_name );
        if($?) {
            print_error("Unable to unzip ".$file_name." (".$link_name.")");
        }
        unlink($download_temp_name);
    } else {
        # Report a failed rename instead of silently losing the download.
        rename($download_temp_name, $file_name)
            or print_error("Unable to save ".$file_name." (".$link_name.")");
    }
}
# read_files($page, $base_url)
# Scan the HTML in $page for <a href="..."> links, resolve each URL
# against $base_url, and push entries for links whose file extension is in
# the global @extensions onto the global @download_files queue.
# Returns the number of files queued.
sub read_files($$) {
my $page = shift;
my $base_url = shift;
my $count = 0;
# Iterate over every anchor tag; $1 = href value, $2 = link text.
while($page=~/<a [^>]*?href=\"([^\"]+)\".*?>(.*?)<\/a>/gi) {
#print( "File $1, name $2, pos ".pos($page)."\n" );
# Copy the captures before later regexes clobber $1/$2.
my( $url_found, $title_found ) = ( $1, $2 );
# Skip javascript: pseudo-links.
next if( $url_found=~/^javascript:/ );
# Skip any link containing a fragment anchor.
next if( index( $url_found, "#" )!=-1 );
# No "scheme://" prefix means the link is relative to this page.
if($url_found!~/^\w+:\/\//) {
# Drop a leading slash so it can be appended to the base URL below.
# NOTE(review): this resolves root-relative paths ("/foo/bar") against
# the page URL rather than the site root, which looks wrong for pages
# below the root — confirm against real sites before relying on it.
$url_found = substr( $url_found, 1 ) if( substr( $url_found, 0, 1 ) eq "/" );
# Join base and path, guaranteeing exactly one slash between them.
$url_found = add_slash( $base_url ).$url_found;
} else {
# Absolute URL with no path component is a site front page, not a file.
next if( $url_found=~/^(\w+):\/\/[^\/]+\/?$/ );
$url_found =~ /^(\w+):/;
my $protocol = lc( $1 );
# Only schemes wget can fetch here are accepted.
next unless($protocol eq "http" or $protocol eq "https" or $protocol eq "ftp");
}
# Decode HTML entities (&amp; etc.) embedded in the href.
decode_entities($url_found);
# Extract "name.ext" from the last path segment; skip if there is none.
next unless($url_found =~ /\/([\w\-_\%\.]+\.([\w\-_\%]+))$/);
# Filename is contained in $1
my $file_name = $1;
my $extension = $2;
# Keep only extensions the user asked for via -f.
next unless(grep $_ eq $extension, @extensions);
# Turn %XX percent-escapes into their literal bytes for the local name.
$file_name =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg;
decode_entities($file_name);
$count++;
push(@download_files, {'file'=>$file_name, 'name'=>$title_found, 'url'=>$url_found});
}
return($count);
}
# print_error($message)
# Echo an error to stdout and append it to the global $error_log, with a
# newline separating successive entries.
sub print_error($) {
    my $message = $_[0];
    print "ERROR: $message";
    # Separate entries, but do not start the log with a blank line.
    $error_log .= "\n" unless $error_log eq '';
    $error_log .= $message;
}
# add_slash($url)
# Return $url guaranteed to end in "/", appending one only when missing.
sub add_slash($) {
    my $url = shift;
    return $url =~ m{/$} ? $url : $url . "/";
}
################################
# Original by Mike A. Leonetti #
# http://bit.ly/19yz7uz #
################################
@AwwCookies
Copy link
Author

Usage

Usage: ./reap.pl -f extensions site1 [site2] [site3] ...
  -h    Print this message
  -f    File extensions to search for (comma separated list)
  -v    Causes wget output to be displayed

Examples

./reap.pl -f rar http://thissite.com

Would download all RAR files to the current directory from http://thissite.com.

./reap.pl -f zip,iso http://distrosite.com

Would download both ZIP and ISO files to the current directory from http://distrosite.com.

How it Works

The script detects files linked in `<a>` (anchor) tags. For example:

<a href="filename.rar">RAR file</a>

Gets found with the regex pattern and downloaded with wget. Then it checks if the file needs to be gunzipped and unzips it if necessary. This also means the script will not detect image tags and download images. It attempts to clean up improper characters that are sometimes found on sites as well as makes sure the URLs are proper.

Original

by Mike A. Leonetti http://bit.ly/19yz7uz

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment