Non-blocking HTTP status checker script using Mojolicious. Reads URLs from the text file 'urls.txt' (one per line) and writes a tab-separated report to 'http-status.tsv'.
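A minimal sketch of the expected input file, urls.txt, with one absolute URL per line (the entries below are placeholders):

http://example.com/
http://example.com/old-page
https://example.org/page?id=1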
#!/usr/bin/env perl
use strict;
use utf8;
use warnings qw(all);
use Modern::Perl;
use File::Slurp;
use Mojo::Asset::File;
use Mojo::IOLoop;
use Mojo::Log;
use Mojo::UserAgent;
my $log = Mojo::Log->new;

# Change the user agent string to identify your crawler; including a contact
# email helps webmasters reach you
my $user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36';

my $input_urls = 'urls.txt';
my @urls = read_file($input_urls);

my $file = Mojo::Asset::File->new;
$file->add_chunk("Request URL\tResult URL\tStatus\tServer\tContent Type\tContent Length\tRedirects\n");
# Limit parallel connections to 5
my $max_conn = 5;

# User agent following up to 25 redirects
my $ua = Mojo::UserAgent->new(max_redirects => 25);
$ua->transactor->name($user_agent);    # identify ourselves with the configured user agent string
$ua->proxy->detect;
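# proxy->detect honours the HTTP_PROXY / HTTPS_PROXY / NO_PROXY environment variables.

# Everything below runs inside the event loop: a recurring timer with a
# zero-second interval fires on every loop tick, topping the pool back up
# to $max_conn in-flight requests until the queue is drained.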
# Keep track of active connections
my $active = 0;

Mojo::IOLoop->recurring(
    0 => sub {
        for ($active + 1 .. $max_conn) {

            # Dequeue or halt if there are no active crawlers anymore
            return ($active or Mojo::IOLoop->stop)
                unless my $url = shift @urls;
            chomp($url);

            # Fetch non-blocking just by adding
            # a callback and marking as active
            ++$active;
            $ua->get($url => sub {
                my (undef, $tx) = @_;

                # Deactivate
                --$active;

                # Connection-level failure: no HTTP status code was received
                if (!$tx->res->code) {
                    my $err = $tx->error ? $tx->error->{message} : 'unknown error';
                    $log->info("error $err");
                    my $csv = join "\t", $tx->req->url, $tx->req->url, $err, '', '', '', '';
                    $file->add_chunk("$csv\n");
                    return;
                }

                # Log error statuses; the row itself is written once by the
                # common path below, which also records any redirect chain
                $log->info($tx->res->code . ' ' . $tx->req->url)
                    if $tx->res->code =~ m{^[45]};
                # Reconstruct the redirect chain; the first hop holds the
                # originally requested URL
                my @redirects;
                my $orig_url = $tx->req->url;
                if (@{$tx->redirects}) {
                    $orig_url = $tx->redirects->[0]->req->url;
                    push @redirects, $_->req->url . ' (' . $_->res->code . ')'
                        for @{$tx->redirects};
                }
                my $redirects_csv = join ', ', @redirects;
                my $dhash = {
                    url            => $tx->req->url,
                    status         => $tx->res->code,
                    redirects      => \@redirects,
                    server         => $tx->res->headers->server         // '',
                    content_type   => $tx->res->headers->content_type   // '',
                    content_length => $tx->res->headers->content_length // '',
                    msg            => $tx->res->message,
                };
                my $csv = join "\t", $orig_url, $dhash->{url}, $dhash->{status},
                    $dhash->{server}, $dhash->{content_type}, $dhash->{content_length};
                say $csv;
                $file->add_chunk("$csv\t$redirects_csv\n");
                return;
            });
        }
    }
);
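# The recurring callback calls Mojo::IOLoop->stop only once the URL queue is
# empty and the last active request has finished, so control falls through to
# the report below.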
# Start event loop if necessary
Mojo::IOLoop->start unless Mojo::IOLoop->is_running;

$file->move_to('http-status.tsv');
say $file->slurp;

__DATA__
Forked from:
http://blogs.perl.org/users/stas/2013/01/web-scraping-with-modern-perl-part-1.html
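Usage sketch (the filename is not fixed by the script; 'check-status.pl' here is just an example):

perl check-status.pl

Each result line is echoed to the terminal as it completes, and the full tab-separated report ends up in http-status.tsv.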