EventSource Mojolicious crawler
package WebCrawl::Crawl;
use Mojo::Base 'Mojolicious::Controller';

use Mojo::IOLoop;
use Mojo::JSON qw(encode_json);
use Mojo::URL;
use Mojo::UserAgent;
use WWW::RobotRules;
sub crawl {
    my $self = shift;

    # A seed URL is required
    my $seed_url = $self->param('url');
    return $self->render(text => 'Missing "url" parameter', status => 400)
        unless $seed_url;

    # Increase inactivity timeout for the connection a bit
    Mojo::IOLoop->stream($self->tx->connection)->timeout(15);

    # Change content type for EventSource streaming
    $self->res->headers->content_type('text/event-stream');

    # FIFO queue
    my @urls = (Mojo::URL->new($seed_url));

    # User agent following up to 5 redirects
    my $ua = Mojo::UserAgent->new(max_redirects => 5);

    # robots.txt rules for the seed host, consulted by the
    # crawler loop below (blocking fetch, once per crawl)
    my $rules      = WWW::RobotRules->new($ua->transactor->name);
    my $robots_url = Mojo::URL->new($seed_url)->path('/robots.txt');
    $rules->parse("$robots_url", $ua->get($robots_url)->res->text);

    # Track accessed URLs; the seed counts as already visited
    my %uniq;
    $uniq{$seed_url} = 1;

    my $active    = 0;
    my $url_count = 0;
    # Anonymous subs so both callbacks share the enclosing lexicals
    # ($self, @urls, %uniq, $active); nested named subs would not
    # stay in sync with them ("variable will not stay shared")
    my $parse = sub {
        my ($tx) = @_;

        # Request URL
        my $url = $tx->req->url;

        # Not every page has a <title>
        my $title_el = $tx->res->dom->at('html title');
        my $title    = $title_el ? $title_el->text : '';

        my $data = encode_json({
            url   => $url->to_string,
            title => $title,
        });
        $self->write("event:url\ndata:$data\n\n");

        # Extract and enqueue URLs
        for my $e ($tx->res->dom('a[href]')->each) {

            # Validate href attribute
            my $link = Mojo::URL->new($e->{href});
            next if 'Mojo::URL' ne ref $link;

            # "Normalize" link
            $link = $link->to_abs($tx->req->url)->fragment(undef);
            next unless $link->protocol =~ /^https?$/x;

            # Access every link only once
            next if ++$uniq{$link->to_string} > 1;

            # Don't visit other hosts (including subdomains)
            next if $link->host ne $url->host;

            push @urls, $link;
        }
        return;
    };
    my $get_callback = sub {
        my (undef, $tx) = @_;

        # This crawler slot is free again, whether the request
        # succeeded or timed out
        --$active;

        # Report requests that timed out
        if (!$tx->res->code) {
            my $data = encode_json({
                url    => $tx->req->url->to_string,
                title  => 'Error: timeout',
                status => 500,
            });
            $self->write("event:url\ndata:$data\n\n");
            return;
        }

        # Parse only OK HTML responses
        $parse->($tx)
            if $tx->res->code == 200
            && ($tx->res->headers->content_type // '') =~ m{^text/html\b}i;

        return;
    };
    my $id;
    $id = Mojo::IOLoop->recurring(
        0 => sub {

            # Keep up to 4 parallel crawlers sharing the same user agent
            for ($active .. 3) {

                # Dequeue the next URL, enforcing the 30-URL limit
                my $url = $url_count < 30 ? shift @urls : undef;

                # Once nothing is left to fetch and no crawlers remain
                # active, stop the timer and close the event stream
                # (stopping the shared event loop would halt the server)
                unless ($url) {
                    return if $active;
                    Mojo::IOLoop->remove($id);
                    return $self->finish;
                }

                # Skip URLs disallowed by robots.txt without
                # occupying a crawler slot
                next unless $rules->allowed($url->to_string);

                # Fetch non-blocking just by adding a callback
                # and marking the crawler as active
                ++$active;
                ++$url_count;
                $ua->get($url => $get_callback);
            }
        }
    );

    # Stop crawling if the client goes away
    $self->on(finish => sub { Mojo::IOLoop->remove($id) });
    # Start the event loop if necessary
    Mojo::IOLoop->start unless Mojo::IOLoop->is_running;
}

1;
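
A minimal sketch of an application class that wires the controller up; the WebCrawl package name and the /crawl route are assumptions, not something the gist specifies:

package WebCrawl;
use Mojo::Base 'Mojolicious';

sub startup {
    my $self = shift;

    # The controller package is WebCrawl::Crawl rather than the
    # default WebCrawl::Controller::Crawl, so adjust the namespace
    $self->routes->namespaces(['WebCrawl']);

    # GET /crawl?url=http://example.com streams crawl results
    # as text/event-stream
    $self->routes->get('/crawl')->to('crawl#crawl');
}

1;

A browser can then subscribe with new EventSource('/crawl?url=http://example.com') and handle the url events as they arrive.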
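
For quick testing, the stream can also be consumed from Perl with the streaming-response pattern from the Mojo::UserAgent documentation; the host, port, and seed URL here are assumptions:

use Mojo::Base -strict;
use Mojo::UserAgent;

my $ua = Mojo::UserAgent->new;
my $tx = $ua->build_tx(
    GET => 'http://localhost:3000/crawl?url=http://example.com');

# Replace the default "read" handler so event-stream chunks are
# printed as they arrive instead of being buffered into the body
$tx->res->content->unsubscribe('read')->on(read => sub {
    my ($content, $bytes) = @_;
    print $bytes;    # raw "event:url\ndata:{...}\n\n" frames
});

$ua->start($tx);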