Created
November 27, 2012 17:46
-
-
Save robhammond/4155823 to your computer and use it in GitHub Desktop.
Basic Mojo UserAgent parallel crawler
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl | |
use Modern::Perl; | |
use Mojo::UserAgent; | |
use Mojo::IOLoop; | |
use Mojo::URL; | |
# FIFO queue | |
my $seed_url = 'http://www.google.co.uk/'; | |
my @urls = ($seed_url); | |
# User agent following up to 5 redirects | |
my $ua = Mojo::UserAgent->new(max_redirects => 5); | |
my %seen = ($seed_url => 1); | |
my $active = 0; | |
Mojo::IOLoop->recurring(0 => sub { | |
# Keep up to 4 parallel crawlers sharing the same user agent | |
for ($active .. 4 - 1) { | |
# Dequeue or halt if there are no active crawlers anymore | |
return ($active or Mojo::IOLoop->stop) unless my $url = shift @urls; | |
# Fetch non-blocking just by adding a callback and marking as active | |
++$active; | |
$ua->get($url => sub { | |
my ($ua, $tx) = @_; | |
# Extract and enqueue URLs | |
say $url; | |
for my $e ($tx->res->dom('a[href]')->each) { | |
my $link = Mojo::URL->new($e->{href})->to_abs($tx->req->url); | |
$link = clean_url($link); | |
next unless is_valid_url($link); | |
next if ($seen{$link}); | |
push @urls, $link; | |
$seen{$link} = 1; | |
say " -> $urls[-1]"; | |
} | |
my $title = $tx->res->dom->at('title')->text; | |
# Deactivate | |
--$active; | |
}); | |
} | |
}); | |
# Start event loop if necessary | |
Mojo::IOLoop->start unless Mojo::IOLoop->is_running; | |
sub is_valid_url { | |
my $url = shift; | |
if ($url =~ m{\.(?:css|js|png|pdf|jpe?g|docx?|xlsx?|pptx?|gif)$}i) { | |
return undef; | |
} | |
# don't crawl external sites | |
unless ($url =~ m{^$seed_url}i) { | |
return undef; | |
} | |
return 1; | |
} | |
sub clean_url { | |
my $link = shift; | |
# remove url fragments | |
$link =~ s!#.*$!!; | |
# remove session id params | |
$link =~ s![?&]j?session(?:id)?=[^&]+!!gi; | |
# remove google analytics tracking | |
$link =~ s![?&]utm_(?:medium|source|campaign|content)=[^&]+!!gi; | |
return $link; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment