-
-
Save korjavin/fc16f7b6d79e903fa5e0 to your computer and use it in GitHub Desktop.
A simple non-blocking web crawler/scraper implemented with the Mojolicious framework (Mojo::UserAgent + Mojo::IOLoop).
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl
use 5.010;
use open qw(:locale);
use strict;
use utf8;
use warnings qw(all);

use Mojo::UserAgent;
use Data::Dumper;

# FIFO queue of URLs still to be crawled, seeded with the start page.
my @urls = map { Mojo::URL->new($_) } qw(
    http://spravker.ru/
);

# Limit parallel connections to 4.
my $max_conn = 4;

# User agent following up to 5 redirects; honours proxy environment vars.
my $ua = Mojo::UserAgent->new(max_redirects => 5);
$ua->proxy->detect;

# Number of requests currently in flight.
my $active = 0;

# A zero-interval recurring timer acts as the scheduler: on every tick
# it tops the pool back up to $max_conn concurrent requests.
Mojo::IOLoop->recurring(
    0 => sub {
        for ($active + 1 .. $max_conn) {
            # Dequeue the next URL, or halt the loop once the queue is
            # empty AND no crawler is still active ($active is false).
            return ($active or Mojo::IOLoop->stop)
                unless my $url = shift @urls;

            # Fetch non-blocking just by adding a callback,
            # and mark this slot as active.
            ++$active;
            $ua->get($url => \&get_callback);
        }
    }
);

# Start the event loop if it is not already running.
Mojo::IOLoop->start unless Mojo::IOLoop->is_running;
# Completion callback for every non-blocking GET.
# Frees the crawler slot, then hands successful text/html responses
# to parse_html(); everything else is silently dropped.
sub get_callback {
    my (undef, $tx) = @_;

    # This crawler slot is free again.
    --$active;

    # Parse only 2xx HTML responses. content_type can be undef
    # (e.g. connection failures), so default it to the empty string
    # to avoid an uninitialized-value warning in the match.
    return
        if not $tx->res->is_status_class(200)
        or ($tx->res->headers->content_type // q{}) !~ m{^text/html\b}ix;

    # The (post-redirect) request URL.
    my $url = $tx->req->url;
    print "$url \t";

    parse_html($url, $tx);

    return;
}
# Extract data and follow-up links from a fetched HTML page.
# Prints the page title, then either dumps the first region marker
# (and stops) or enqueues same-site root links for further crawling.
sub parse_html {
    my ($url, $tx) = @_;

    # Guard against pages without a <title>; the unguarded
    # ->at(...)->text would die on undef.
    my $title = $tx->res->dom->at('html title');
    say $title ? $title->text : q{};

    # NOTE(review): this returns after the FIRST matching cufon element,
    # so link extraction below is skipped entirely on pages that have
    # one — presumably intentional (region pages are leaves); confirm.
    for my $e ($tx->res->dom('#region-name > span.region-change > span > cufon > cufontext')->each) {
        print "$e\n";
        return;
    }

    # Extract and enqueue URLs.
    for my $e ($tx->res->dom('a[href]')->each) {
        # Validate the href attribute parses as a URL object.
        my $link = Mojo::URL->new($e->{href});
        next if 'Mojo::URL' ne ref $link;

        # "Normalize": make absolute against the page URL, drop fragment.
        $link = $link->to_abs($tx->req->url)->fragment(undef);
        next unless grep { $link->protocol eq $_ } qw(http https);

        # Only follow host roots (no path components at all).
        # NOTE(review): the original comment said "don't go deeper than
        # /a/b/c" (which would be "> 3"), but the code uses "> 0" —
        # keeping subdomain roots only. Preserved as-is; confirm intent.
        next if @{$link->path->parts} > 0;

        # Access every link only once; the current page also counts.
        state $uniq = {};
        ++$uniq->{$url->to_string};
        next if ++$uniq->{$link->to_string} > 1;

        # Don't visit other hosts. Anchored with escaped dots so hosts
        # like "wwwXspravkerXru.evil.com" can't slip through (the
        # original pattern was unanchored with unescaped dots).
        next unless $link->host =~ m{\A (\w+) \. spravker \. ru \z}x;

        # Canonical trailing slash for host-root URLs.
        $link .= '/';
        push @urls, $link;
    }

    return;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment