creaktive · November 28, 2020 13:15 · gdog2u · May 3, 2017 · ghost · Jul 27, 2017
diff --git a/mojo-crawler.pl b/mojo-crawler.pl
 #!/usr/bin/env perl
 use 5.010;
 use open qw(:locale);
 use strict;
 use utf8;
 use warnings qw(all);

 use Mojo::UserAgent;

 # Work queue, consumed front-to-back (FIFO); seeded with the start pages
 my @urls;
 push @urls, Mojo::URL->new($_) for qw(
    http://sysd.org/page/1/
    http://sysd.org/page/2/
    http://sysd.org/page/3/
 );

 # Upper bound for simultaneously running requests
 my $max_conn = 4;

 # Shared user agent: follows at most 5 redirects and picks up
 # proxy settings via proxy->detect
 my $ua = Mojo::UserAgent->new;
 $ua->max_redirects(5);
 $ua->proxy->detect;

 # Number of requests currently in flight
 my $active = 0;

 # Scheduler: a recurring timer with interval 0 that tops the number of
 # in-flight requests up to $max_conn each time it fires
 Mojo::IOLoop->recurring(
    0 => sub {

        # Free slots, computed once per invocation
        my $free_slots = $max_conn - $active;

        for (1 .. $free_slots) {
            my $next = shift @urls;

            # Queue drained: keep waiting while requests are still in
            # flight, otherwise shut the event loop down
            if (!$next) {
                return $active || Mojo::IOLoop->stop;
            }

            # Supplying a callback makes the request non-blocking;
            # get_callback() gives the slot back when it runs
            ++$active;
            $ua->get($next => \&get_callback);
        }
    }
 );

 # Enter the event loop unless something else is already running it
 Mojo::IOLoop->is_running or Mojo::IOLoop->start;

 # Completion handler passed to $ua->get(); the second argument is the
 # finished transaction (the first one is ignored).
 #
 # Releases one connection slot, then prints the request URL and hands
 # the transaction to parse_html() -- but only for successful (2xx)
 # responses whose Content-Type starts with text/html. Anything else is
 # dropped silently.
 sub get_callback {
    my (undef, $tx) = @_;

    # Deactivate
    --$active;

    my $res = $tx->res;

    # Newer Mojolicious releases replaced is_status_class() with
    # is_success(); fall back to the old method on installations that
    # do not provide is_success() yet
    my $is_ok
        = $res->can('is_success')
        ? $res->is_success
        : $res->is_status_class(200);
    return unless $is_ok;

    # The Content-Type header may be absent altogether; default to an
    # empty string to avoid an "uninitialized value" warning
    my $type = $res->headers->content_type // '';
    return unless $type =~ m{^text/html\b}ix;

    # Request URL
    my $url = $tx->req->url;

    say $url;
    parse_html($url, $tx);

    return;
 }

 # Print the page title (if any), then enqueue every link on the page
 # that is http(s), at most three path segments deep, not seen before
 # and on the same host as the page itself.
 #
 # Parameters:
 #   $url - URL object of the request that produced the page; used as
 #          the base for relative links and as the reference host
 #   $tx  - finished transaction whose response is parsed as HTML
 #
 # Side effects: prints to STDOUT, pushes onto the file-level @urls
 # queue and records URLs in a hash that persists across calls.
 sub parse_html {
    my ($url, $tx) = @_;

    # URLs already crawled or queued; initialised once, kept between calls
    state $uniq = {};

    my $dom = $tx->res->dom;

    # at() returns undef when there is no matching element, so a page
    # without <title> must not be dereferenced blindly
    my $title = $dom->at('html title');
    say $title->text if $title;

    # Mark the current page as seen up front (loop-invariant), so
    # self-links are skipped and link-less pages are recorded too
    ++$uniq->{$url->to_string};

    # Extract and enqueue URLs
    for my $e ($dom->find('a[href]')->each) {

        # An href attribute without a value may come back undefined
        my $link = Mojo::URL->new($e->{href} // '');

        # "normalize" link: make it absolute, drop the fragment
        $link = $link->to_abs($url)->fragment(undef);
        next unless $link->protocol =~ m{\A https? \z}x;

        # Don't go deeper than /a/b/c
        next if @{ $link->path->parts } > 3;

        # Access every link only once
        next if ++$uniq->{$link->to_string} > 1;

        # Don't visit other hosts
        next if ($link->host // '') ne ($url->host // '');

        push @urls, $link;
        say " -> $link";
    }
    say '';

    return;
 }
 __DATA__
 Featured at:
 http://blogs.perl.org/users/stas/2013/01/web-scraping-with-modern-perl-part-1.html
	#!/usr/bin/env perl
	use 5.010;
	use open qw(:locale);
	use strict;
	use utf8;
	use warnings qw(all);

	use Mojo::UserAgent;

	# Work queue, consumed front-to-back (FIFO); seeded with the start pages
	my @urls;
	push @urls, Mojo::URL->new($_) for qw(
	    http://sysd.org/page/1/
	    http://sysd.org/page/2/
	    http://sysd.org/page/3/
	);

	# Upper bound for simultaneously running requests
	my $max_conn = 4;

	# Shared user agent: follows at most 5 redirects and picks up
	# proxy settings via proxy->detect
	my $ua = Mojo::UserAgent->new;
	$ua->max_redirects(5);
	$ua->proxy->detect;

	# Number of requests currently in flight
	my $active = 0;

	# Scheduler: a recurring timer with interval 0 that tops the number of
	# in-flight requests up to $max_conn each time it fires
	Mojo::IOLoop->recurring(
	    0 => sub {

	        # Free slots, computed once per invocation
	        my $free_slots = $max_conn - $active;

	        for (1 .. $free_slots) {
	            my $next = shift @urls;

	            # Queue drained: keep waiting while requests are still in
	            # flight, otherwise shut the event loop down
	            if (!$next) {
	                return $active || Mojo::IOLoop->stop;
	            }

	            # Supplying a callback makes the request non-blocking;
	            # get_callback() gives the slot back when it runs
	            ++$active;
	            $ua->get($next => \&get_callback);
	        }
	    }
	);

	# Enter the event loop unless something else is already running it
	Mojo::IOLoop->is_running or Mojo::IOLoop->start;

	# Completion handler passed to $ua->get(); the second argument is the
	# finished transaction (the first one is ignored).
	#
	# Releases one connection slot, then prints the request URL and hands
	# the transaction to parse_html() -- but only for successful (2xx)
	# responses whose Content-Type starts with text/html. Anything else is
	# dropped silently.
	sub get_callback {
	    my (undef, $tx) = @_;

	    # Deactivate
	    --$active;

	    my $res = $tx->res;

	    # Newer Mojolicious releases replaced is_status_class() with
	    # is_success(); fall back to the old method on installations that
	    # do not provide is_success() yet
	    my $is_ok
	        = $res->can('is_success')
	        ? $res->is_success
	        : $res->is_status_class(200);
	    return unless $is_ok;

	    # The Content-Type header may be absent altogether; default to an
	    # empty string to avoid an "uninitialized value" warning
	    my $type = $res->headers->content_type // '';
	    return unless $type =~ m{^text/html\b}ix;

	    # Request URL
	    my $url = $tx->req->url;

	    say $url;
	    parse_html($url, $tx);

	    return;
	}

	# Print the page title (if any), then enqueue every link on the page
	# that is http(s), at most three path segments deep, not seen before
	# and on the same host as the page itself.
	#
	# Parameters:
	#   $url - URL object of the request that produced the page; used as
	#          the base for relative links and as the reference host
	#   $tx  - finished transaction whose response is parsed as HTML
	#
	# Side effects: prints to STDOUT, pushes onto the file-level @urls
	# queue and records URLs in a hash that persists across calls.
	sub parse_html {
	    my ($url, $tx) = @_;

	    # URLs already crawled or queued; initialised once, kept between calls
	    state $uniq = {};

	    my $dom = $tx->res->dom;

	    # at() returns undef when there is no matching element, so a page
	    # without <title> must not be dereferenced blindly
	    my $title = $dom->at('html title');
	    say $title->text if $title;

	    # Mark the current page as seen up front (loop-invariant), so
	    # self-links are skipped and link-less pages are recorded too
	    ++$uniq->{$url->to_string};

	    # Extract and enqueue URLs
	    for my $e ($dom->find('a[href]')->each) {

	        # An href attribute without a value may come back undefined
	        my $link = Mojo::URL->new($e->{href} // '');

	        # "normalize" link: make it absolute, drop the fragment
	        $link = $link->to_abs($url)->fragment(undef);
	        next unless $link->protocol =~ m{\A https? \z}x;

	        # Don't go deeper than /a/b/c
	        next if @{ $link->path->parts } > 3;

	        # Access every link only once
	        next if ++$uniq->{$link->to_string} > 1;

	        # Don't visit other hosts
	        next if ($link->host // '') ne ($url->host // '');

	        push @urls, $link;
	        say " -> $link";
	    }
	    say '';

	    return;
	}
	__DATA__
	Featured at:
	http://blogs.perl.org/users/stas/2013/01/web-scraping-with-modern-perl-part-1.html
No results found