creaktive · May 7, 2021 10:24
diff --git a/yada-crawler.pl b/yada-crawler.pl
 #!/usr/bin/env perl
 use 5.016;
 use common::sense;
 use utf8::all;

 # Use fast binary libraries
 use EV;
 use Web::Scraper::LibXML;
 use YADA 0.039;

 YADA->new(
    common_opts => {
        # Available opts @ http://curl.haxx.se/libcurl/c/curl_easy_setopt.html
        encoding        => '',
        followlocation  => 1,
        maxredirs       => 5,
    }, http_response => 1, max => 4,
 )->append([qw[
    http://sysd.org/page/1/
    http://sysd.org/page/2/
    http://sysd.org/page/3/
 ]] => sub {
    my ($self) = @_;
    return  if $self->has_error
        or not $self->response->is_success
        or not $self->response->content_is_html;

    # Declare the scraper once and then reuse it
    state $scraper = scraper {
        process q(html title), title => q(text);
        process q(a), q(links[]) => q(@href);
    };

    # Employ amazing Perl (en|de)coding powers to handle HTML charsets
    my $doc = $scraper->scrape(
        $self->response->decoded_content,
        $self->final_url,
    );

    printf qq(%-64s %s\n), $self->final_url, $doc->{title};

    # Enqueue links from the parsed page
    $self->queue->prepend([
        grep {
            $_->can(q(host)) and $_->scheme =~ m{^https?$}x
            and $_->host eq $self->initial_url->host
            and (grep { length } $_->path_segments) <= 3
        } @{$doc->{links} // []}
    ] => __SUB__);
 })->wait;
 __DATA__
 Featured at:
 http://blogs.perl.org/users/stas/2013/02/web-scraping-with-modern-perl-part-2---speed-edition.html
	#!/usr/bin/env perl
	use 5.016;
	use common::sense;
	use utf8::all;

	# Use fast binary libraries
	use EV;
	use Web::Scraper::LibXML;
	use YADA 0.039;

	YADA->new(
	common_opts => {
	# Available opts @ http://curl.haxx.se/libcurl/c/curl_easy_setopt.html
	encoding => '',
	followlocation => 1,
	maxredirs => 5,
	}, http_response => 1, max => 4,
	)->append([qw[
	http://sysd.org/page/1/
	http://sysd.org/page/2/
	http://sysd.org/page/3/
	]] => sub {
	my ($self) = @_;
	return if $self->has_error
	or not $self->response->is_success
	or not $self->response->content_is_html;

	# Declare the scraper once and then reuse it
	state $scraper = scraper {
	process q(html title), title => q(text);
	process q(a), q(links[]) => q(@href);
	};

	# Employ amazing Perl (en\|de)coding powers to handle HTML charsets
	my $doc = $scraper->scrape(
	$self->response->decoded_content,
	$self->final_url,
	);

	printf qq(%-64s %s\n), $self->final_url, $doc->{title};

	# Enqueue links from the parsed page
	$self->queue->prepend([
	grep {
	$_->can(q(host)) and $_->scheme =~ m{^https?$}x
	and $_->host eq $self->initial_url->host
	and (grep { length } $_->path_segments) <= 3
	} @{$doc->{links} // []}
	] => __SUB__);
	})->wait;
	__DATA__
	Featured at:
	http://blogs.perl.org/users/stas/2013/02/web-scraping-with-modern-perl-part-2---speed-edition.html