Last active
November 10, 2022 20:58
-
-
Save jberger/5153008 to your computer and use it in GitHub Desktop.
Modularization of my answer from SO on URL queuing for non-blocking ua
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl | |
package Mojo::URLQueue; | |
use Mojo::Base 'Mojo::EventEmitter'; | |
use Mojo::UserAgent; | |
has queue => sub { [] }; | |
has ua => sub { Mojo::UserAgent->new(max_redirects => 5) }; | |
has concurrency => 4; | |
sub start { | |
my ($self, $cb) = @_; | |
return unless @{ $self->queue }; | |
unless ( $self->{delay} ) { | |
$self->{concurrent} = 0; | |
$self->{delay} = Mojo::IOLoop->delay; | |
$self->{delay}->on(finish => sub{ | |
warn("Loop ended before queue depleted\n") if @{ $self->queue }; | |
undef $self->{delay}; | |
$self->$cb() if $cb; | |
}); | |
} | |
$self->_refresh; | |
# Start event loop if necessary | |
$self->{delay}->wait unless $self->{delay}->ioloop->is_running; | |
} | |
sub _refresh { | |
my $self = shift; | |
my $concurrency = $self->concurrency; | |
while ( $self->{concurrent} < $concurrency and my $url = shift @{ $self->queue } ) { | |
$self->{concurrent}++; | |
my $end = $self->{delay}->begin; | |
$self->ua->get($url => sub{ | |
my ($ua, $tx) = @_; | |
$self->emit( process => $tx ); | |
# refresh worker pool | |
$self->{concurrent}--; | |
$self->_refresh; | |
$end->(); | |
}); | |
} | |
} | |
package main; | |
use Mojo::Base -strict; | |
use Mojo::URL; | |
use utf8::all; | |
# FIFO queue | |
my @urls = qw( | |
http://sysd.org/page/1/ | |
http://sysd.org/page/2/ | |
http://sysd.org/page/3/ | |
); | |
my $q = Mojo::URLQueue->new( queue => \@urls ); | |
$q->on( process => \&process ); | |
$q->start(sub { say 'Finished' }); | |
sub process { | |
my ($q, $tx) = @_; | |
my $queue = $q->queue; | |
# Parse only OK HTML responses | |
return unless | |
$tx->res->is_status_class(200) | |
and $tx->res->headers->content_type =~ m{^text/html\b}ix; | |
# Request URL | |
my $url = $tx->req->url; | |
say "Processing $url"; | |
push @$queue, parse_html($url, $tx); | |
} | |
sub parse_html { | |
my ($url, $tx) = @_; | |
state %visited; | |
my @links; | |
my $dom = $tx->res->dom; | |
say $dom->at('html title')->text; | |
# Extract and enqueue URLs | |
$dom->find('a[href]')->each(sub{ | |
# Validate href attribute | |
my $link = Mojo::URL->new($_->{href}); | |
return unless eval { $link->isa('Mojo::URL') }; | |
# "normalize" link | |
$link = $link->to_abs($url)->fragment(undef); | |
return unless grep { $link->protocol eq $_ } qw(http https); | |
# Don't go deeper than /a/b/c | |
return if @{$link->path->parts} > 3; | |
# Access every link only once | |
return if $visited{$link->to_string}++; | |
# Don't visit other hosts | |
return if $link->host ne $url->host; | |
push @links, $link; | |
say " -> $link"; | |
}); | |
say ''; | |
return @links; | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
See original at http://stackoverflow.com/a/15166898/468327