jberger · November 10, 2022 20:58 · jberger · Mar 13, 2013
diff --git a/URLQueue.pl b/URLQueue.pl
 #!/usr/bin/env perl
 package Mojo::URLQueue;
 use Mojo::Base 'Mojo::EventEmitter';

 use Mojo::UserAgent;

 has queue => sub { [] };
 has ua    => sub { Mojo::UserAgent->new(max_redirects => 5) };
 has concurrency => 4;

 sub start {
  my ($self, $cb) = @_;

  return unless @{ $self->queue };

  unless ( $self->{delay} ) {
    $self->{concurrent} = 0;
    $self->{delay} = Mojo::IOLoop->delay;
    $self->{delay}->on(finish => sub{
      warn("Loop ended before queue depleted\n") if @{ $self->queue };
      undef $self->{delay};
      $self->$cb() if $cb;
    });
  }

  $self->_refresh;

  # Start event loop if necessary
  $self->{delay}->wait unless $self->{delay}->ioloop->is_running;
 }

 sub _refresh {
  my $self = shift;

  my $concurrency = $self->concurrency;
  while ( $self->{concurrent} < $concurrency and my $url = shift @{ $self->queue } ) {
    $self->{concurrent}++;
    my $end = $self->{delay}->begin;

    $self->ua->get($url => sub{
      my ($ua, $tx) = @_;
      $self->emit( process => $tx );

      # refresh worker pool
      $self->{concurrent}--; 
      $self->_refresh; 
      $end->(); 
    });
  }
 }

 package main;

 use Mojo::Base -strict;
 use Mojo::URL;
 use utf8::all;

 # FIFO queue
 my @urls = qw(
    http://sysd.org/page/1/
    http://sysd.org/page/2/
    http://sysd.org/page/3/
 );

 my $q = Mojo::URLQueue->new( queue => \@urls );
 $q->on( process => \&process );
 $q->start(sub { say 'Finished' });

 sub process {
    my ($q, $tx) = @_;

    my $queue = $q->queue;

    # Parse only OK HTML responses
    return unless 
        $tx->res->is_status_class(200)
        and $tx->res->headers->content_type =~ m{^text/html\b}ix;

    # Request URL
    my $url = $tx->req->url;
    say "Processing $url";
    push @$queue, parse_html($url, $tx);
 }

 sub parse_html {
    my ($url, $tx) = @_;

    state %visited;
    my @links;

    my $dom = $tx->res->dom;
    say $dom->at('html title')->text;

    # Extract and enqueue URLs
    $dom->find('a[href]')->each(sub{

        # Validate href attribute
        my $link = Mojo::URL->new($_->{href});
        return unless eval { $link->isa('Mojo::URL') };

        # "normalize" link
        $link = $link->to_abs($url)->fragment(undef);
        return unless grep { $link->protocol eq $_ } qw(http https);

        # Don't go deeper than /a/b/c
        return if @{$link->path->parts} > 3;

        # Access every link only once
        return if $visited{$link->to_string}++;

        # Don't visit other hosts
        return if $link->host ne $url->host;

        push @links, $link;
        say " -> $link";
    });
    say '';

    return @links;
 }
	#!/usr/bin/env perl
	package Mojo::URLQueue;
	use Mojo::Base 'Mojo::EventEmitter';

	use Mojo::UserAgent;

	has queue => sub { [] };
	has ua => sub { Mojo::UserAgent->new(max_redirects => 5) };
	has concurrency => 4;

	sub start {
	my ($self, $cb) = @_;

	return unless @{ $self->queue };

	unless ( $self->{delay} ) {
	$self->{concurrent} = 0;
	$self->{delay} = Mojo::IOLoop->delay;
	$self->{delay}->on(finish => sub{
	warn("Loop ended before queue depleted\n") if @{ $self->queue };
	undef $self->{delay};
	$self->$cb() if $cb;
	});
	}

	$self->_refresh;

	# Start event loop if necessary
	$self->{delay}->wait unless $self->{delay}->ioloop->is_running;
	}

	sub _refresh {
	my $self = shift;

	my $concurrency = $self->concurrency;
	while ( $self->{concurrent} < $concurrency and my $url = shift @{ $self->queue } ) {
	$self->{concurrent}++;
	my $end = $self->{delay}->begin;

	$self->ua->get($url => sub{
	my ($ua, $tx) = @_;
	$self->emit( process => $tx );

	# refresh worker pool
	$self->{concurrent}--;
	$self->_refresh;
	$end->();
	});
	}
	}

	package main;

	use Mojo::Base -strict;
	use Mojo::URL;
	use utf8::all;

	# FIFO queue
	my @urls = qw(
	http://sysd.org/page/1/
	http://sysd.org/page/2/
	http://sysd.org/page/3/
	);

	my $q = Mojo::URLQueue->new( queue => \@urls );
	$q->on( process => \&process );
	$q->start(sub { say 'Finished' });

	sub process {
	my ($q, $tx) = @_;

	my $queue = $q->queue;

	# Parse only OK HTML responses
	return unless
	$tx->res->is_status_class(200)
	and $tx->res->headers->content_type =~ m{^text/html\b}ix;

	# Request URL
	my $url = $tx->req->url;
	say "Processing $url";
	push @$queue, parse_html($url, $tx);
	}

	sub parse_html {
	my ($url, $tx) = @_;

	state %visited;
	my @links;

	my $dom = $tx->res->dom;
	say $dom->at('html title')->text;

	# Extract and enqueue URLs
	$dom->find('a[href]')->each(sub{

	# Validate href attribute
	my $link = Mojo::URL->new($_->{href});
	return unless eval { $link->isa('Mojo::URL') };

	# "normalize" link
	$link = $link->to_abs($url)->fragment(undef);
	return unless grep { $link->protocol eq $_ } qw(http https);

	# Don't go deeper than /a/b/c
	return if @{$link->path->parts} > 3;

	# Access every link only once
	return if $visited{$link->to_string}++;

	# Don't visit other hosts
	return if $link->host ne $url->host;

	push @links, $link;
	say " -> $link";
	});
	say '';

	return @links;
	}