EventSource Mojolicious crawler
package WebCrawl::Crawl;
use Mojo::Base 'Mojolicious::Controller';

use Mojo::IOLoop;
use Mojo::JSON qw(encode_json);
use Mojo::URL;
use Mojo::UserAgent;
use WWW::RobotRules;
sub crawl {
    my $self = shift;

    # A seed URL is required
    my $seed_url = $self->param('url');
    return $self->render(text => 'Missing "url" parameter', status => 400)
        unless $seed_url;

    # Increase inactivity timeout for the connection a bit
    Mojo::IOLoop->stream($self->tx->connection)->timeout(15);

    # Change content type for EventSource streaming
    $self->res->headers->content_type('text/event-stream');

    # FIFO queue
    my @urls = (Mojo::URL->new($seed_url));

    # User agent following up to 5 redirects
    my $ua = Mojo::UserAgent->new(max_redirects => 5);

    # robots.txt rules for the seed host, consulted by the
    # crawler loop below (blocking fetch, once per crawl)
    my $rules      = WWW::RobotRules->new($ua->transactor->name);
    my $robots_url = Mojo::URL->new($seed_url)->path('/robots.txt');
    $rules->parse("$robots_url", $ua->get($robots_url)->res->text);

    # Track accessed URLs; the seed counts as already visited
    my %uniq;
    $uniq{$seed_url} = 1;

    my $active    = 0;
    my $url_count = 0;
    # Anonymous subs so both callbacks share the enclosing lexicals
    # ($self, @urls, %uniq, $active); nested named subs would not
    # stay in sync with them ("variable will not stay shared")
    my $parse = sub {
        my ($tx) = @_;

        # Request URL
        my $url = $tx->req->url;

        # Not every page has a <title>
        my $title_el = $tx->res->dom->at('html title');
        my $title    = $title_el ? $title_el->text : '';

        my $data = encode_json({
            url   => $url->to_string,
            title => $title,
        });
        $self->write("event:url\ndata:$data\n\n");

        # Extract and enqueue URLs
        for my $e ($tx->res->dom('a[href]')->each) {

            # Validate href attribute
            my $link = Mojo::URL->new($e->{href});
            next if 'Mojo::URL' ne ref $link;

            # "Normalize" link
            $link = $link->to_abs($tx->req->url)->fragment(undef);
            next unless $link->protocol =~ /^https?$/x;

            # Access every link only once
            next if ++$uniq{$link->to_string} > 1;

            # Don't visit other hosts (including subdomains)
            next if $link->host ne $url->host;

            push @urls, $link;
        }
        return;
    };
    my $get_callback = sub {
        my (undef, $tx) = @_;

        # This crawler slot is free again, whether the request
        # succeeded or timed out
        --$active;

        # Report requests that timed out
        if (!$tx->res->code) {
            my $data = encode_json({
                url    => $tx->req->url->to_string,
                title  => 'Error: timeout',
                status => 500,
            });
            $self->write("event:url\ndata:$data\n\n");
            return;
        }

        # Parse only OK HTML responses
        $parse->($tx)
            if $tx->res->code == 200
            && ($tx->res->headers->content_type // '') =~ m{^text/html\b}i;

        return;
    };
    my $id;
    $id = Mojo::IOLoop->recurring(
        0 => sub {

            # Keep up to 4 parallel crawlers sharing the same user agent
            for ($active .. 3) {

                # Dequeue the next URL, enforcing the 30-URL limit
                my $url = $url_count < 30 ? shift @urls : undef;

                # Once nothing is left to fetch and no crawlers remain
                # active, stop the timer and close the event stream
                # (stopping the shared event loop would halt the server)
                unless ($url) {
                    return if $active;
                    Mojo::IOLoop->remove($id);
                    return $self->finish;
                }

                # Skip URLs disallowed by robots.txt without
                # occupying a crawler slot
                next unless $rules->allowed($url->to_string);

                # Fetch non-blocking just by adding a callback
                # and marking the crawler as active
                ++$active;
                ++$url_count;
                $ua->get($url => $get_callback);
            }
        }
    );

    # Stop crawling if the client goes away
    $self->on(finish => sub { Mojo::IOLoop->remove($id) });
    # Start the event loop if necessary
    Mojo::IOLoop->start unless Mojo::IOLoop->is_running;
}

1;
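
A minimal sketch of an application class that wires the controller up; the WebCrawl package name and the /crawl route are assumptions, not something the gist specifies:

package WebCrawl;
use Mojo::Base 'Mojolicious';

sub startup {
    my $self = shift;

    # The controller package is WebCrawl::Crawl rather than the
    # default WebCrawl::Controller::Crawl, so adjust the namespace
    $self->routes->namespaces(['WebCrawl']);

    # GET /crawl?url=http://example.com streams crawl results
    # as text/event-stream
    $self->routes->get('/crawl')->to('crawl#crawl');
}

1;

A browser can then subscribe with new EventSource('/crawl?url=http://example.com') and handle the url events as they arrive.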
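
For quick testing, the stream can also be consumed from Perl with the streaming-response pattern from the Mojo::UserAgent documentation; the host, port, and seed URL here are assumptions:

use Mojo::Base -strict;
use Mojo::UserAgent;

my $ua = Mojo::UserAgent->new;
my $tx = $ua->build_tx(
    GET => 'http://localhost:3000/crawl?url=http://example.com');

# Replace the default "read" handler so event-stream chunks are
# printed as they arrive instead of being buffered into the body
$tx->res->content->unsubscribe('read')->on(read => sub {
    my ($content, $bytes) = @_;
    print $bytes;    # raw "event:url\ndata:{...}\n\n" frames
});

$ua->start($tx);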