Skip to content

Instantly share code, notes, and snippets.

@hakobe
Created August 10, 2010 11:25
Show Gist options
  • Select an option

  • Save hakobe/517110 to your computer and use it in GitHub Desktop.

Select an option

Save hakobe/517110 to your computer and use it in GitHub Desktop.
use strict;
use warnings;
use AnyEvent;
use AnyEvent::HTTP;
use Coro;
use Coro::SemaphoreSet;
use Data::Dumper;
use URI;
use Web::Scraper;
use Regexp::Common qw /URI/;
# Work queue of URI objects, seeded with the crawl's starting points. Newly
# discovered links are unshifted onto the front and work is popped from the
# back, so URLs are processed first-in, first-out.
my @QUEUE = map { URI->new($_) } (
    'http://b.hatena.ne.jp',
);

# Signalled by the crawler once there is nothing left to fetch; the main
# program blocks on it at the bottom of the script.
my $cb = AnyEvent->condvar;

# One counting semaphore per host name, each created with a count of 5. The
# crawler takes a slot before every request and gives it back in the response
# callback, which caps the number of outstanding requests per host.
my $s = Coro::SemaphoreSet->new(5);

# Collects the href attribute of every <a> element into the "links" array.
# NOTE(review): scrape() is called below without a base URL, so relative
# hrefs presumably stay relative and are then rejected by the HTTP-URI
# pattern -- confirm whether crawling absolute links only is intended.
my $scraper = scraper {
    process 'a', 'links[]' => '@href';
};

# URLs (keyed by their string form) that have already been handed to
# http_get. A URL is recorded when its request is started, not when it
# finishes, so a URL queued several times is never fetched concurrently.
my %DONE = ();

# Number of requests started whose callback has not run yet. Together with an
# empty queue, a zero here means no further work can ever appear.
my $in_flight = 0;

async {
    while (1) {
        # Give the event loop a turn so pending HTTP callbacks are delivered.
        Coro::AnyEvent::poll();

        if (!@QUEUE) {
            # Nothing queued and nothing pending: the crawl is complete, so
            # wake the main program instead of looping forever.
            if (!$in_flight) {
                $cb->send;
                last;
            }

            # Responses are still outstanding and may enqueue more links.
            # poll() does not block, so sleep briefly rather than spin.
            Coro::AnyEvent::sleep(0.05);
            next;
        }

        my $url = pop @QUEUE;
        next unless $url;

        # Only http/https URI objects provide ->host; calling it on anything
        # else would die and silently kill this coroutine.
        my $scheme = $url->scheme;
        next unless defined $scheme && $scheme =~ m/\Ahttps?\z/;

        my $host = $url->host;
        next unless defined $host && length $host;

        # Claim the URL before starting the request so duplicates that are
        # still sitting in the queue are skipped while this one is in flight.
        next if $DONE{$url};
        $DONE{$url} = $url;

        warn 'START ' . $url;

        # May block this coroutine until a slot for the host is free.
        $s->down($host);
        $in_flight++;

        http_get $url, sub {
            my ($data, $headers) = @_;
            warn 'DONE ' . $url;

            if ($data) {
                # Guard the parsing: an exception here must not skip the
                # bookkeeping below, or the host's semaphore slot would leak
                # and the crawl would never be considered finished.
                my $ok = eval {
                    my $links = $scraper->scrape($data)->{links};
                    for my $link (@{ $links || [] }) {
                        next unless $link && $link =~ m/$RE{URI}{HTTP}/;

                        # Cheap pre-filter to keep the queue small; the
                        # authoritative check happens at dispatch time.
                        next if $DONE{$link};

                        unshift @QUEUE, URI->new($link);
                    }
                    1;
                };
                warn 'FAILED ' . $url . ': ' . $@ unless $ok;
            }

            $in_flight--;
            $s->up($host);
        };
    }
};

# Block until the crawler reports that the queue has drained.
$cb->recv;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment