toritori0318 · July 9, 2011 15:35
diff --git a/scrappy2.pl b/scrappy2.pl
 use strict;
 use warnings;
 use Scrappy;

 # Crawl the Hatena Bookmark "perl" tag pages and print the text of every
 # bookmarked entry to standard output.
 my $s = Scrappy->new;
 # Crawl at one-second intervals.
 $s->pause(1);
 # Save the log to a file.
 $s->logger->file('scrappy.log');

 # Run the crawl. Each key below is a URL path pattern; its value maps a
 # selector to the callback invoked for every matching element.
 $s->crawl('http://b.hatena.ne.jp/',
     # 1. Root page
     '/' => {
         # Link to the "tag list" page
         '//a[ @href="/t" ]' => sub {
             my ($self, $item) = @_;
             $self->queue->add($item->{href});
         }
     },
     # 2. Tag list
     '/t' => {
         # 2.1. Find the "perl" tag and push it onto the queue
         '//a[ @href="/t/perl" ]' => sub {
             my ($self, $item) = @_;
             $self->queue->add($item->{href});
         }
     },
     # 3. Tag page (perl)
     '/t/:tag' => {
         # 3.1. Follow each entry link and print only the text of the
         #      linked page to standard output
         'body' => sub {
             my ($self, $item) = @_;
             # XPath that extracts the entry links
             my $entries = $self->select('//div[@class="entry-body"]/h3/a');
             for my $entry (@{ $entries->data }) {
                 # The title attribute may be absent; avoid an
                 # "uninitialized value" warning in the log line.
                 my $title = defined $entry->{title} ? $entry->{title} : '';
                 # Log through the crawler handed to the callback rather
                 # than the closed-over outer variable.
                 $self->log('info', "found entry: " . $title);
                 # Entries live on other sites, so fetch them with a
                 # separate instance and leave the crawler's page untouched.
                 my $sd = Scrappy->new;
                 # One dead or failing link must not abort the whole crawl.
                 my $loaded = eval {
                     $sd->get($entry->{href});
                     $sd->page_loaded;
                 };
                 if (!$loaded) {
                     my $reason = $@ ? $@ : 'page not loaded';
                     $self->log('warn',
                         "skipping entry $entry->{href}: $reason");
                     next;
                 }
                 print $sd->page_text, "\n";
             }
         },
         # 3.2. Push the "next" pager link onto the queue
         '//a[ @class="pager-next" ]' => sub {
             my ($self, $item) = @_;
             print "goto next page... \n";
             $self->queue->add($item->{href});
         },
     },
 );
	use strict;
	use warnings;
	use Scrappy;

	# Crawl the Hatena Bookmark "perl" tag pages and print the text of every
	# bookmarked entry to standard output.
	my $s = Scrappy->new;
	# Crawl at one-second intervals.
	$s->pause(1);
	# Save the log to a file.
	$s->logger->file('scrappy.log');

	# Run the crawl. Each key below is a URL path pattern; its value maps a
	# selector to the callback invoked for every matching element.
	$s->crawl('http://b.hatena.ne.jp/',
	    # 1. Root page
	    '/' => {
	        # Link to the "tag list" page
	        '//a[ @href="/t" ]' => sub {
	            my ($self, $item) = @_;
	            $self->queue->add($item->{href});
	        }
	    },
	    # 2. Tag list
	    '/t' => {
	        # 2.1. Find the "perl" tag and push it onto the queue
	        '//a[ @href="/t/perl" ]' => sub {
	            my ($self, $item) = @_;
	            $self->queue->add($item->{href});
	        }
	    },
	    # 3. Tag page (perl)
	    '/t/:tag' => {
	        # 3.1. Follow each entry link and print only the text of the
	        #      linked page to standard output
	        'body' => sub {
	            my ($self, $item) = @_;
	            # XPath that extracts the entry links
	            my $entries = $self->select('//div[@class="entry-body"]/h3/a');
	            for my $entry (@{ $entries->data }) {
	                # The title attribute may be absent; avoid an
	                # "uninitialized value" warning in the log line.
	                my $title = defined $entry->{title} ? $entry->{title} : '';
	                # Log through the crawler handed to the callback rather
	                # than the closed-over outer variable.
	                $self->log('info', "found entry: " . $title);
	                # Entries live on other sites, so fetch them with a
	                # separate instance and leave the crawler's page untouched.
	                my $sd = Scrappy->new;
	                # One dead or failing link must not abort the whole crawl.
	                my $loaded = eval {
	                    $sd->get($entry->{href});
	                    $sd->page_loaded;
	                };
	                if (!$loaded) {
	                    my $reason = $@ ? $@ : 'page not loaded';
	                    $self->log('warn',
	                        "skipping entry $entry->{href}: $reason");
	                    next;
	                }
	                print $sd->page_text, "\n";
	            }
	        },
	        # 3.2. Push the "next" pager link onto the queue
	        '//a[ @class="pager-next" ]' => sub {
	            my ($self, $item) = @_;
	            print "goto next page... \n";
	            $self->queue->add($item->{href});
	        },
	    },
	);
No results found