Skip to content

Instantly share code, notes, and snippets.

@ultraist
Created September 24, 2009 13:22
Show Gist options
  • Select an option

  • Save ultraist/192730 to your computer and use it in GitHub Desktop.

Select an option

Save ultraist/192730 to your computer and use it in GitHub Desktop.
# bou_imouto_crawler
use strict;
use warnings;
use Web::Scraper;
use URI;
use Data::Dumper;
use LWP::UserAgent;
# Crawler settings.
use constant {
TAG_FILE => './tags.txt', # tag list read by tags(), one tag per line; must be prepared by hand
MAX_PAGE => 10 # upper bound on the number of result pages crawled per tag
};
# Turn on autoflush for STDOUT so the "\r"-terminated progress lines are shown immediately.
$|=1;
# Read the tag list from a text file.
#
# Params:  $tag_file - path of a file holding one tag per line.
# Returns: every line of the file, in file order, with its trailing newline
#          removed (an empty line is returned as an empty string).
# Dies:    if the file cannot be opened; the message names the file.
sub tags
{
    my $tag_file = shift;
    my @tags;
    # Three-argument open with a lexical handle: the path is taken literally
    # (no mode characters parsed, no whitespace trimmed) and the handle
    # cannot collide with a global bareword handle elsewhere in the program.
    open(my $fh, '<', $tag_file) or die "cannot open $tag_file: $!";
    while (my $line = <$fh>) {
        chomp($line);
        push(@tags, $line);
    }
    close($fh);
    return @tags;
}
# Work out how many result pages a tag has.
#
# Fetches the first result page for the tag and reads the text of every
# link matched by the '.pagination a' selector.
#
# Params:  $tag - tag string; interpolated as-is into the query URL.
# Returns: the largest page number found among the pagination link labels,
#          or 1 when no numeric label is present (presumably a tag whose
#          results fit on a single page, so no pagination is rendered --
#          TODO confirm against the live markup).
sub max_page
{
    my $tag = shift;
    my $uri = URI->new("http://moe.imouto.org/post?&tags=$tag");
    my $pagination = scraper {
        process '.pagination a', 'pages[]' => 'TEXT';
    }->scrape($uri);
    # Only purely numeric labels are page numbers; other links (presumably
    # "next"/"previous" and the like) are ignored instead of being compared
    # numerically.  Starting from 1 guarantees the caller always crawls at
    # least the first page, even when there are no pagination links at all.
    my $max = 1;
    for my $label (@{ $pagination->{pages} || [] }) {
        next unless defined $label && $label =~ /\A\s*(\d+)\s*\z/;
        $max = $1 if $1 > $max;
    }
    return $max;
}
# Collect the image URLs reachable from one result page of a tag.
#
# Params:  $tag  - tag string; interpolated as-is into the query URL.
#          $page - number of the result page to fetch.
# Returns: for each link matched by '#post-list-posts .thumb a' on the
#          result page, in order, the 'src' attribute of the '#image'
#          element on the linked page; linked pages without one are skipped.
sub page_imgs
{
    my ($tag, $page) = @_;

    # One scraper for the thumbnail listing, one for the individual post.
    my $thumb_scraper = scraper {
        process '#post-list-posts .thumb a', 'urls[]' => '@href';
    };
    my $image_scraper = scraper {
        process '#image', 'src' => '@src';
    };

    my $listing = $thumb_scraper->scrape(
        URI->new("http://moe.imouto.org/post?page=$page&tags=$tag")
    );

    my @found;
    for my $post_url (@{$listing->{urls}}) {
        my $post = $image_scraper->scrape(URI->new($post_url));
        next unless $post->{src};
        push @found, $post->{src};
    }
    return @found;
}
# Collect the image URLs for a tag across its result pages.
#
# Params:  $tag - tag string, passed through to max_page() and page_imgs().
# Returns: the concatenation of page_imgs($tag, $page) for pages 1 up to
#          max_page($tag), capped at MAX_PAGE pages.
sub tag_imgs
{
    my $tag = shift;

    my $last_page = max_page($tag);
    $last_page = MAX_PAGE if $last_page > MAX_PAGE;

    my @collected;
    my $page = 1;
    # Plain numeric comparison on purpose: the bound comes from scraped text.
    while ($page <= $last_page) {
        push @collected, page_imgs($tag, $page);
        ++$page;
    }
    return @collected;
}
# Optional first command-line argument: 1-based index of the tag to start
# from, so that an interrupted run can be resumed partway through the list.
# Defaults to 1 (the first tag) when absent or 0.
my $start_tag_idx = shift(@ARGV) || 1;
# Anything other than a positive integer would become a zero or negative
# array index below and silently start from the wrong tag.
if ($start_tag_idx !~ /\A[1-9]\d*\z/) {
    die "start index must be a positive integer, got '$start_tag_idx'\n";
}
# Main program.  For every tag from the start index onwards:
#   - create the directory "<zero-based tag index>_<tag>" if it is missing,
#   - write the sorted list of image URLs to "<directory>.txt",
#   - download each URL to "<directory>/<n>.jpg", n counting from 0.
# Progress is printed on a single line that is rewritten with "\r".
bou_imouto_crawler:
{
    my @tags = tags(TAG_FILE);
    my $tag_total = scalar(@tags);
    # One user agent for the whole run; it does not depend on the tag.
    my $ua = LWP::UserAgent->new(keep_alive => 1);
    for (my $i = $start_tag_idx - 1; $i < $tag_total; ++$i) {
        my $tag = $tags[$i];
        printf("%s (%d/%d) fetch urls%s\r", $tag, $i + 1, $tag_total, ' ' x 40);
        my $dir = sprintf("%d_%s", $i, $tag);
        # Without the directory every download for this tag would fail, so
        # report the problem once and move on to the next tag.
        if (!-d $dir && !mkdir($dir)) {
            warn "\nWARN: cannot create directory $dir: $!\n";
            next;
        }
        my @tag_imgs = sort { $a cmp $b } tag_imgs($tag);
        # The URL list is a convenience; failing to write it is reported but
        # does not stop the downloads.
        my $list_file = sprintf("%s.txt", $dir);
        if (open(my $list_fh, '>', $list_file)) {
            foreach my $iu (@tag_imgs) {
                print {$list_fh} $iu, "\n";
            }
            close($list_fh) or warn "\nWARN: cannot write $list_file: $!\n";
        }
        else {
            warn "\nWARN: cannot open $list_file: $!\n";
        }
        my $img_id = 0;
        foreach my $uri (@tag_imgs) {
            printf("%s (%d/%d) fetch imgs (%d/%d)\r", $tag, $i + 1, $tag_total, $img_id, scalar(@tag_imgs));
            my $filename = sprintf("%s/%d.jpg", $dir, $img_id);
            my $req = HTTP::Request->new("GET", $uri);
            # Passing a file name makes the user agent store the response
            # body in that file instead of keeping it in memory.
            my $res = $ua->request($req, $filename);
            if (!$res->is_success) {
                warn "\nWARN:", $res->status_line, "\n";
            }
            ++$img_id;
        }
        print "\n";
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment