Created
August 13, 2009 16:36
-
-
Save fuba/167287 to your computer and use it in GitHub Desktop.
known bug: サンプル画像が無いアイテムの画像が出せない、gifとjpgの判定は表紙画像だけでできるので要修正
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl | |
use strict; | |
use warnings; | |
use utf8; | |
use DateTime; | |
use DateTime::Format::Strptime qw( strptime ); | |
use Encode qw( decode encode ); | |
use Time::ParseDate; | |
use URI; | |
use Web::Scraper 0.22; | |
use YAML; | |
my $t = parsedate(shift || 'today'); | |
my $dt = DateTime->from_epoch(epoch => $t, time_zone => 'Asia/Tokyo'); | |
my $ss = scraper { | |
process '//table[@class="list_desc_table"]//table[@class="list_desc_innertable_text"]', 'entry[]' => scraper { | |
process '//tr[1]/td/font', title => 'text'; | |
process '//tr[2]/td/font/a', author => 'text'; | |
process '//tr[3]/td', | |
link => ['text', \&mk_link]; | |
process '//tr[4]/td/a', 'tags[]' => 'text'; | |
process '//tr[5]/td/a[1]', | |
body => ['@href', \&mk_body], | |
process '//tr[last()-1]/td/a', | |
date => ['text', \&mk_date]; | |
result qw( title author link body tags date ); | |
}; | |
result qw( entry ); | |
}; | |
$ss->user_agent->env_proxy; | |
my $sig = scraper { | |
process '//table[@class="list_desc_table"]//table[@class="list_desc_innertable_text"]', 'entry[]' => scraper { | |
process '//tr[1]/td/font', title => 'text'; | |
process '//tr[2]/td/font/a', author => 'text'; | |
process '//tr[3]/td', | |
link => ['text', \&mk_link]; | |
process '//tr[4]/td/a', 'category' => 'text'; | |
process '//tr[5]/td/a', 'tags[]' => 'text'; | |
process '//tr[6]/td/a[1]', | |
body => ['@href', \&mk_body], | |
process '//tr[last()-1]/td/a', | |
date => ['text', \&mk_date]; | |
result qw( title author link body category tags date ); | |
}; | |
result qw( entry ); | |
}; | |
$sig->user_agent->env_proxy; | |
my $entry = [ | |
@{ &parse($ss, $dt, '同人誌') || [] }, | |
@{ &parse($sig, $dt, '同人ソフト') || [] }, | |
@{ &parse($sig, $dt, '同人グッズ') || [] }, | |
]; | |
for my $e (@$entry) { | |
unshift @{$e->{tags}}, $e->{category} if $e->{category}; | |
delete $e->{category} if defined $e->{category}; | |
} | |
my $dt_title = $dt->strftime('%Y年%m月%d日'); | |
binmode STDOUT, ":utf8"; | |
print Dump +{ | |
title => "メロンブックス通信販売 新着リスト ($dt_title)", | |
link => 'http://shop.melonbooks.co.jp/shop/list/', | |
entry => $entry, | |
}; | |
sub parse { | |
my ($scraper, $dt, $genre) = @_; | |
my @items; | |
my $result_num = 0; | |
my $offset = 10; | |
for (my $dispstart = 0; $dispstart < 300; $dispstart += $offset) { | |
my $url = URI->new('http://shop.melonbooks.co.jp/shop/list/'); | |
my %query = ( | |
'CR[]' => [18, 15, 0], | |
DA => 'dispstart', | |
'LA' => $dt->ymd, | |
SC => $dispstart, # オフセット | |
G => $genre, # ジャンル | |
E => '', # 在庫切れも含む ? '' : 'ON' | |
'ARRIVAL[]' => ['first', 'reall'], | |
ST => 0, | |
O => 'maker', #表示順 | |
P => $offset, # 表示件数 | |
DS => 'desc', | |
RATED => 18, # これを渡すと18禁認証スキップ | |
); | |
$url->query_form(%query); | |
my $res = $scraper->user_agent->get($url); | |
unless ($res->is_success) { | |
die "GET $url failed: " . $res->status_line; | |
} | |
my $content = decode('utf-8', $res->content); | |
if ($content =~ m|\(\d+~\d+件/全(\d+)件\)|m) { | |
$result_num = $1; | |
} | |
my $data = $scraper->scrape($content); | |
push @items, @{ $data || [] }; | |
last if ($dispstart + $offset > $result_num); | |
} | |
wantarray ? @items : \@items; | |
} | |
sub mk_link { | |
my $link = URI->new('http://shop.melonbooks.co.jp/shop/list/ID/'.$_); | |
$link->as_string; | |
} | |
sub mk_body { | |
my $key = $_; | |
if ($key =~ /(\d+)\w\.(\w+)$/) { | |
my $id = $1; | |
my $type = $2; | |
join '', map { | |
($type eq 'jpg') | |
? qq!<img src="http://shop.melonbooks.co.jp/img2/m/${id}${_}.jpg">! | |
: qq!<img src="http://shop.melonbooks.co.jp/img/${id}${_}.gif">! | |
} ('', 'a', 'b'); | |
} | |
} | |
sub mk_date { eval { strptime('%Y年%m月%d日', $_)->ymd } } | |
sub as_string { $_->as_string } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment