Created
October 31, 2010 08:38
-
-
Save ngs/656308 to your computer and use it in GitHub Desktop.
じゃらん 観光ガイドをすクレープする
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| use strict; | |
| use warnings; | |
| use FindBin; | |
| use lib "$FindBin::RealBin/../lib"; | |
| use lib "$FindBin::RealBin/../extlib"; | |
| use Data::Dumper; | |
| use Test::More; | |
| use URI; | |
| BEGIN { use_ok 'JalanSpot' } | |
| BEGIN { use_ok 'JalanSpot::Spot' } | |
| my $s = JalanSpot->new; | |
| SKIP: { | |
| skip 1,1; | |
| my $spots = $s->list_spots('010000','012900'); | |
| print Dumper $spots; | |
| is 156, @$spots, 'Number of spots'; | |
| }; | |
| { | |
| my $spot = JalanSpot::Spot->new( | |
| url => URI->new('http://www.jalan.net/ou/oup2000/ouw2001.do?afCd=&rootCd=&screenId=OUW1302&spotId=guide000000164191') | |
| ); | |
| is 'guide000000164191', $spot->id; | |
| is 'キハチ ソフトクリーム あべのHoop店', $spot->name; | |
| is 'きはちそふとくりーむあべのふーぷてん', $spot->kana; | |
| is "近鉄各線あべの橋駅西出口よりHoop方面へ徒歩すぐ", $spot->access; | |
| is "〒545-0052\n大阪府大阪市阿倍野区阿倍野筋1-2-30 Hoop地下1階", $spot->address; | |
| is "営業:11時~21時\n休業:不定", $spot->hours; | |
| is "その他:ソフトクリーム280円、サンデー480円", $spot->price; | |
| is "あり(有料)1時間600円\n700台", $spot->parking; | |
| is_deeply [ | |
| { | |
| url => 'http://www.jalan.net/jalan/img/4/spot/0164/XL/guide000000164191_1.jpg', | |
| caption => '近くには、立食用のテーブルもあって便利!', | |
| }, | |
| { | |
| url => 'http://www.jalan.net/jalan/img/4/spot/0164/XL/guide000000164191_2.jpg', | |
| caption => '牛乳をはじめ、素材は身体にいいものを厳選', | |
| } | |
| ], $spot->pictures; | |
| is '34.645125', $spot->lat; | |
| is '135.513756', $spot->lng; | |
| } | |
| done_testing; |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| package JalanSpot; | |
| use strict; | |
| use warnings; | |
| use URI; | |
| use Web::Scraper; | |
| use Encode qw/encode/; | |
| use JalanSpot::Spot; | |
| sub new { bless {}, shift } | |
| sub list_spots { | |
| my ($self,$pref,$larea) = @_; | |
| my $ret = []; | |
| my $page = 1; | |
| while(1) { | |
| warn $page; | |
| my $r = $self->scrape_spots($pref,$larea,$page++); | |
| return $ret if !@$r || ( @$ret && $ret->[0]->url eq $r->[0]->url ); | |
| $ret = [ @$ret, @$r ]; | |
| } | |
| } | |
| sub scrape_spots { | |
| my ($self,$pref,$larea,$page) = @_; | |
| my $url = sprintf('http://www.jalan.net/ou/oup1300/ouw1302.do?kenCd=%s&lrgCd=%s&afCd=&rootCd=&genreSeeFlg=1&genreEatFlg=0&genreOtherFlg=0&idx=%d',$pref,$larea,$page||1); | |
| warn $url; | |
| my $ret = []; | |
| my $scr = scraper { | |
| process 'p.title', 'spots[]' => scraper { | |
| process 'a', 'url' => '@href', | |
| process 'a', 'name' => 'TEXT', | |
| }; | |
| }; | |
| my $res = $scr->scrape( URI->new($url) ); | |
| foreach(@{ $res->{spots} }) { | |
| my $name = encode( 'utf8', $_->{name} ); | |
| push @$ret, JalanSpot::Spot->new( name => $name, url => $_->{url} ); | |
| } | |
| $ret; | |
| } | |
| 1; |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| package JalanSpot::Spot; | |
| use strict; | |
| use warnings; | |
| use Data::Dumper; | |
| use Encode qw/encode/; | |
| use Web::Scraper; | |
| use HTTP::Request::Common; | |
| use LWP::UserAgent; | |
| use Geo::Coordinates::Converter; | |
| use Unicode::Japanese; | |
| sub new { | |
| my $class = shift; | |
| bless { @_ }, $class; | |
| } | |
| sub url { $_[0]->{url} } | |
| sub map_url { URI->new( sprintf('http://www.jalan.net/ou/oup2000/ouw2003.do?spotId=%s&odkType=1',$_[0]->id) ) } | |
| sub id { | |
| my $self = shift; | |
| my %q = $self->url->query_form; | |
| $q{spotId} | |
| } | |
| sub access { $_[0]->{access} ||= $_[0]->scrape->{access} } | |
| sub address { $_[0]->{address} ||= $_[0]->scrape->{address} } | |
| sub contact { $_[0]->{contact} ||= $_[0]->scrape->{contact} } | |
| sub site_url { $_[0]->{site_url} ||= $_[0]->scrape->{site_url} } | |
| sub hours { $_[0]->{hours} ||= $_[0]->scrape->{hours} } | |
| sub kana { $_[0]->{kana} ||= $_[0]->scrape->{kana} } | |
| sub name { $_[0]->{name} ||= $_[0]->scrape->{name} } | |
| sub parking { $_[0]->{parking} ||= $_[0]->scrape->{parking} } | |
| sub price { $_[0]->{price} ||= $_[0]->scrape->{price} } | |
| sub pictures { $_[0]->{pictures} ||= $_[0]->scrape->{pictures} } | |
| sub lat { $_[0]->{lat} ||= $_[0]->scrape_map->{lat} } | |
| sub lng { $_[0]->{lng} ||= $_[0]->scrape_map->{lng} } | |
| sub scrape { | |
| my $self = shift; | |
| return $self if $self->{_scraped}; | |
| die 'No url' unless $self->url; | |
| my $scr = scraper { | |
| process 'div.info > dl', 'infos[]' => scraper { | |
| process 'dt', dt => 'TEXT', | |
| process 'dd', dd => 'HTML', | |
| }, | |
| process '#main-block-01 > p.text', 'texts[]' => 'TEXT', | |
| process 'p.photo-01 > img', 'pictures[]' => '@src', | |
| }; | |
| my $res = $scr->scrape($self->url); | |
| my ($name,@captions); | |
| foreach( @{ $res->{infos} } ) { | |
| my ( $dt, $dd ) = ( encode('utf8',$_->{dt}), encode('utf8',$_->{dd}) ); | |
| $dt =~ s/[\t\n\s\r]//g; | |
| $dd =~ s/[\t\n\s\r]//g; | |
| $dd =~ s%<br\/?>%\n%ig; | |
| $dd =~ s%<[^>]+>%%g; | |
| $dd =~ s%\n\n%\n%ig; | |
| $dd =~ s%\n$%%; | |
| $name = $dd if $dt eq '名称'; | |
| $self->{access} = $dd if $dt eq '交通アクセス'; | |
| $self->{address} = $dd if $dt eq '所在地'; | |
| $self->{contact} = $dd if $dt eq 'お問合わせ'; | |
| $self->{site_url} = $dd if $dt eq 'HP'; | |
| $self->{hours} = $dd if $dt eq '営業期間'; | |
| $self->{parking} = $dd if $dt eq '駐車場'; | |
| $self->{price} = $dd if $dt eq '料金'; | |
| @captions = split /\n/, $dd if $dt eq '写真キャプション'; | |
| } | |
| my $pictures = []; | |
| foreach(@{ $res->{pictures} }) { | |
| ( my $caption = shift(@captions) || '' ) =~ s/写真(:?上|下)\s*:\s*//; | |
| push @$pictures, { | |
| url => $_->as_string, | |
| caption => $caption, | |
| }; | |
| } | |
| $self->{pictures} = $pictures; | |
| my $text = ''; | |
| $text .= $_ foreach(@{ $res->{texts} }); | |
| $text = encode('utf8',$text); | |
| $text =~ s/<[^>]+>//g; | |
| $text =~ s/[\n\r\t\s]//g; | |
| $self->{text} = $text; | |
| if($name =~ m%(.+)\(([^\)]+)\)%) { | |
| $self->{name} = $1; | |
| $self->{kana} = Unicode::Japanese->new($2)->kata2hira->get; | |
| } | |
| $self->{_scraped} = 1; | |
| $self; | |
| } | |
| sub scrape_map { | |
| my $self = shift; | |
| my $res = LWP::UserAgent->new->request( GET $self->map_url->as_string ); | |
| my $content = $res->content; | |
| my ($x, $y); | |
| $x = $1+$2/60+"$3.$4"/3600 if $content =~ m/var px = "(\d+)\.(\d+)\.(\d+)\.(\d+)\";/; | |
| $y = $1+$2/60+"$3.$4"/3600 if $content =~ m/var py = "(\d+)\.(\d+)\.(\d+)\.(\d+)\";/; | |
| return $self unless $x && $y; | |
| my $g = Geo::Coordinates::Converter->new( lat => sprintf("%.20f",$y), lng => sprintf("%.20f",$x), datum => 'tokyo', format => 'degree' ); | |
| my $pt = $g->convert( 'wgs84' ); | |
| $self->{lat} = $g->lat; | |
| $self->{lng} = $g->lng; | |
| $self; | |
| } | |
| 1; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment