Skip to content

Instantly share code, notes, and snippets.

@ngs
Created October 31, 2010 08:38
Show Gist options
  • Select an option

  • Save ngs/656308 to your computer and use it in GitHub Desktop.

Select an option

Save ngs/656308 to your computer and use it in GitHub Desktop.
じゃらん 観光ガイドをすクレープする
use strict;
use warnings;
use FindBin;
use lib "$FindBin::RealBin/../lib";
use lib "$FindBin::RealBin/../extlib";
use Data::Dumper;
use Test::More;
use URI;
BEGIN { use_ok 'JalanSpot' }
BEGIN { use_ok 'JalanSpot::Spot' }
my $s = JalanSpot->new;
SKIP: {
skip 1,1;
my $spots = $s->list_spots('010000','012900');
print Dumper $spots;
is 156, @$spots, 'Number of spots';
};
{
my $spot = JalanSpot::Spot->new(
url => URI->new('http://www.jalan.net/ou/oup2000/ouw2001.do?afCd=&rootCd=&screenId=OUW1302&spotId=guide000000164191')
);
is 'guide000000164191', $spot->id;
is 'キハチ ソフトクリーム あべのHoop店', $spot->name;
is 'きはちそふとくりーむあべのふーぷてん', $spot->kana;
is "近鉄各線あべの橋駅西出口よりHoop方面へ徒歩すぐ", $spot->access;
is "〒545-0052\n大阪府大阪市阿倍野区阿倍野筋1-2-30 Hoop地下1階", $spot->address;
is "営業:11時~21時\n休業:不定", $spot->hours;
is "その他:ソフトクリーム280円、サンデー480円", $spot->price;
is "あり(有料)1時間600円\n700台", $spot->parking;
is_deeply [
{
url => 'http://www.jalan.net/jalan/img/4/spot/0164/XL/guide000000164191_1.jpg',
caption => '近くには、立食用のテーブルもあって便利!',
},
{
url => 'http://www.jalan.net/jalan/img/4/spot/0164/XL/guide000000164191_2.jpg',
caption => '牛乳をはじめ、素材は身体にいいものを厳選',
}
], $spot->pictures;
is '34.645125', $spot->lat;
is '135.513756', $spot->lng;
}
done_testing;
package JalanSpot;
use strict;
use warnings;
use URI;
use Web::Scraper;
use Encode qw/encode/;
use JalanSpot::Spot;
sub new { bless {}, shift }
sub list_spots {
my ($self,$pref,$larea) = @_;
my $ret = [];
my $page = 1;
while(1) {
warn $page;
my $r = $self->scrape_spots($pref,$larea,$page++);
return $ret if !@$r || ( @$ret && $ret->[0]->url eq $r->[0]->url );
$ret = [ @$ret, @$r ];
}
}
sub scrape_spots {
my ($self,$pref,$larea,$page) = @_;
my $url = sprintf('http://www.jalan.net/ou/oup1300/ouw1302.do?kenCd=%s&lrgCd=%s&afCd=&rootCd=&genreSeeFlg=1&genreEatFlg=0&genreOtherFlg=0&idx=%d',$pref,$larea,$page||1);
warn $url;
my $ret = [];
my $scr = scraper {
process 'p.title', 'spots[]' => scraper {
process 'a', 'url' => '@href',
process 'a', 'name' => 'TEXT',
};
};
my $res = $scr->scrape( URI->new($url) );
foreach(@{ $res->{spots} }) {
my $name = encode( 'utf8', $_->{name} );
push @$ret, JalanSpot::Spot->new( name => $name, url => $_->{url} );
}
$ret;
}
1;
package JalanSpot::Spot;
use strict;
use warnings;
use Data::Dumper;
use Encode qw/encode/;
use Web::Scraper;
use HTTP::Request::Common;
use LWP::UserAgent;
use Geo::Coordinates::Converter;
use Unicode::Japanese;
sub new {
my $class = shift;
bless { @_ }, $class;
}
sub url { $_[0]->{url} }
sub map_url { URI->new( sprintf('http://www.jalan.net/ou/oup2000/ouw2003.do?spotId=%s&odkType=1',$_[0]->id) ) }
sub id {
my $self = shift;
my %q = $self->url->query_form;
$q{spotId}
}
sub access { $_[0]->{access} ||= $_[0]->scrape->{access} }
sub address { $_[0]->{address} ||= $_[0]->scrape->{address} }
sub contact { $_[0]->{contact} ||= $_[0]->scrape->{contact} }
sub site_url { $_[0]->{site_url} ||= $_[0]->scrape->{site_url} }
sub hours { $_[0]->{hours} ||= $_[0]->scrape->{hours} }
sub kana { $_[0]->{kana} ||= $_[0]->scrape->{kana} }
sub name { $_[0]->{name} ||= $_[0]->scrape->{name} }
sub parking { $_[0]->{parking} ||= $_[0]->scrape->{parking} }
sub price { $_[0]->{price} ||= $_[0]->scrape->{price} }
sub pictures { $_[0]->{pictures} ||= $_[0]->scrape->{pictures} }
sub lat { $_[0]->{lat} ||= $_[0]->scrape_map->{lat} }
sub lng { $_[0]->{lng} ||= $_[0]->scrape_map->{lng} }
sub scrape {
my $self = shift;
return $self if $self->{_scraped};
die 'No url' unless $self->url;
my $scr = scraper {
process 'div.info > dl', 'infos[]' => scraper {
process 'dt', dt => 'TEXT',
process 'dd', dd => 'HTML',
},
process '#main-block-01 > p.text', 'texts[]' => 'TEXT',
process 'p.photo-01 > img', 'pictures[]' => '@src',
};
my $res = $scr->scrape($self->url);
my ($name,@captions);
foreach( @{ $res->{infos} } ) {
my ( $dt, $dd ) = ( encode('utf8',$_->{dt}), encode('utf8',$_->{dd}) );
$dt =~ s/[\t\n\s\r]//g;
$dd =~ s/[\t\n\s\r]//g;
$dd =~ s%<br\/?>%\n%ig;
$dd =~ s%<[^>]+>%%g;
$dd =~ s%\n\n%\n%ig;
$dd =~ s%\n$%%;
$name = $dd if $dt eq '名称';
$self->{access} = $dd if $dt eq '交通アクセス';
$self->{address} = $dd if $dt eq '所在地';
$self->{contact} = $dd if $dt eq 'お問合わせ';
$self->{site_url} = $dd if $dt eq 'HP';
$self->{hours} = $dd if $dt eq '営業期間';
$self->{parking} = $dd if $dt eq '駐車場';
$self->{price} = $dd if $dt eq '料金';
@captions = split /\n/, $dd if $dt eq '写真キャプション';
}
my $pictures = [];
foreach(@{ $res->{pictures} }) {
( my $caption = shift(@captions) || '' ) =~ s/写真(:?上|下)\s*:\s*//;
push @$pictures, {
url => $_->as_string,
caption => $caption,
};
}
$self->{pictures} = $pictures;
my $text = '';
$text .= $_ foreach(@{ $res->{texts} });
$text = encode('utf8',$text);
$text =~ s/<[^>]+>//g;
$text =~ s/[\n\r\t\s]//g;
$self->{text} = $text;
if($name =~ m%(.+)\(([^\)]+)\)%) {
$self->{name} = $1;
$self->{kana} = Unicode::Japanese->new($2)->kata2hira->get;
}
$self->{_scraped} = 1;
$self;
}
sub scrape_map {
my $self = shift;
my $res = LWP::UserAgent->new->request( GET $self->map_url->as_string );
my $content = $res->content;
my ($x, $y);
$x = $1+$2/60+"$3.$4"/3600 if $content =~ m/var px = "(\d+)\.(\d+)\.(\d+)\.(\d+)\";/;
$y = $1+$2/60+"$3.$4"/3600 if $content =~ m/var py = "(\d+)\.(\d+)\.(\d+)\.(\d+)\";/;
return $self unless $x && $y;
my $g = Geo::Coordinates::Converter->new( lat => sprintf("%.20f",$y), lng => sprintf("%.20f",$x), datum => 'tokyo', format => 'degree' );
my $pt = $g->convert( 'wgs84' );
$self->{lat} = $g->lat;
$self->{lng} = $g->lng;
$self;
}
1;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment