ngs · October 31, 2010 08:38
diff --git a/01_jalanspot.t b/01_jalanspot.t
 use strict;
 use warnings;
 use FindBin;
 use lib "$FindBin::RealBin/../lib";
 use lib "$FindBin::RealBin/../extlib";
 use Data::Dumper;
 use Test::More;
 use URI;

 BEGIN { use_ok 'JalanSpot' }
 BEGIN { use_ok 'JalanSpot::Spot' }

 my $s = JalanSpot->new;

 SKIP: {
    skip 1,1;
    my $spots = $s->list_spots('010000','012900');
    print Dumper $spots;
    is 156, @$spots, 'Number of spots';
 };
 {
    my $spot  = JalanSpot::Spot->new(
        url  => URI->new('http://www.jalan.net/ou/oup2000/ouw2001.do?afCd=&rootCd=&screenId=OUW1302&spotId=guide000000164191')
    );
    is 'guide000000164191', $spot->id;
    is 'キハチ　ソフトクリーム　あべのHoop店', $spot->name;
    is 'きはちそふとくりーむあべのふーぷてん', $spot->kana;
    is "近鉄各線あべの橋駅西出口よりHoop方面へ徒歩すぐ", $spot->access;
    is "〒545-0052\n大阪府大阪市阿倍野区阿倍野筋1-2-30　Hoop地下1階", $spot->address;
    is "営業:11時～21時\n休業:不定", $spot->hours;
    is "その他:ソフトクリーム280円、サンデー480円", $spot->price;
    is "あり（有料）1時間600円\n700台", $spot->parking;
    is_deeply [
        {
            url     => 'http://www.jalan.net/jalan/img/4/spot/0164/XL/guide000000164191_1.jpg',
            caption => '近くには、立食用のテーブルもあって便利！',
        },
        {
            url     => 'http://www.jalan.net/jalan/img/4/spot/0164/XL/guide000000164191_2.jpg',
            caption => '牛乳をはじめ、素材は身体にいいものを厳選',
        }
    ], $spot->pictures;
    is '34.645125',  $spot->lat;
    is '135.513756', $spot->lng;
 }

 done_testing;
diff --git a/JalanSpot.pm b/JalanSpot.pm
 package JalanSpot;
 use strict;
 use warnings;
 use URI;
 use Web::Scraper;
 use Encode qw/encode/;
 use JalanSpot::Spot;

 sub new { bless {}, shift }

 sub list_spots {
    my ($self,$pref,$larea) = @_;
    my $ret = [];
    my $page = 1;
    while(1) {
        warn $page;
        my $r = $self->scrape_spots($pref,$larea,$page++);
        return $ret if !@$r || ( @$ret && $ret->[0]->url eq $r->[0]->url );
        $ret = [ @$ret, @$r ];
    }
 }

 sub scrape_spots {
    my ($self,$pref,$larea,$page) = @_;
    my $url = sprintf('http://www.jalan.net/ou/oup1300/ouw1302.do?kenCd=%s&lrgCd=%s&afCd=&rootCd=&genreSeeFlg=1&genreEatFlg=0&genreOtherFlg=0&idx=%d',$pref,$larea,$page||1);
    warn $url;
    my $ret = [];
    my $scr = scraper {
        process 'p.title', 'spots[]' => scraper {
            process 'a', 'url'  => '@href',
            process 'a', 'name' => 'TEXT',
        };
    };
    my $res = $scr->scrape( URI->new($url) );
    foreach(@{ $res->{spots} }) {
        my $name = encode( 'utf8', $_->{name} );
        push @$ret, JalanSpot::Spot->new( name => $name, url => $_->{url} );
    }
    $ret;
 }

 1;
diff --git a/JalanSpot::Spot.pm b/JalanSpot::Spot.pm
 package JalanSpot::Spot;
 use strict;
 use warnings;
 use Data::Dumper;
 use Encode qw/encode/;
 use Web::Scraper;
 use HTTP::Request::Common;
 use LWP::UserAgent;
 use Geo::Coordinates::Converter;
 use Unicode::Japanese;

 sub new {
    my $class = shift;
    bless { @_ }, $class;
 }

 sub url     { $_[0]->{url}  }
 sub map_url { URI->new( sprintf('http://www.jalan.net/ou/oup2000/ouw2003.do?spotId=%s&odkType=1',$_[0]->id) ) }

 sub id {
    my $self = shift;
    my %q = $self->url->query_form;
    $q{spotId}
 }

 sub access    { $_[0]->{access}   ||= $_[0]->scrape->{access}   }
 sub address   { $_[0]->{address}  ||= $_[0]->scrape->{address}  }
 sub contact   { $_[0]->{contact}  ||= $_[0]->scrape->{contact}  }
 sub site_url  { $_[0]->{site_url} ||= $_[0]->scrape->{site_url} }
 sub hours     { $_[0]->{hours}    ||= $_[0]->scrape->{hours}    }
 sub kana      { $_[0]->{kana}     ||= $_[0]->scrape->{kana}     }
 sub name      { $_[0]->{name}     ||= $_[0]->scrape->{name}     }
 sub parking   { $_[0]->{parking}  ||= $_[0]->scrape->{parking}  }
 sub price     { $_[0]->{price}    ||= $_[0]->scrape->{price}    }
 sub pictures  { $_[0]->{pictures}  ||= $_[0]->scrape->{pictures} }

 sub lat { $_[0]->{lat} ||= $_[0]->scrape_map->{lat} }
 sub lng { $_[0]->{lng} ||= $_[0]->scrape_map->{lng} }

 sub scrape {
    my $self = shift;
    return $self if $self->{_scraped};
    die 'No url' unless $self->url;
    my $scr = scraper {
        process 'div.info > dl', 'infos[]' => scraper {
            process 'dt', dt => 'TEXT',
            process 'dd', dd => 'HTML',
        },
        process '#main-block-01 > p.text', 'texts[]' => 'TEXT',
        process 'p.photo-01 > img', 'pictures[]' => '@src',
    };
    my $res  = $scr->scrape($self->url);
    my ($name,@captions);
    foreach( @{ $res->{infos} } ) {
        my ( $dt, $dd ) = ( encode('utf8',$_->{dt}), encode('utf8',$_->{dd}) );
        $dt =~ s/[\t\n\s\r]//g;
        $dd =~ s/[\t\n\s\r]//g;
        $dd =~ s%<br\/?>%\n%ig;
        $dd =~ s%<[^>]+>%%g;
        $dd =~ s%\n\n%\n%ig;
        $dd =~ s%\n$%%;
        $name = $dd  if $dt eq '名称';
        $self->{access}   = $dd if $dt eq '交通アクセス';
        $self->{address}  = $dd if $dt eq '所在地';
        $self->{contact}  = $dd if $dt eq 'お問合わせ';
        $self->{site_url} = $dd if $dt eq 'HP';
        $self->{hours}    = $dd if $dt eq '営業期間';
        $self->{parking}  = $dd if $dt eq '駐車場';
        $self->{price}    = $dd if $dt eq '料金';
        @captions = split /\n/, $dd if $dt eq '写真キャプション';
    }
    my $pictures = [];
    foreach(@{ $res->{pictures} }) {
        ( my $caption = shift(@captions) || '' ) =~ s/写真(:?上|下)\s*：\s*//;
        push @$pictures, {
            url => $_->as_string,
            caption => $caption,
        };
    }
    $self->{pictures} = $pictures;
    my $text = '';
    $text .= $_ foreach(@{ $res->{texts} });
    $text = encode('utf8',$text);
    $text =~ s/<[^>]+>//g;
    $text =~ s/[\n\r\t\s]//g;
    $self->{text} = $text;
    if($name =~ m%(.+)\(([^\)]+)\)%) {
        $self->{name} = $1;
        $self->{kana} = Unicode::Japanese->new($2)->kata2hira->get;
    }
    $self->{_scraped} = 1;
    $self;
 }

 sub scrape_map {
    my $self = shift;
    my $res = LWP::UserAgent->new->request( GET $self->map_url->as_string );
    my $content = $res->content;
    my ($x, $y);
    $x = $1+$2/60+"$3.$4"/3600 if $content =~ m/var px = "(\d+)\.(\d+)\.(\d+)\.(\d+)\";/;
    $y = $1+$2/60+"$3.$4"/3600 if $content =~ m/var py = "(\d+)\.(\d+)\.(\d+)\.(\d+)\";/;
    return $self unless $x && $y;
    my $g = Geo::Coordinates::Converter->new( lat => sprintf("%.20f",$y), lng => sprintf("%.20f",$x), datum => 'tokyo', format => 'degree' );
    my $pt = $g->convert( 'wgs84' );
    $self->{lat} = $g->lat;
    $self->{lng} = $g->lng;
    $self;
 }

 1;
	use strict;
	use warnings;
	use FindBin;
	use lib "$FindBin::RealBin/../lib";
	use lib "$FindBin::RealBin/../extlib";
	use Data::Dumper;
	use Test::More;
	use URI;

	BEGIN { use_ok 'JalanSpot' }
	BEGIN { use_ok 'JalanSpot::Spot' }

	my $s = JalanSpot->new;

	SKIP: {
	skip 1,1;
	my $spots = $s->list_spots('010000','012900');
	print Dumper $spots;
	is 156, @$spots, 'Number of spots';
	};
	{
	my $spot = JalanSpot::Spot->new(
	url => URI->new('http://www.jalan.net/ou/oup2000/ouw2001.do?afCd=&rootCd=&screenId=OUW1302&spotId=guide000000164191')
	);
	is 'guide000000164191', $spot->id;
	is 'キハチ　ソフトクリーム　あべのHoop店', $spot->name;
	is 'きはちそふとくりーむあべのふーぷてん', $spot->kana;
	is "近鉄各線あべの橋駅西出口よりHoop方面へ徒歩すぐ", $spot->access;
	is "〒545-0052\n大阪府大阪市阿倍野区阿倍野筋1-2-30　Hoop地下1階", $spot->address;
	is "営業:11時～21時\n休業:不定", $spot->hours;
	is "その他:ソフトクリーム280円、サンデー480円", $spot->price;
	is "あり（有料）1時間600円\n700台", $spot->parking;
	is_deeply [
	{
	url => 'http://www.jalan.net/jalan/img/4/spot/0164/XL/guide000000164191_1.jpg',
	caption => '近くには、立食用のテーブルもあって便利！',
	},
	{
	url => 'http://www.jalan.net/jalan/img/4/spot/0164/XL/guide000000164191_2.jpg',
	caption => '牛乳をはじめ、素材は身体にいいものを厳選',
	}
	], $spot->pictures;
	is '34.645125', $spot->lat;
	is '135.513756', $spot->lng;
	}

	done_testing;
	package JalanSpot;
	use strict;
	use warnings;
	use URI;
	use Web::Scraper;
	use Encode qw/encode/;
	use JalanSpot::Spot;

	sub new { bless {}, shift }

	sub list_spots {
	my ($self,$pref,$larea) = @_;
	my $ret = [];
	my $page = 1;
	while(1) {
	warn $page;
	my $r = $self->scrape_spots($pref,$larea,$page++);
	return $ret if !@$r \|\| ( @$ret && $ret->[0]->url eq $r->[0]->url );
	$ret = [ @$ret, @$r ];
	}
	}

	sub scrape_spots {
	my ($self,$pref,$larea,$page) = @_;
	my $url = sprintf('http://www.jalan.net/ou/oup1300/ouw1302.do?kenCd=%s&lrgCd=%s&afCd=&rootCd=&genreSeeFlg=1&genreEatFlg=0&genreOtherFlg=0&idx=%d',$pref,$larea,$page\|\|1);
	warn $url;
	my $ret = [];
	my $scr = scraper {
	process 'p.title', 'spots[]' => scraper {
	process 'a', 'url' => '@href',
	process 'a', 'name' => 'TEXT',
	};
	};
	my $res = $scr->scrape( URI->new($url) );
	foreach(@{ $res->{spots} }) {
	my $name = encode( 'utf8', $_->{name} );
	push @$ret, JalanSpot::Spot->new( name => $name, url => $_->{url} );
	}
	$ret;
	}

	1;
	package JalanSpot::Spot;
	use strict;
	use warnings;
	use Data::Dumper;
	use Encode qw/encode/;
	use Web::Scraper;
	use HTTP::Request::Common;
	use LWP::UserAgent;
	use Geo::Coordinates::Converter;
	use Unicode::Japanese;

	sub new {
	my $class = shift;
	bless { @_ }, $class;
	}

	sub url { $_[0]->{url} }
	sub map_url { URI->new( sprintf('http://www.jalan.net/ou/oup2000/ouw2003.do?spotId=%s&odkType=1',$_[0]->id) ) }

	sub id {
	my $self = shift;
	my %q = $self->url->query_form;
	$q{spotId}
	}

	sub access { $_[0]->{access} \|\|= $_[0]->scrape->{access} }
	sub address { $_[0]->{address} \|\|= $_[0]->scrape->{address} }
	sub contact { $_[0]->{contact} \|\|= $_[0]->scrape->{contact} }
	sub site_url { $_[0]->{site_url} \|\|= $_[0]->scrape->{site_url} }
	sub hours { $_[0]->{hours} \|\|= $_[0]->scrape->{hours} }
	sub kana { $_[0]->{kana} \|\|= $_[0]->scrape->{kana} }
	sub name { $_[0]->{name} \|\|= $_[0]->scrape->{name} }
	sub parking { $_[0]->{parking} \|\|= $_[0]->scrape->{parking} }
	sub price { $_[0]->{price} \|\|= $_[0]->scrape->{price} }
	sub pictures { $_[0]->{pictures} \|\|= $_[0]->scrape->{pictures} }

	sub lat { $_[0]->{lat} \|\|= $_[0]->scrape_map->{lat} }
	sub lng { $_[0]->{lng} \|\|= $_[0]->scrape_map->{lng} }

	sub scrape {
	my $self = shift;
	return $self if $self->{_scraped};
	die 'No url' unless $self->url;
	my $scr = scraper {
	process 'div.info > dl', 'infos[]' => scraper {
	process 'dt', dt => 'TEXT',
	process 'dd', dd => 'HTML',
	},
	process '#main-block-01 > p.text', 'texts[]' => 'TEXT',
	process 'p.photo-01 > img', 'pictures[]' => '@src',
	};
	my $res = $scr->scrape($self->url);
	my ($name,@captions);
	foreach( @{ $res->{infos} } ) {
	my ( $dt, $dd ) = ( encode('utf8',$_->{dt}), encode('utf8',$_->{dd}) );
	$dt =~ s/[\t\n\s\r]//g;
	$dd =~ s/[\t\n\s\r]//g;
	$dd =~ s%<br\/?>%\n%ig;
	$dd =~ s%<[^>]+>%%g;
	$dd =~ s%\n\n%\n%ig;
	$dd =~ s%\n$%%;
	$name = $dd if $dt eq '名称';
	$self->{access} = $dd if $dt eq '交通アクセス';
	$self->{address} = $dd if $dt eq '所在地';
	$self->{contact} = $dd if $dt eq 'お問合わせ';
	$self->{site_url} = $dd if $dt eq 'HP';
	$self->{hours} = $dd if $dt eq '営業期間';
	$self->{parking} = $dd if $dt eq '駐車場';
	$self->{price} = $dd if $dt eq '料金';
	@captions = split /\n/, $dd if $dt eq '写真キャプション';
	}
	my $pictures = [];
	foreach(@{ $res->{pictures} }) {
	( my $caption = shift(@captions) \|\| '' ) =~ s/写真(:?上\|下)\s：\s//;
	push @$pictures, {
	url => $_->as_string,
	caption => $caption,
	};
	}
	$self->{pictures} = $pictures;
	my $text = '';
	$text .= $_ foreach(@{ $res->{texts} });
	$text = encode('utf8',$text);
	$text =~ s/<[^>]+>//g;
	$text =~ s/[\n\r\t\s]//g;
	$self->{text} = $text;
	if($name =~ m%(.+)\(([^\)]+)\)%) {
	$self->{name} = $1;
	$self->{kana} = Unicode::Japanese->new($2)->kata2hira->get;
	}
	$self->{_scraped} = 1;
	$self;
	}

	sub scrape_map {
	my $self = shift;
	my $res = LWP::UserAgent->new->request( GET $self->map_url->as_string );
	my $content = $res->content;
	my ($x, $y);
	$x = $1+$2/60+"$3.$4"/3600 if $content =~ m/var px = "(\d+)\.(\d+)\.(\d+)\.(\d+)\";/;
	$y = $1+$2/60+"$3.$4"/3600 if $content =~ m/var py = "(\d+)\.(\d+)\.(\d+)\.(\d+)\";/;
	return $self unless $x && $y;
	my $g = Geo::Coordinates::Converter->new( lat => sprintf("%.20f",$y), lng => sprintf("%.20f",$x), datum => 'tokyo', format => 'degree' );
	my $pt = $g->convert( 'wgs84' );
	$self->{lat} = $g->lat;
	$self->{lng} = $g->lng;
	$self;
	}

	1;