Created
June 11, 2016 17:16
-
-
Save mtsukamoto/5476b7c0c4c32d88be5547c023a221aa to your computer and use it in GitHub Desktop.
リスト中の食べログURLを取得し「店名」「営業時間」「禁煙・喫煙」などを出力
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
use strict; | |
use warnings; | |
use utf8; | |
use Encode; | |
use Web::Scraper; | |
use URI; | |
my ($opts, @urls) = &parse_args(@ARGV); | |
my $scraper = scraper { | |
process '//table[@class="rst-data"]//tr', 'entry[]' => scraper { | |
process '//th', 'key' => 'TEXT'; | |
process '//td', 'value' => 'TEXT'; | |
}; | |
}; | |
my $shops = []; | |
my @fields = @{$opts->{'-f'}}; | |
foreach my $url (@urls) { | |
print "[fetch] $url\n"; | |
my $uri = URI->new($url); | |
my $res = $scraper->scrape($uri); | |
my $shop = { 'URL' => $url, map { $_ => '' } @fields }; | |
$shop->{'URL'} = $url if (exists($shop->{'URL'})); | |
foreach my $entry (@{$res->{'entry'}}) { | |
next unless (exists($shop->{$entry->{'key'}})); | |
$shop->{$entry->{'key'}} = $entry->{'value'}; | |
} | |
push(@$shops, $shop); | |
} | |
foreach my $shop (@$shops) { | |
my @lines = map { sprintf("[%s] %s", $_, $shop->{$_}) } grep { exists($shop->{$_}) } @fields; | |
my $result = join("\n", @lines); | |
print encode('utf8', "----\n$result\n\n"); | |
} | |
sub parse_args { | |
my @args = @_; | |
# 引数 | |
# ・URLリスト(第一引数、または最終引数) | |
# ・-f 出力フィールド(デフォルトは店名、営業時間、定休日、予算(お店より)、予算(ユーザーより)、禁煙・喫煙、URL) | |
my ($list, $switch, $opts) = (undef, undef, {}); | |
$list = (not @args) ? undef : ($args[0] !~ /^-/) ? shift(@args) : ($args[-1] !~ /^-/) ? pop(@args) : undef; | |
foreach (@args) { | |
next unless (defined($_) && length($_)); | |
if (/^-/) { | |
$switch = $_; | |
$opts->{$switch} ||= []; | |
} elsif ($switch) { | |
push(@{$opts->{$switch}}, $_); | |
} | |
} | |
$opts->{'-f'} ||= [qw(店名 営業時間 定休日 予算(お店より) 予算(ユーザーより) 禁煙・喫煙 URL)]; | |
die qq(usage: $0 URLLIST [-f FIELD1 FIELD2 ...]) unless ($list); | |
my @urls = (); | |
if (-f $list) { | |
open(my $fh, '<', $list) || die qq(Can't open "$list", $!); | |
push(@urls, <$fh>); | |
close($fh); | |
} elsif ($list =~ /https?:\/\/tabelog.com\//i) { | |
push(@urls, $list); | |
} | |
@urls = grep { /https?:\/\/tabelog.com\//i } map { s/^\s+|\s+$//gs; $_ } @urls; | |
die qq(No tabelog url found in "$list") unless (@urls); | |
return $opts, @urls; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment