Created
July 9, 2014 01:40
-
-
Save hitsujixgit/ad452791446e60ce538e to your computer and use it in GitHub Desktop.
横浜市統計ポータルサイトの年齢別人口データのうち、総人口と0〜5歳各年齢の人口を入手してJSONファイルに書き出します。
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /opt/local/bin/perl
use strict;
use warnings;
use 5.012;
use Encode;
use HTML::TableExtract;
use utf8;
use LWP::Simple;
use LWP::UserAgent;
use JSON qw( decode_json encode_json );
# Driver: download one population-by-age page per key, extract the total and
# the ages 0-5 counts from each, and write everything to yokohama_stat.json.

# Directory that holds the pages; each page is "<key>-j.html" below it.
my $url_dir = "http://www.city.yokohama.lg.jp/ex/stat/jinko/age/new/";
# Page keys. NOTE(review): "age" is presumably the city-wide page and the
# remaining keys look like Yokohama ward names -- confirm against the site.
my @file_keys = qw(age tsurumi kanagawa nishi naka minami konan hodogaya asahi isogo kanazawa kohoku midori aoba tsuzuki totsuka sakae izumi seya);
# Shared user agent; get_content_by_url() reads this file-level variable.
my $browser = LWP::UserAgent->new(keep_alive => 3, timeout => 30);

# One { key => ..., stat => ... } entry per page, in @file_keys order.
my @stat;
foreach my $key (@file_keys) {
    # Build the page URL.
    my $url = $url_dir . $key . "-j.html";
    print $url, "\n";
    # Download the HTML source.
    my $content = get_content_by_url($url);
    # Keep only the population figures we are interested in.
    my $res = get_stat_from_html($content);
    # Append to the result list.
    push(@stat, { 'key' => $key, 'stat' => $res });
    # Leave a one-second gap between requests to the web site.
    sleep(1);
}

# encode_json() returns UTF-8 encoded bytes, so the handle needs no encoding
# layer. Use three-argument open with a lexical handle and report $! so that
# a failure says why; print/close are checked because buffered write errors
# only surface there.
my $json      = encode_json(\@stat);
my $json_path = "yokohama_stat.json";
open(my $out, '>', $json_path)
    or die "Cannot open json file $json_path: $!";
print {$out} $json
    or die "Cannot write json file $json_path: $!";
close($out)
    or die "Cannot close json file $json_path: $!";
# Download a page and return its body decoded from Shift_JIS.
#
# Parameters:
#   $url - URL of the page to fetch.
# Returns:
#   The response body after decode('Shift_JIS', ...).
# Dies:
#   - when the response is not successful (the message names the URL and the
#     HTTP status line so the failing page can be identified);
#   - when the decoded body is empty or otherwise false.
# Uses the file-level $browser user agent.
sub get_content_by_url {
    my $url = shift;

    my $response = $browser->get($url);

    # Check that the page was actually retrieved.
    die "Get html from web failed: $url: " . $response->status_line
        unless $response->is_success;

    my $content = decode('Shift_JIS', $response->content)
        or die "Decode html source code failed.";
    return $content;
}
# Extract the total population and the populations of ages 0-5 from a page.
#
# Parameters:
#   $content - HTML text of a population-by-age page.
# Returns:
#   A hash reference. Keys are "TTL" (the total row) and "0" .. "5"; a key is
#   present only when the matching row was found. Values are numbers.
#   An empty hash reference is returned when no table matches the headers.
# Dies:
#   - when the HTML::TableExtract parse call returns false;
#   - when the count of a selected row is not a plain non-negative integer.
# Side effect:
#   Prints every row of the matched table to STDOUT, UTF-8 encoded.
sub get_stat_from_html {
    my $content = shift;

    # Ask HTML::TableExtract for the table identified by these two headers.
    my $te = HTML::TableExtract->new(headers => [ '年齢(歳)', '総数' ]);
    $te->parse($content) or die "Parse file by htmlextract failed.";

    my %nums;
    my $ts = ($te->tables)[0];
    return \%nums unless defined $ts;

    foreach my $row ($ts->rows) {
        # Empty cells may be undef; print them as empty strings.
        print encode('utf-8', join(": ", map { defined $_ ? $_ : '' } @$row)), "\n";

        # Only the total row and the rows for ages 0-5 are of interest.
        my $age = _age_key_for_label($row->[0]);
        next unless defined $age;

        $nums{$age} = _parse_population_count($row->[1]);
    }
    return \%nums;
}

# Map a row label to its result key: "TTL" for the total row, "0" .. "5" for
# the single-digit ages 0-5, and nothing (undef) for every other label.
sub _age_key_for_label {
    my $label = shift;
    return unless defined $label;
    return 'TTL' if $label eq '総数';

    # The row labels are reported to use full-width digits, so fold
    # U+FF10..U+FF19 onto ASCII 0-9 and accept either spelling.
    (my $ascii = $label) =~ tr/\x{FF10}-\x{FF19}/0-9/;
    return $ascii if $ascii =~ /\A[0-5]\z/;
    return;
}

# Convert a count such as "3,703,258" into a number. The text is validated
# explicitly instead of testing the numeric result for truth, so a genuine
# count of 0 is accepted rather than being reported as invalid.
sub _parse_population_count {
    my $text = shift;
    die "The population number invalid." unless defined $text;

    # Drop thousands separators and surrounding whitespace.
    (my $digits = $text) =~ s/,//g;
    $digits =~ s/\A\s+//;
    $digits =~ s/\s+\z//;

    die "The population number invalid." unless $digits =~ /\A[0-9]+\z/;
    # "+ 0" makes the value numeric so it is serialized as a JSON number.
    return $digits + 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment