@keiya
Created July 26, 2012 08:33
f-navigation.jp scraping script: walks the site's category listings, looks up each listed page on the Facebook Graph API, and writes the results (lcategory, scategory, name, link, website) to save.csv.
#!/usr/bin/perl
#
# by Keiya Chinen
use strict;
use Web::Scraper;
use URI;
use Data::Dumper;
use Time::Piece;
use Text::CSV_XS;
use LWP::UserAgent;
use JSON::XS;
use utf8;
binmode STDOUT, ":utf8";
$Data::Dumper::Indent = 1;
#$Data::Dumper::Purity = 1;
#use Clone qw(clone);
$| = 1;
#my $dbh = DBI->connect("dbi:SQLite:dbname=./animaizm.sqlite");
#create_table();
my %fetched;
my $startpage = 'http://f-navigation.jp/pages/lists/category/';
my $baseurl = 'http://f-navigation.jp';
my $ua = LWP::UserAgent->new;
$ua->timeout(10);
$ua->env_proxy;
my $csv = Text::CSV_XS->new({binary => 1});
my $data;
my $pages_by_category = scrape_categories_list($startpage);
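# Sketch of the structure scrape_categories_list() is expected to return
# (each row of the category table becomes one largecategory entry):
#   {
#     largecategory => [
#       { category => [ { lcategory => '...', scategory => '...', link => '/pages/lists/category/NNN/1/' }, ... ] },
#       ...
#     ]
#   }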
my $i = 0;
foreach my $lc ($pages_by_category->{largecategory}) {
    foreach my $c (@$lc) {
        foreach my $pages ($c->{category}) {
            foreach my $page (@$pages) {
                # Each $page looks like:
                #   'link'      => '/pages/lists/category/160/1/',
                #   'scategory' => "> アート/人文科学",   # "Art / Humanities"
                #   'lcategory' => "その他"                # "Other"
                my $res = scrape_pages_list($baseurl . $page->{link});
                while ($res) {
                    foreach my $list (@{$res->{list}}) {
                        # Pull the numeric page id out of ".../detail/<id>/"
                        next unless $list->as_string =~ m/detail\/(\d+)\/$/;
                        my $fb = graph_facebook($1);
                        if ($fb) {
                            push @$data, [
                                $page->{'lcategory'},
                                $page->{'scategory'},
                                $fb->{'name'},
                                $fb->{'link'},
                                $fb->{'website'},
                            ];
                        }
                    }
                    # Follow the "next" link until the final page has been processed.
                    last unless $res->{next};
                    $res = scrape_pages_list($res->{next});
                }
            }
        }
    }
}
open my $fh, '>', 'save.csv' or die "cannot open save.csv: $!";
binmode($fh, ":utf8");
for my $row (@$data) {
    $csv->combine(@$row) or die $csv->error_diag();
    print $fh $csv->string(), "\n";
}
close $fh;
#foreach my $lc ($pages_by_category{'largecategory'}) {
#}
# Fetch a page's Graph API record by numeric id; retries once, returns 0 on failure.
sub graph_facebook {
    my ($id, $retried) = @_;
    my $url = 'https://graph.facebook.com/' . $id;
    print "[graph_facebook] $url ...";
    my $res = $ua->get($url);
    if ($res->is_success) {
        print "success\n";
        return decode_json($res->decoded_content);
    }
    elsif (!$retried) {
        print "failed. retrying:\n";
        return graph_facebook($id, 1);
    }
    else {
        print "failed.\n";
        return 0;
    }
}
# Dump a structure, unescaping \x{....} sequences so multibyte text stays readable.
sub print_r {
    my $dump = Dumper( $_[0] );
    $dump =~ s/\\x{([0-9a-z]+)}/chr(hex($1))/ge;
    print $dump;
}
# Scrape one listing page: the detail links in the table, plus the "next page" link if present.
sub scrape_pages_list {
    print "[scrape_pages_list] $_[0] ...";
    my $scraper = scraper {
        process '/html/body/div/section/div/section/section/section/div[2]/div/table/tr/td[3]/h3/a', 'list[]' => '@href';
        process '/html/body/div/section/div/section/section/section/div[2]/ul/li[@class="next"]/a', 'next' => '@href';
    };
    my $link = $_[0];
    my $uri  = URI->new($link);
    my $res;
    eval { $res = $scraper->scrape($uri); };
    if ($@ || !$res) {
        print "failed.\n";
        return 0;
    }
    if ($res->{next}) {
        print "next page found.\n";
    }
    else {
        print "final page reached.\n";
    }
    return $res;
}
# Scrape the category index: for each table row (one large category), collect
# the sub-category links as { lcategory, scategory, link } hashes.
sub scrape_categories_list {
    print "[scrape_categories_list] $_[0] ...";
    my $scraper = scraper {
        process '/html/body/div/section/div/section/section/table/tr', 'largecategory[]' => scraper {
            process '.categoryList', 'category' => sub {
                my @pages = ();
                #$hash{$_->{_parent}->{_content}->[0]->as_text} = $_->as_text;
                # The large-category name is the text of the row's first cell.
                my $key = $_->{_parent}->{_content}->[0]->as_text;
                foreach my $part ($_->find("a")) {
                    my $val  = $part->as_text;
                    my $href = $part->{href};
                    my $hash = { 'lcategory' => $key, 'scategory' => $val, 'link' => $href };
                    push(@pages, $hash);
                }
                return \@pages;
            }
            # my $text = $_->as_text or return;
            # if ($_->{_tag} eq 'dt') {
            #     $before = $text;
            # }
            # elsif ($_->{_tag} eq 'dd') {
            #     if ($before eq 'スタッフ') {   # "staff"
            #     }
            #     else {
            #         $hash{$before} = $text;
            #     }
            # }
            # return $text;
        };
        #process '//*[@class="detailDd"]', 'staff[]' => 'HTML';
    };
    my $link = $_[0];
    my $uri  = URI->new($link);
    my $res;
    eval { $res = $scraper->scrape($uri); };
    print "\n";
    return $res;
    # return \%hash;
}
# URL-encoding stub; currently a no-op and not called anywhere in this script.
sub url_encode {
    my $str = shift;
    return $str;
}
sub create_table {
    my $create_table = "create table sales (" .
        "title,"        .
        "director,"     .
        "charades,"     .
        "companey,"     .
        "script,"       .
        "audiodir,"     .
        "actor,"        .
        "UNIQUE(title)" .
        ");";
    #$dbh->do($create_table);
}
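
For reference, each row written to save.csv carries the five fields assembled in the main loop: lcategory, scategory, name, link and website. Below is a minimal sketch of reading the file back with the same Text::CSV_XS module; it assumes save.csv sits in the current directory and is illustrative only, not part of the original script.

#!/usr/bin/perl
use strict;
use Text::CSV_XS;

my $csv = Text::CSV_XS->new({ binary => 1 });
open my $fh, '<', 'save.csv' or die "cannot open save.csv: $!";
binmode($fh, ':utf8');
while (my $line = <$fh>) {
    chomp $line;
    $csv->parse($line) or next;    # skip rows that do not parse
    my ($lcat, $scat, $name, $link, $site) = $csv->fields;
    print "$lcat / $scat: $name ($link)\n";
}
close $fh;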