f-navigation.jp scraping script
Created July 26, 2012
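To run it (assuming the CPAN modules loaded at the top are installed: Web::Scraper, Text::CSV_XS, LWP::UserAgent, JSON::XS), invoke the script with perl; the collected rows are written to save.csv in the current directory.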
#!/usr/bin/perl
#
# by Keiya Chinen
use strict;
use Web::Scraper;
use URI;
use Data::Dumper;
use Time::Piece;
use Text::CSV_XS;
use LWP::UserAgent;
use JSON::XS;
use utf8;
binmode STDOUT, ":utf8";
$Data::Dumper::Indent = 1;
#$Data::Dumper::Purity = 1;
#use Clone qw(clone);
$| = 1;  # autoflush STDOUT so progress messages appear immediately
#my $dbh = DBI->connect("dbi:SQLite:dbname=./animaizm.sqlite");
#create_table();
my %fetched;
my $startpage = 'http://f-navigation.jp/pages/lists/category/';
my $baseurl   = 'http://f-navigation.jp';
my $ua = LWP::UserAgent->new;
$ua->timeout(10);
$ua->env_proxy;
my $csv = Text::CSV_XS->new({binary => 1});
my $data;
my $pages_by_category = scrape_categories_list($startpage);
my $i = 0;
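# Walk every large category, then each of its sub-categories, then every
# paginated listing page; each listed entry points at a Facebook page whose
# numeric ID is resolved through the Graph API and appended to @$data.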
foreach my $lc ($pages_by_category->{largecategory}) {
    foreach my $c (@$lc) {
        foreach my $pages ($c->{category}) {
            foreach my $page (@$pages) {
                # data structure:
                #'link' => '/pages/lists/category/160/1/',
                #'scategory' => "> アート/人文科学",  (sub-category, e.g. "> Art / Humanities")
                #'lcategory' => "その他"              (large category, e.g. "Other")
                # Follow the pagination chain; scrape_pages_list() returns undef
                # once there is no page left (or the scrape failed), so the final
                # page is processed like any other.
                for (my $res = scrape_pages_list($baseurl.$page->{link}); $res; $res = $res->{next} ? scrape_pages_list($res->{next}) : undef) {
                    foreach my $list (@{$res->{list}}) {
                        # extract the numeric Facebook page ID from ".../detail/<id>/"
                        next unless $list->as_string =~ m{detail/(\d+)/$};
                        my $fb = graph_facebook($1);
                        if ($fb) {
                            push @$data, [
                                $page->{'lcategory'},
                                $page->{'scategory'},
                                $fb->{'name'},
                                $fb->{'link'},
                                $fb->{'website'}
                            ];
                        }
                    }
                }
            }
        }
    }
}
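# Write the collected rows to CSV. Each row of @$data is
# [ large category, sub-category, Facebook page name, Facebook link, website ].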
open my $fh, '>', 'save.csv' or die "save.csv: $!";
binmode($fh, ":utf8");
for my $row (@$data) {
    $csv->combine(@$row) or die $csv->error_diag();
    print $fh $csv->string(), "\n";
}
close $fh;
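# graph_facebook($id [, $retried]): fetch https://graph.facebook.com/<id>
# and decode the JSON profile; retries once on a failed request, then
# gives up and returns 0.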
sub graph_facebook {
    my ($id, $retried) = @_;
    my $url = 'https://graph.facebook.com/' . $id;
    print "[graph_facebook] $url ...";
    my $res = $ua->get($url);
    if ($res->is_success) {
        print "success\n";
        # decode_json expects UTF-8 bytes, so pass the raw body
        return decode_json($res->content);
    }
    elsif (!$retried) {
        print "failed. retrying:\n";
        return graph_facebook($id, 1);
    }
    else {
        print "failed.\n";
        return 0;
    }
}
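# For reference, a hypothetical (illustrative, not captured) response shape,
# showing only the fields the main loop reads:
#   {"id":"1234567890","name":"Example Page","link":"https://www.facebook.com/example","website":"http://example.com"}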
# Dump a structure like Data::Dumper, but convert its \x{....} escapes back
# into real characters so non-ASCII text stays readable.
sub print_r {
    my $dump = Dumper( $_[0] );
    $dump =~ s/\\x{([0-9a-f]+)}/chr(hex($1))/ge;
    print $dump;
}
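# scrape_pages_list($url): scrape one listing page, returning a hashref with
# 'list' (an arrayref of detail-page hrefs) and, when present, 'next' (the
# href of the following page). Returns undef if the scrape itself failed.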
sub scrape_pages_list {
    print "[scrape_pages_list] $_[0] ...";
    my $scraper = scraper {
        process '/html/body/div/section/div/section/section/section/div[2]/div/table/tr/td[3]/h3/a', 'list[]' => '@href';
        process '/html/body/div/section/div/section/section/section/div[2]/ul/li[@class="next"]/a', 'next' => '@href';
    };
    my $link = $_[0];
    my $uri  = URI->new($link);
    my $res;
    eval { $res = $scraper->scrape($uri); };
    return undef unless $res;   # scrape failed
    if ($res->{next}) {
        print "next page found.\n";
    }
    else {
        print "final page reached.\n";
    }
    # Always return the result so the caller also processes the final page.
    return $res;
}
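# scrape_categories_list($url): scrape the category index into
# 'largecategory[]', where each entry carries 'category', an arrayref of
# { lcategory, scategory, link } hashes (one per sub-category anchor).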
sub scrape_categories_list {
    print "[scrape_categories_list] $_[0] ...";
    my $scraper = scraper {
        process '/html/body/div/section/div/section/section/table/tr', 'largecategory[]' => scraper {
            process '.categoryList', 'category' => sub {
                my @pages = ();
                #$hash{$_->{_parent}->{_content}->[0]->as_text} = $_->as_text;
                # the large-category label is the first cell of the parent row
                my $key = $_->{_parent}->{_content}->[0]->as_text;
                foreach my $part ($_->find("a")) {
                    my $val  = $part->as_text;
                    my $href = $part->{href};
                    my $hash = {'lcategory' => $key, 'scategory' => $val, 'link' => $href};
                    push(@pages, $hash);
                }
                return \@pages;
            }
            # my $text = $_->as_text or return;
            # if ($_->{_tag} eq 'dt') {
            #     $before = $text;
            # }
            # elsif ($_->{_tag} eq 'dd') {
            #     if ($before eq 'スタッフ') {
            #     }
            #     else {
            #         $hash{$before} = $text;
            #     }
            # }
            # return $text;
        };
        #process '//*[@class="detailDd"]', 'staff[]' => 'HTML';
    };
    my $link = $_[0];
    my $uri  = URI->new($link);
    my $res;
    eval { $res = $scraper->scrape($uri); };
    print "\n";
    return $res;
    # return \%hash;
}
sub url_encode {
    # no-op stub: currently returns its argument unchanged
    my $str = shift;
    return $str;
}
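# create_table(): schema for an optional SQLite cache (unused; the DBI
# handle above is commented out).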
sub create_table {
    my $create_table = "create table sales (" .
        "title,"    .
        "director," .
        "charades," .
        "company,"  .
        "script,"   .
        "audiodir," .
        "actor,"    .
        "UNIQUE(title)" .
        ");";
    #$dbh->do($create_table);
}