f-navigation.jp scraping script
Created July 26, 2012
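To run it (assuming the CPAN modules loaded at the top are installed: Web::Scraper, Text::CSV_XS, LWP::UserAgent, JSON::XS), invoke the script with perl; the collected rows are written to save.csv in the current directory.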
#!/usr/bin/perl
#
# by Keiya Chinen
use strict;
use Web::Scraper;
use URI;
use Data::Dumper;
use Time::Piece;
use Text::CSV_XS;
use LWP::UserAgent;
use JSON::XS;
use utf8;
binmode STDOUT, ":utf8";
$Data::Dumper::Indent = 1;
#$Data::Dumper::Purity = 1;
#use Clone qw(clone);
$| = 1;  # autoflush STDOUT so progress messages appear immediately
#my $dbh = DBI->connect("dbi:SQLite:dbname=./animaizm.sqlite");
#create_table();
my %fetched;
my $startpage = 'http://f-navigation.jp/pages/lists/category/';
my $baseurl   = 'http://f-navigation.jp';
my $ua = LWP::UserAgent->new;
$ua->timeout(10);
$ua->env_proxy;
my $csv = Text::CSV_XS->new({binary => 1});
my $data;
my $pages_by_category = scrape_categories_list($startpage);
my $i = 0;
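# Walk every large category, then each of its sub-categories, then every
# paginated listing page; each listed entry points at a Facebook page whose
# numeric ID is resolved through the Graph API and appended to @$data.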
foreach my $lc ($pages_by_category->{largecategory}) {
    foreach my $c (@$lc) {
        foreach my $pages ($c->{category}) {
            foreach my $page (@$pages) {
                # data structure:
                #'link' => '/pages/lists/category/160/1/',
                #'scategory' => "> アート/人文科学",  (sub-category, e.g. "> Art / Humanities")
                #'lcategory' => "その他"              (large category, e.g. "Other")
                # Follow the pagination chain; scrape_pages_list() returns undef
                # once there is no page left (or the scrape failed), so the final
                # page is processed like any other.
                for (my $res = scrape_pages_list($baseurl.$page->{link}); $res; $res = $res->{next} ? scrape_pages_list($res->{next}) : undef) {
                    foreach my $list (@{$res->{list}}) {
                        # extract the numeric Facebook page ID from ".../detail/<id>/"
                        next unless $list->as_string =~ m{detail/(\d+)/$};
                        my $fb = graph_facebook($1);
                        if ($fb) {
                            push @$data, [
                                $page->{'lcategory'},
                                $page->{'scategory'},
                                $fb->{'name'},
                                $fb->{'link'},
                                $fb->{'website'}
                            ];
                        }
                    }
                }
            }
        }
    }
}
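# Write the collected rows to CSV. Each row of @$data is
# [ large category, sub-category, Facebook page name, Facebook link, website ].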
open my $fh, '>', 'save.csv' or die "save.csv: $!";
binmode($fh, ":utf8");
for my $row (@$data) {
    $csv->combine(@$row) or die $csv->error_diag();
    print $fh $csv->string(), "\n";
}
close $fh;
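# graph_facebook($id [, $retried]): fetch https://graph.facebook.com/<id>
# and decode the JSON profile; retries once on a failed request, then
# gives up and returns 0.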
sub graph_facebook {
    my ($id, $retried) = @_;
    my $url = 'https://graph.facebook.com/' . $id;
    print "[graph_facebook] $url ...";
    my $res = $ua->get($url);
    if ($res->is_success) {
        print "success\n";
        # decode_json expects UTF-8 bytes, so pass the raw body
        return decode_json($res->content);
    }
    elsif (!$retried) {
        print "failed. retrying:\n";
        return graph_facebook($id, 1);
    }
    else {
        print "failed.\n";
        return 0;
    }
}
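# For reference, a hypothetical (illustrative, not captured) response shape,
# showing only the fields the main loop reads:
#   {"id":"1234567890","name":"Example Page","link":"https://www.facebook.com/example","website":"http://example.com"}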
# Dump a structure like Data::Dumper, but convert its \x{....} escapes back
# into real characters so non-ASCII text stays readable.
sub print_r {
    my $dump = Dumper( $_[0] );
    $dump =~ s/\\x{([0-9a-f]+)}/chr(hex($1))/ge;
    print $dump;
}
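# scrape_pages_list($url): scrape one listing page, returning a hashref with
# 'list' (an arrayref of detail-page hrefs) and, when present, 'next' (the
# href of the following page). Returns undef if the scrape itself failed.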
sub scrape_pages_list {
    print "[scrape_pages_list] $_[0] ...";
    my $scraper = scraper {
        process '/html/body/div/section/div/section/section/section/div[2]/div/table/tr/td[3]/h3/a', 'list[]' => '@href';
        process '/html/body/div/section/div/section/section/section/div[2]/ul/li[@class="next"]/a', 'next' => '@href';
    };
    my $link = $_[0];
    my $uri  = URI->new($link);
    my $res;
    eval { $res = $scraper->scrape($uri); };
    return undef unless $res;   # scrape failed
    if ($res->{next}) {
        print "next page found.\n";
    }
    else {
        print "final page reached.\n";
    }
    # Always return the result so the caller also processes the final page.
    return $res;
}
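# scrape_categories_list($url): scrape the category index into
# 'largecategory[]', where each entry carries 'category', an arrayref of
# { lcategory, scategory, link } hashes (one per sub-category anchor).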
sub scrape_categories_list {
    print "[scrape_categories_list] $_[0] ...";
    my $scraper = scraper {
        process '/html/body/div/section/div/section/section/table/tr', 'largecategory[]' => scraper {
            process '.categoryList', 'category' => sub {
                my @pages = ();
                #$hash{$_->{_parent}->{_content}->[0]->as_text} = $_->as_text;
                # the large-category label is the first cell of the parent row
                my $key = $_->{_parent}->{_content}->[0]->as_text;
                foreach my $part ($_->find("a")) {
                    my $val  = $part->as_text;
                    my $href = $part->{href};
                    my $hash = {'lcategory' => $key, 'scategory' => $val, 'link' => $href};
                    push(@pages, $hash);
                }
                return \@pages;
            }
            # my $text = $_->as_text or return;
            # if ($_->{_tag} eq 'dt') {
            #     $before = $text;
            # }
            # elsif ($_->{_tag} eq 'dd') {
            #     if ($before eq 'スタッフ') {
            #     }
            #     else {
            #         $hash{$before} = $text;
            #     }
            # }
            # return $text;
        };
        #process '//*[@class="detailDd"]', 'staff[]' => 'HTML';
    };
    my $link = $_[0];
    my $uri  = URI->new($link);
    my $res;
    eval { $res = $scraper->scrape($uri); };
    print "\n";
    return $res;
    # return \%hash;
}
sub url_encode {
    # no-op stub: currently returns its argument unchanged
    my $str = shift;
    return $str;
}
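# create_table(): schema for an optional SQLite cache (unused; the DBI
# handle above is commented out).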
sub create_table {
    my $create_table = "create table sales (" .
        "title,"    .
        "director," .
        "charades," .
        "company,"  .
        "script,"   .
        "audiodir," .
        "actor,"    .
        "UNIQUE(title)" .
        ");";
    #$dbh->do($create_table);
}