Created
December 26, 2008 09:14
-
-
Save fuba/40029 to your computer and use it in GitHub Desktop.
exthtml.pl extracts contents specified by an xpath from web pages. Cookbook in Japanese: http://fuba.jottit.com/exthtml
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl | |
use strict; | |
use warnings; | |
use Encode; | |
use Getopt::Long; | |
use URI; | |
use LWP::UserAgent; | |
use HTTP::Cookies::Guess; | |
use constant CHARSET => 'utf-8'; | |
my $VERSION = '20090121_1'; | |
my $url = pop @ARGV; | |
my ( | |
$xpath, $referer, $cookie, $agent, $nextlink, | |
$depth, $as_xml, $verbose, $procedure, $weight, | |
); | |
my $result = GetOptions( | |
"x|xpath=s" => \$xpath, | |
"e|referer=s" => \$referer, | |
"c|cookie-jar=s" => \$cookie, | |
"a|agent=s" => \$agent, | |
"n|nextlink=s" => \$nextlink, | |
"d|depth=i" => \$depth, | |
"s|as-source" => \$as_xml, | |
"f" => \$verbose, | |
"p|procedure=s" => \$procedure, | |
"w=i" => \$weight, | |
); | |
$depth += 1; | |
unless ($url && $xpath) { | |
die "version $VERSION\nusage: ./exthtml.pl [ -a [AGENT] -e [REFERER] -c [COOKIE_JAR]" | |
." -n [NEXTPAGE_XPATH] -d [NEXTPAGE_DEPTH] -w [NEXTPAGE_SLEEP(sec)]" | |
." -p [PROCEDURE: \$v(scalar value), \$n(HTML::Element object), \$u(URI object)]" | |
." -s -f ] -x [XPATH] [URL]|-"; | |
} | |
my $ua = LWP::UserAgent->new; | |
$ua->cookie_jar(HTTP::Cookies::Guess->create(file => $cookie)) if ($cookie); | |
$ua->agent($agent) if ($agent); | |
my %options_base = ( | |
xpath => decode(CHARSET, $xpath), | |
referer => $referer, | |
ua => $ua, | |
as_xml => $as_xml, | |
nextlink => decode(CHARSET, $nextlink), | |
depth => $depth, | |
verbose => $verbose, | |
procedure => $procedure, | |
weight => $weight || 0, | |
); | |
my @url_list; | |
if ($url eq '-') { | |
while (my $url_line = <>) { | |
chomp $url_line; | |
push @url_list, $url_line; | |
} | |
} | |
else { | |
if ($url =~ /\[\d+\-\d+\]/) { | |
push @url_list, &expand_url($url); | |
} | |
else { | |
push @url_list, $url; | |
} | |
} | |
if (@url_list) { | |
for my $url_line (@url_list) { | |
my %options = %options_base; | |
$options{url} = $url_line; | |
extract(%options); | |
} | |
} | |
exit; | |
sub expand_url { | |
my $exp = shift; | |
my @urls; | |
my $format = '%d'; | |
if ($exp =~ s/\[(\d+)\-(\d+)\]/[NUM]/) { | |
my ($start, $end) = ($1 <= $2) ? ($1, $2) : ($2, $1); | |
if ($start =~ /^0\d/) { | |
$format = '%0'.length($end).'d'; | |
} | |
for my $num ($start..$end) { | |
my $url = $exp; | |
my $numstr = sprintf($format, $num); | |
$url =~ s/\[NUM\]/$numstr/; | |
push @urls, $url; | |
} | |
@urls = map {expand_url($_)} @urls; | |
} | |
else { | |
return $exp; | |
} | |
return @urls; | |
} | |
sub proc { | |
my ($proc, $n, $v, $u) = @_; | |
return eval($proc); | |
} | |
sub extract { | |
my %opt = @_; | |
my $xpath = $opt{xpath}; | |
my $depth = $opt{depth}; | |
my $url = $opt{url}; | |
my $referer = $opt{referer}; | |
my $procedure = $opt{procedure}; | |
my $weight = $opt{weight} || 0; | |
my %hist; | |
while ($url && ($depth--)) { | |
last if ($hist{$url}); | |
$hist{$url} = 1; | |
print "$url\n" if ($opt{verbose}); | |
my $tree = HTML::TreeBuilder::XPath::Remote->new_from_uri( | |
$url, $opt{ua}, $referer | |
); | |
for my $node ($tree->findnodes($opt{xpath})) { | |
print "\t" if ($opt{verbose}); | |
my $value = ($opt{as_xml} && $node->isa('HTML::Element')) | |
? $node->as_XML('<>&"') | |
: $node->getValue."\n"; | |
$value = proc($procedure, $node, $value, URI->new($url)) if ($procedure); | |
print encode(CHARSET, $value); | |
} | |
$referer = $url; | |
$url = ''; | |
if ($opt{nextlink}) { | |
my @urls = | |
grep /^http/, | |
map {$_->getValue} $tree->findnodes($opt{nextlink}); | |
$url = $urls[0] if (@urls); | |
} | |
$tree->delete; | |
sleep $weight; | |
} | |
} | |
package HTML::TreeBuilder::XPath::Remote; | |
use strict; | |
use warnings; | |
use List::Util qw( first ); | |
use Encode; | |
use HTML::TreeBuilder::XPath; | |
use HTML::ResolveLink; | |
use LWP::UserAgent; | |
use HTTP::Request; | |
use HTTP::Response::Encoding; | |
sub new_from_uri { | |
my ($pkg, $uri, $ua, $referer) = @_; | |
my $resolver = HTML::ResolveLink->new( | |
base => $uri, | |
); | |
my $html = $resolver->resolve( | |
$pkg->get($uri, $ua, $referer) | |
); | |
return HTML::TreeBuilder::XPath->new_from_content($html); | |
} | |
sub get { | |
my ($self, $uri, $ua, $referer) = @_; | |
my $html; | |
$ua ||= LWP::UserAgent->new(); | |
my $req = HTTP::Request->new('GET', $uri); | |
$req->header(referer => $referer) if ($referer); | |
my $res = $ua->request($req); | |
# this detection is based on Web::Scraper. | |
if ($res->is_success) { | |
my @encoding = ( | |
$res->encoding, | |
($res->header('Content-Type') =~ /charset=([\w\-]+)/g), | |
); | |
if (eval {require Encode::Detect;}) { | |
push @encoding, "Detect"; | |
} | |
push @encoding, "shift-jis"; | |
my $encoding = first { | |
defined $_ && Encode::find_encoding($_) | |
} @encoding; | |
$html = Encode::decode($encoding, $res->content); | |
} | |
return $html; | |
} | |
1; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment