Skip to content

Instantly share code, notes, and snippets.

@yongbin
Created February 25, 2011 05:08
Show Gist options
  • Save yongbin/843388 to your computer and use it in GitHub Desktop.
Save yongbin/843388 to your computer and use it in GitHub Desktop.
Simple Web::Scraper and CLI example
#!/usr/bin/env perl
use 5.010;
use utf8;
use strict;
use warnings;
use autodie;
use Getopt::Long::Descriptive;
use URI;
use Readonly;
use Web::Scraper;
use LWP::Simple qw/getstore/;
use File::Spec::Functions;
use File::Path qw/make_path/;
binmode STDIN, ':utf8';
binmode STDOUT, ':utf8';
# Built-in defaults for the --url, --to and --css command-line options;
# each is only used when the corresponding option is not given.
Readonly my $URL     => 'http://www.himssconference.org/handouts/';
Readonly my $SAVE_TO => './himss2011/';

# Selector for the rows of the handout table on the default page
# (presumably one row per session -- verify against the live markup).
Readonly my $CSS_SELECTOR =>
    'html > body > div#wrapper.clearfix > div#middleColumnWide > table > tr';
# Command-line interface. Every value option falls back to the matching
# Readonly default, and that default is echoed in the option's help text.
my @option_specs = (
    [
        'url|u=s',
        sprintf( 'address to parse (default: %s)', $URL ),
        { default => $URL },
    ],
    [
        'css|c=s',
        sprintf( 'CSS selector (default: %s)', $CSS_SELECTOR ),
        { default => $CSS_SELECTOR },
    ],
    [
        'to|t=s',
        sprintf( 'target directory (default: %s)', $SAVE_TO ),
        { default => $SAVE_TO },
    ],
    [],    # empty spec: separates the value options from the flags
    [ 'dryrun|d',  'dry run - just get lists', { default => 0 } ],
    [ 'verbose|v', 'print extra stuff',        { default => 0 } ],
    [ 'help|h',    'print usage message and exit' ],
);

my ( $opt, $usage ) = describe_options( '%c %o ...', @option_specs );

# --help: show the generated usage text and stop before doing any work.
if ( $opt->help ) {
    print( $usage->text );
    exit;
}
# Extraction rules applied inside each element matched by the --css selector:
# the text of the session-id and title spans, and the href of the download
# anchor.
my $row_scraper = scraper {
    process 'span.sessionid', id    => 'TEXT';
    process 'span.title',     title => 'TEXT';
    process 'p.download > a', link  => '@href';
};

# Top-level scraper: collects one hash per matched element under "rows".
my $himss = scraper {
    process $opt->css, 'rows[]' => $row_scraper;
};
# Create the target directory up front, but only when files will really be
# written: a dry run just lists the handouts and should not touch the
# filesystem. If the path exists but is not a directory, make_path is left to
# report the error instead of every download failing later.
if ( not $opt->dryrun and not -d $opt->to ) {
    make_path( $opt->to );
}

# Scrape the index page; the result holds one hash per matched row under
# "rows" (the key is missing when the selector matched nothing).
my $res = $himss->scrape( URI->new( $opt->url ) );

my $n = 0;
ROW:
for my $row ( @{ $res->{rows} // [] } ) {

    # Rows without a session id or a download link are not handouts.
    next ROW unless defined $row->{id} and defined $row->{link};

    # Ids are expected to look like "Session 123"; keep only the part after
    # the prefix, and fall back to the raw text when the prefix is missing so
    # the file name never contains an undefined value.
    my ($id) = $row->{id} =~ m{\A \s* Session \s+ (.*?) \s* \z}xms;
    $id //= $row->{id};

    my $title = $row->{title} // 'untitled';

    # Build a file name that is safe to join onto the target directory:
    # collapse whitespace runs (scraped text may contain newlines), trim the
    # ends, and neutralise path separators and control characters so a title
    # such as "A/B" cannot point outside the target directory.
    my $filename = sprintf( '%s - %s.pdf', $id, $title );
    $filename =~ s{\s+}{ }g;
    $filename =~ s{\A | \z}{}gx;
    $filename =~ s{[/\\\x00-\x1f]}{_}g;

    # Pass directory and file name as separate components so the result is
    # correct whether or not --to ends with a slash.
    my $target = catfile( $opt->to, $filename );

    say sprintf( '%3d : save [ %s ]', ++$n, $filename );
    say sprintf( '      %s -> %s', $row->{link}, $target ) if $opt->verbose;

    next ROW if $opt->dryrun;

    # getstore returns the HTTP status code; anything outside 2xx means the
    # file was not fetched, so report it instead of failing silently.
    my $status = getstore( $row->{link}, $target );
    if ( $status < 200 or $status >= 300 ) {
        warn sprintf( "failed to fetch %s: HTTP status %s\n",
            $row->{link}, $status );
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment