Skip to content

Instantly share code, notes, and snippets.

@JEEN
Created May 13, 2010 07:01
Show Gist options
  • Save JEEN/399580 to your computer and use it in GitHub Desktop.
Save JEEN/399580 to your computer and use it in GitHub Desktop.
use strict;
use warnings;

use LWP::UserAgent;
use Web::Scraper;

# Fetch the page; on failure report the HTTP status so the cause is visible.
my $ua  = LWP::UserAgent->new( agent => 'Dummy Lee Jongs' );
my $res = $ua->get('http://....');
die 'WTF? ' . $res->status_line unless $res->is_success;

# Scrape decoded characters rather than raw bytes: decoded_content() undoes
# any Content-Encoding and converts the declared charset to Perl characters.
# It returns undef when it cannot decode, so fall back to the raw body then.
my $html = $res->decoded_content;
$html = $res->content unless defined $html;

# The outer rule picks the nested table that holds the record; the inner
# scraper's selectors are evaluated relative to that table. A spec of the
# form [ VALUE, FILTER ] extracts VALUE ('TEXT' or 'HTML') and then passes it
# through FILTER -- either a code ref or the name of a filter class such as
# Web::Scraper::Filter::STRIP.
my $item = scraper {
    process 'div[id="blahblah"]>table+table>tr>td>table', 'item' => scraper {
        # The code-ref filter edits $_ in place and returns the edited text.
        process 'tr>td>table>tr>td+td',
            'title' => [ 'TEXT', sub { s/yongbinize/saillinuxize/; $_ } ];
        process 'tr:nth-child(3)>td>table>tr>td+td',
            'qty' => 'TEXT';
        process 'tr:nth-child(5)>td>table>tr+tr>td+td',
            'detail' => [ 'HTML', 'STRIP' ];
        process 'tr:nth-child(5)>td>table>tr:nth-child(3)>td+td',
            'comment' => [ 'HTML', 'STRIP' ];
        process 'tr:nth-child(5)>td>table>tr:nth-child(4)>td+td',
            'question' => [ 'HTML', 'STRIP' ];
        process 'tr:nth-child(5)>td>table>tr:nth-child(5)>td+td',
            'address' => 'TEXT';
        process 'tr:nth-child(5)>td>table>tr:nth-child(6)>td+td',
            'limit_date' => 'TEXT';
        process 'tr:nth-child(5)>td>table>tr:nth-child(6)>td:nth-child(4)',
            'deliv_date' => 'TEXT';
        process 'tr:nth-child(5)>td>table>tr:nth-child(7)>td+td',
            'budget' => 'TEXT';
        process 'tr:nth-child(7)>td>table>tr+tr>td+td',
            'company_detail' => [ 'HTML', 'STRIP' ];
    };
    # Return only the value stored under 'item', not the wrapping hash.
    result 'item';
}->scrape($html);

do_something($item);
# Consumer of the scraped record; called with the scrape result as its
# only argument. Not implemented yet: the yada-yada statement below dies
# with "Unimplemented" whenever the sub is called.
sub do_something {
    ...;
}
# Custom Web::Scraper filter class. It is the class behind the 'STRIP'
# filter name used in the [ 'HTML', 'STRIP' ] extraction specs.
package Web::Scraper::Filter::STRIP;
use base qw( Web::Scraper::Filter );

# filter($self, $val)
#   $self - the filter object
#   $val  - the value to be filtered (presumably the extracted HTML
#           fragment -- TODO confirm against Web::Scraper's filter API)
# Not implemented yet: the yada-yada statement dies with "Unimplemented"
# when the filter is invoked.
sub filter {
    my $self = shift;
    my $val  = shift;
    ...;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment