Skip to content

Instantly share code, notes, and snippets.

@EdwardIII
Created April 1, 2011 14:33
Show Gist options
  • Save EdwardIII/898233 to your computer and use it in GitHub Desktop.
Save EdwardIII/898233 to your computer and use it in GitHub Desktop.
package EspressoParts::Scraper;
use Moose;
use WWW::Mechanize;
use HTML::TreeBuilder::XPath;
#ABSTRACT: Scrapes the site for parts
# Scrape results: a read/write array reference of hash references.
# Starts out as a fresh empty array for every instance.
has 'parts' => (
    isa     => 'ArrayRef[HashRef]',
    is      => 'rw',
    default => sub { return [] },
);
# Private WWW::Mechanize instance used for all page fetches.
# Built on first access by _build__mech; the clearer and predicate are the
# ones Moose generates for a lazy_build attribute whose name starts with "_".
has '_mech' => (
    isa       => 'WWW::Mechanize',
    is        => 'rw',
    lazy      => 1,
    builder   => '_build__mech',
    clearer   => '_clear_mech',
    predicate => '_has_mech',
);
# Builder for the lazy "_mech" attribute.
# Returns a new WWW::Mechanize object created with default options.
sub _build__mech {
    my $browser = WWW::Mechanize->new();
    return $browser;
}
# Moose construction hook. Logs in to the site, opens the
# "Espresso Parts by Manufacturer" page, then visits every manufacturer link
# and every category link beneath it. One hashref per product link found on
# each category page is appended to the "parts" attribute.
#
# Dies if the login fails, if a required link cannot be found, or if a page
# fetch fails.
sub BUILD {
    my $self = shift;

    $self->_login;

    my $link = $self->_mech->find_link(
        text_regex => qr/Espresso Parts by Manufacturer/i,
    );

    # find_link returns undef when nothing matches; fail with a clear message
    # instead of handing undef to get().
    die "Couldn't find the 'Espresso Parts by Manufacturer' link"
        unless defined $link;
    $self->_mech->get($link);

    my @manufacturers = $self->_extract_interesting_links;
    foreach my $manufacturer (@manufacturers) {
        $self->_mech->get( $manufacturer->{'uri'} )
            or die "Couldn't get category";

        my @categories = $self->_extract_interesting_links;
        foreach my $category (@categories) {
            $self->_mech->get( $category->{'uri'} )
                or die "Can't get product page";
            warn( "on this page: " . $category->{'uri'} );

            push @{ $self->parts }, $self->_scrape_products;
        }
    }

    return;
}

# Fetches the login page, submits the "auth" form and follows the "Continue"
# link. Dies with a descriptive message if any step fails.
sub _login {
    my $self = shift;
    my $mech = $self->_mech;

    $mech->get('http://www.example.com/index.php');
    unless ( $mech->success() ) {
        die "Couldn't access login page, failed with status: "
            . $mech->status();
    }

    # NOTE(review): the credentials are hard-coded here; consider supplying
    # them through attributes or configuration instead.
    $mech->submit_form(
        form_name => 'auth',
        fields    => { username => 'myuser', userpass => 'mypass' },
    );
    unless ( $mech->success() ) {
        die "Couldn't submit login form, failed with status: "
            . $mech->status();
    }
    unless ( $mech->content =~ /Successful authentication/ ) {
        die "Didn't login successfully";
    }

    # follow_link returns undef when no link matches. In that case success()
    # would still describe the previous response, so check the return value
    # as well.
    my $response = $mech->follow_link( text_regex => qr/Continue/i );
    unless ( defined $response && $mech->success ) {
        die "Couldn't continue, failed with status: " . $mech->status();
    }

    return;
}

# Parses the page currently loaded in the mechanize object and returns a list
# of hashrefs, one per product link (anchors with class "aalt4"). Each hashref
# has the keys:
#   name      - text of the product link
#   reference - word captured from "(Reference XXX)", or 'N/A' if absent
#   price     - decimal number captured after "Price", or 0 if absent
#   available - 1 if any info cell contains "Available", otherwise 0
#
# NOTE(review): the info cells (td with class "stdtext") are selected from the
# whole page, not per product, so every product on a page is matched against
# the same set of cells. Confirm against the real markup whether the lookup
# should be scoped to the product's own row.
sub _scrape_products {
    my $self = shift;

    # Parse the page once; both node sets below come from the same document.
    my $tree
        = HTML::TreeBuilder::XPath->new_from_content( $self->_mech->content );

    my @product_nodes = $tree->findnodes(q{//tr/td/font/a[@class='aalt4']});
    my @info_texts    = map { $_->as_text }
        $tree->findnodes(q{//tr/td[@class='stdtext']});

    my @products;
    foreach my $product_node (@product_nodes) {

        # Start from the defaults so every product has a complete record even
        # when the page has no info cells at all.
        my %product_info = (
            name      => $product_node->as_text,
            reference => 'N/A',
            price     => 0.00,
            available => 0,
        );

        # A field is only overwritten when a cell actually contains it, so a
        # later cell without the field cannot reset an earlier match.
        foreach my $text (@info_texts) {
            if ( $text =~ /\(Reference\s+(\w+)\)/ ) {
                $product_info{'reference'} = $1;
            }

            # use a . to catch pound symbol in order to avoid character
            # encoding pain
            if ( $text =~ /Price\s+.(\d+\.\d+)/ ) {
                $product_info{'price'} = $1;
            }
            if ( $text =~ /Available/ ) {
                $product_info{'available'} = 1;
            }
        }

        push @products, \%product_info;
    }

    # Everything needed has been copied out as plain strings; free the tree.
    $tree->delete;

    return @products;
}
# Collects the links inside table cells with class "aalt4" on the page
# currently loaded in the mechanize object.
#
# Returns a list of hashrefs, each with the keys:
#   name - the anchor's text
#   uri  - the anchor's href attribute value
#
# Anchors without an href are skipped, because callers pass "uri" straight to
# WWW::Mechanize->get.
sub _extract_interesting_links {
    my $self = shift;

    my $tree
        = HTML::TreeBuilder::XPath->new_from_content( $self->_mech->content )
        or die "Couldn't initialise treebuilder";

    my @links;
    foreach my $anchor ( $tree->findnodes(q{//tr/td[@class='aalt4']/a}) ) {
        my $href = $anchor->attr('href');
        next unless defined $href && length $href;

        push @links, { name => $anchor->as_text, uri => $href };
    }

    # Only plain strings were copied out of the nodes, so the tree can be
    # freed before returning.
    $tree->delete;

    return @links;
}
1;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment