EdwardIII · April 1, 2011 14:33
diff --git a/gistfile1.txt b/gistfile1.txt
 package EspressoParts::Scraper;
 use Moose;
 use WWW::Mechanize;
 use HTML::TreeBuilder::XPath;

 #ABSTRACT: Scrapes the site for parts

 # Accumulated scrape results: one hashref per product link found.
 # Starts out as an empty array and is appended to by BUILD.
 has parts => (
 	is => 'rw',
 	isa => 'ArrayRef[HashRef]',
 	default => sub {[]},
 );

 # Browser object used for every request made by this class.
 # Created on first access by _build__mech.
 has _mech => (
 	is => 'rw',
 	isa => 'WWW::Mechanize',
 	lazy_build => 1,
 );

 # Builder for the _mech attribute: hands back a fresh WWW::Mechanize
 # instance created with its default settings.
 sub _build__mech {
 	my $agent = WWW::Mechanize->new();
 	return $agent;
 }

 # Moose construction hook: logs in to the site, then walks every
 # manufacturer -> category -> product listing page and appends one
 # hashref per product link to $self->parts.
 #
 # Each pushed hashref always has the keys name, reference, price and
 # available.
 #
 # Dies when a page cannot be fetched, the login is not confirmed, or the
 # "Espresso Parts by Manufacturer" link is missing.
 sub BUILD {
 	my $self = shift;
 	my $mech = $self->_mech;

 	$mech->get( 'http://www.example.com/index.php' );
 	unless ($mech->success) {
 		die "Couldn't access login page, failed with status: " . $mech->status;
 	}

 	# NOTE(review): credentials are hard-coded; consider supplying them
 	# through attributes or configuration instead.
 	$mech->submit_form(
 		form_name => 'auth',
 		fields    => { username => 'myuser', userpass => 'mypass' },
 	);
 	unless ($mech->success) {
 		die "Couldn't submit login form, failed with status: " . $mech->status;
 	}
 	unless ($mech->content =~ /Successful authentication/) {
 		die "Didn't login successfully";
 	}

 	$mech->follow_link( text_regex => qr/Continue/i );
 	unless ($mech->success) {
 		die "Couldn't continue, failed with status: " . $mech->status;
 	}

 	# find_link returns undef when nothing matches; fail with a clear
 	# message instead of passing undef on to get().
 	my $link = $mech->find_link( text_regex => qr/Espresso Parts by Manufacturer/i )
 		or die "Couldn't find the manufacturer index link";
 	$mech->get($link);
 	unless ($mech->success) {
 		die "Couldn't access manufacturer index, failed with status: " . $mech->status;
 	}

 	my @manufacturers = $self->_extract_interesting_links;

 	foreach my $manufacturer (@manufacturers) {
 		# get() returns a response object, which is always true, so the
 		# outcome has to be checked through success().
 		$mech->get($manufacturer->{'uri'});
 		die "Couldn't get category" unless $mech->success;

 		my @categories = $self->_extract_interesting_links;
 		foreach my $category (@categories) {
 			$mech->get($category->{'uri'});
 			die "Can't get product page" unless $mech->success;
 			warn("on this page: " . $category->{'uri'});

 			# Parse the page once; the content does not change while the
 			# products on it are being processed.
 			my $tree = HTML::TreeBuilder::XPath->new_from_content($mech->content);
 			my @product_nodes = $tree->findnodes( q{//tr/td/font/a[@class='aalt4']} );
 			# The predicate needs its closing "]" to be valid XPath.
 			my @product_info_nodes = $tree->findnodes( q{//tr/td[@class='stdtext']} );

 			foreach my $product_node (@product_nodes) {
 				# Start from defaults so every record has the same keys even
 				# when the page has no info cells at all.
 				my %product_info = (
 					name      => $product_node->as_text,
 					reference => 'N/A',
 					price     => 0.00,
 					available => 0,
 				);

 				# NOTE(review): every stdtext cell on the page is scanned for
 				# every product, so the last cell decides the values. Confirm
 				# how info cells map to product links and pair them up.
 				foreach my $info (@product_info_nodes) {
 					my $text = $info->as_text;

 					$product_info{'reference'} = ($text =~ /\(Reference\s+(\w+)\)/)[0] || 'N/A';

 					# use a . to catch pound symbol in order to avoid character encoding pain
 					$product_info{'price'} = ($text =~ /Price\s+.(\d+\.\d+)/)[0] || 0.00;

 					$product_info{'available'} = $text =~ /Available/ ? 1 : 0;
 				}
 				push @{ $self->parts }, \%product_info;
 			}

 			# Release the parsed tree explicitly; only plain strings taken
 			# from it are kept in parts.
 			$tree->delete;
 		}
 	}
 }


 # Collects the navigation links on the page currently loaded in _mech.
 #
 # Looks for <a> elements directly inside <td class="aalt4"> cells and
 # returns a list of hashrefs, each with:
 #   name - the link text
 #   uri  - the raw href attribute value
 # Anchors without an href are skipped, since callers pass uri to get().
 sub _extract_interesting_links {
 	my $self = shift;

 	my $tree = HTML::TreeBuilder::XPath->new_from_content($self->_mech->content)
 		or die "Couldn't initialise treebuilder";

 	my @links;
 	foreach my $anchor ( $tree->findnodes( q{//tr/td[@class='aalt4']/a} ) ) {
 		my $href = $anchor->attr('href');
 		next unless defined $href;

 		push @links, { name => $anchor->as_text, uri => $href };
 	}

 	# Only plain strings were copied out, so the tree can be released now.
 	$tree->delete;

 	return @links;
 }

 1;
	package EspressoParts::Scraper;
	use Moose;
	use WWW::Mechanize;
	use HTML::TreeBuilder::XPath;

	#ABSTRACT: Scrapes the site for parts

	# Accumulated scrape results: one hashref per product link found.
	# Starts out as an empty array and is appended to by BUILD.
	has parts => (
	is => 'rw',
	isa => 'ArrayRef[HashRef]',
	default => sub {[]},
	);

	# Browser object used for every request made by this class.
	# Created on first access by _build__mech.
	has _mech => (
	is => 'rw',
	isa => 'WWW::Mechanize',
	lazy_build => 1,
	);

	# Builder for the _mech attribute: hands back a fresh WWW::Mechanize
	# instance created with its default settings.
	sub _build__mech {
		my $agent = WWW::Mechanize->new();
		return $agent;
	}

	# Moose construction hook: logs in to the site, then walks every
	# manufacturer -> category -> product listing page and appends one
	# hashref per product link to $self->parts.
	#
	# Each pushed hashref always has the keys name, reference, price and
	# available.
	#
	# Dies when a page cannot be fetched, the login is not confirmed, or the
	# "Espresso Parts by Manufacturer" link is missing.
	sub BUILD {
		my $self = shift;
		my $mech = $self->_mech;

		$mech->get( 'http://www.example.com/index.php' );
		unless ($mech->success) {
			die "Couldn't access login page, failed with status: " . $mech->status;
		}

		# NOTE(review): credentials are hard-coded; consider supplying them
		# through attributes or configuration instead.
		$mech->submit_form(
			form_name => 'auth',
			fields    => { username => 'myuser', userpass => 'mypass' },
		);
		unless ($mech->success) {
			die "Couldn't submit login form, failed with status: " . $mech->status;
		}
		unless ($mech->content =~ /Successful authentication/) {
			die "Didn't login successfully";
		}

		$mech->follow_link( text_regex => qr/Continue/i );
		unless ($mech->success) {
			die "Couldn't continue, failed with status: " . $mech->status;
		}

		# find_link returns undef when nothing matches; fail with a clear
		# message instead of passing undef on to get().
		my $link = $mech->find_link( text_regex => qr/Espresso Parts by Manufacturer/i )
			or die "Couldn't find the manufacturer index link";
		$mech->get($link);
		unless ($mech->success) {
			die "Couldn't access manufacturer index, failed with status: " . $mech->status;
		}

		my @manufacturers = $self->_extract_interesting_links;

		foreach my $manufacturer (@manufacturers) {
			# get() returns a response object, which is always true, so the
			# outcome has to be checked through success().
			$mech->get($manufacturer->{'uri'});
			die "Couldn't get category" unless $mech->success;

			my @categories = $self->_extract_interesting_links;
			foreach my $category (@categories) {
				$mech->get($category->{'uri'});
				die "Can't get product page" unless $mech->success;
				warn("on this page: " . $category->{'uri'});

				# Parse the page once; the content does not change while the
				# products on it are being processed.
				my $tree = HTML::TreeBuilder::XPath->new_from_content($mech->content);
				my @product_nodes = $tree->findnodes( q{//tr/td/font/a[@class='aalt4']} );
				# The predicate needs its closing "]" to be valid XPath.
				my @product_info_nodes = $tree->findnodes( q{//tr/td[@class='stdtext']} );

				foreach my $product_node (@product_nodes) {
					# Start from defaults so every record has the same keys even
					# when the page has no info cells at all.
					my %product_info = (
						name      => $product_node->as_text,
						reference => 'N/A',
						price     => 0.00,
						available => 0,
					);

					# NOTE(review): every stdtext cell on the page is scanned for
					# every product, so the last cell decides the values. Confirm
					# how info cells map to product links and pair them up.
					foreach my $info (@product_info_nodes) {
						my $text = $info->as_text;

						$product_info{'reference'} = ($text =~ /\(Reference\s+(\w+)\)/)[0] || 'N/A';

						# use a . to catch pound symbol in order to avoid character encoding pain
						$product_info{'price'} = ($text =~ /Price\s+.(\d+\.\d+)/)[0] || 0.00;

						$product_info{'available'} = $text =~ /Available/ ? 1 : 0;
					}
					push @{ $self->parts }, \%product_info;
				}

				# Release the parsed tree explicitly; only plain strings taken
				# from it are kept in parts.
				$tree->delete;
			}
		}
	}


	# Collects the navigation links on the page currently loaded in _mech.
	#
	# Looks for <a> elements directly inside <td class="aalt4"> cells and
	# returns a list of hashrefs, each with:
	#   name - the link text
	#   uri  - the raw href attribute value
	# Anchors without an href are skipped, since callers pass uri to get().
	sub _extract_interesting_links {
		my $self = shift;

		my $tree = HTML::TreeBuilder::XPath->new_from_content($self->_mech->content)
			or die "Couldn't initialise treebuilder";

		my @links;
		foreach my $anchor ( $tree->findnodes( q{//tr/td[@class='aalt4']/a} ) ) {
			my $href = $anchor->attr('href');
			next unless defined $href;

			push @links, { name => $anchor->as_text, uri => $href };
		}

		# Only plain strings were copied out, so the tree can be released now.
		$tree->delete;

		return @links;
	}

	1;