Created: November 23, 2013, 14:36
-
-
Save spikeheap/7615234 to your computer and use it in GitHub Desktop.
A scraper to retrieve the NICE pathways in XML format, for research purposes.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
use warnings;
use strict;

use LWP::UserAgent;
use File::Basename;
use Time::HiRes qw(usleep);
use CGI qw/escape/;

# Shared HTTP client used by every request the script makes.
my $ua = LWP::UserAgent->new;

my $root_url  = "http://pathways.nice.org.uk/";
my $outputDir = "output";
mkdir $outputDir;

# Fetch the landing page; %$processedURLs tracks every URL/file we have
# already handled so the recursive walk never revisits anything.
my $content       = getContent($root_url);
my $processedURLs = {};

# Extract each relative link into the pathways section and walk it.
foreach my $pathwayLink ($content =~ /<a href=["'](\/pathways\/.*?)["']/g) {
    print "Base URL: $pathwayLink\n";
    getXMLFileAndProcessChildren($pathwayLink);
}
# Fetch the page behind a pathway URL (relative to $root_url), save every
# .xml file it references into $outputDir, and recurse into each saved XML
# document looking for further .xml references. %$processedURLs guarantees
# each URL and each XML file is handled at most once.
sub getXMLFileAndProcessChildren {
    my ($url) = @_;

    # Only handle the URL once.
    return if defined $processedURLs->{$url};
    $processedURLs->{$url} = 1;

    my $pageContent = getContent($root_url . $url);

    # FIX: the dot before "xml" is now escaped (\.xml); the original
    # unescaped dot matched any character, e.g. "fooXxml".
    my @xmlFiles = $pageContent =~ /["']([a-zA-Z0-9_\/-]*?\.xml)["']/g;
    for my $xmlFile (@xmlFiles) {
        my $xmlFileName = "$root_url/$xmlFile";
        next if defined $processedURLs->{$xmlFileName};
        $processedURLs->{$xmlFileName} = 1;

        my $xmlContent     = getContent($xmlFileName);
        my $outputFileName = basename($xmlFileName);
        print "\t$outputFileName\n";

        # Dump the output to a file. FIX: 3-arg open with a lexical handle
        # and explicit error handling (was an unchecked 2-arg bareword open).
        if (open my $xml_out, '>>', "$outputDir/$outputFileName") {
            print {$xml_out} $xmlContent;
            close $xml_out
                or warn "close failed for $outputDir/$outputFileName: $!";
        }
        else {
            warn "cannot open $outputDir/$outputFileName: $!";
        }

        # And then let's look for more content to slurp!
        my @nestedXMLFiles = $xmlContent =~ /["']([a-zA-Z0-9_\/-]*?\.xml)["']/g;
        getXMLFileAndProcessChildren($_) for @nestedXMLFiles;
    }
    return;
}
# Politely fetch a URL (sleeping a random 0–10 s first so we do not hammer
# the server) and return the raw response body. HTTP failures are logged
# to STDOUT but the (error) body is still returned.
sub getContent {
    my ($urlToGet) = @_;

    # Let's be nice and wait a small random amount of time before each
    # request: rand()*10000 milliseconds, converted to microseconds.
    usleep(1000 * rand() * 10000);

    # Build the GET request for this URL.
    my $request = HTTP::Request->new(GET => $urlToGet);

    # This HTTP header is required. The server outputs garbage if
    # it's not present.
    $request->push_header('Content-Type' => 'text/html');

    # Send the request and report any error from the server.
    my $response = $ua->request($request);
    unless ($response->is_success) {
        print "\t\tError " . $response->code . ": $urlToGet\n";
    }
    return $response->content;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment