Created
February 13, 2014 21:27
-
-
Save ericleasemorgan/8984187 to your computer and use it in GitHub Desktop.
given a (CrossRef) DOI, parse link header of HTTP request to get fulltext URLs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
sub extracter { | |
# given a (CrossRef) DOI, parse link header of HTTP request to get fulltext URLs | |
# see also: https://prospect.crossref.org/splash/ | |
# Eric Lease Morgan <[email protected]> | |
# February 12, 2014 - first cut | |
# require | |
use HTTP::Request; | |
use LWP::UserAgent; | |
# get input | |
my $doi = shift; | |
# initialize | |
my %links = (); | |
my $pdf = ''; | |
my $xml = ''; | |
my $generic = ''; | |
# build an HTTP request and send it; redirection comes for free | |
my $request = HTTP::Request->new( GET => $doi ); | |
$request->header( Accept => "text/turtle" ); | |
my $user_agent = LWP::UserAgent->new; | |
my $response = $user_agent->request( $request ); | |
# process each link in the links header | |
foreach my $link ( split /, /, $response->header( 'link' ) ) { | |
# (re-)initialize | |
my $pointer = 0; | |
my %attributes = (); | |
# process each part of a link | |
foreach my $part ( split /;/, $link ) { | |
# remove extra spaces | |
$part =~ s/^ +//; | |
$part =~ s/ +$//; | |
# increment | |
$pointer++; | |
# skip the first part, the data URL | |
next if ( $pointer == 1 ); | |
# process subsequent parts | |
my ( $name, $value ) = split /=/, $part; | |
$value =~ s/^\W+//; | |
$value =~ s/\W+$//; | |
$attributes{ $name } = $value; | |
} | |
# associate mime types with urls | |
if ( $attributes{ 'type' } eq 'application/pdf' ) { $pdf = $attributes{ 'anchor' } } | |
elsif ( $attributes{ 'type' } eq 'application/xml' ) { $xml = $attributes{ 'anchor' } } | |
elsif ( ! $attributes{ 'type' } ) { $generic = $attributes{ 'anchor' } } | |
} | |
# stuff the result into a data structure | |
$links{ $doi } = { 'pdf' => $pdf, 'xml' => $xml, 'generic' => $generic }; | |
# done | |
return \%links; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment