Created January 16, 2013 21:57
Save kanzure/4551321 to your computer and use it in GitHub Desktop.
sciencemag.org downloader
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl
# Scraper for Science Magazine (sciencemag.org).
# Needs curl on PATH; text matching and directory creation are done in Perl.
# Downloads each paper and all available supplementary data (including
# movies and tables), plus the Table of Contents for each issue, storing
# everything so one can browse from each TOC.
use strict;
use warnings;
use File::Path qw(make_path);

# Fetch $url into local file $out with curl.  List-form system() so scraped
# URLs are never interpolated into a shell command line.
sub fetch {
    my ($url, $out, @extra) = @_;
    system('curl', '-s', @extra, '-A', 'GoogleBot', $url, '-o', $out) == 0
        or warn "curl failed for $url (status $?)\n";
    return;
}

# Slurp a file's lines; returns an empty list if the file cannot be opened
# (e.g. the fetch failed), so callers simply find no links.
sub read_lines {
    my ($file) = @_;
    open my $fh, '<', $file or return ();
    my @lines = <$fh>;
    close $fh;
    return @lines;
}

# Mirror the directory portion of a site-relative path under "." —
# "/content/339/6116/x.pdf" -> mkdir -p ./content/339/6116.
# Replaces the original's hard-coded ladders of chained `mkdir` backticks.
sub make_parent_dirs {
    my ($rel) = @_;
    (my $dir = $rel) =~ s{/[^/]*\z}{};    # strip the filename component
    make_path(".$dir") if length $dir;
    return;
}

# Pull the href out of a line of TOC/supplement HTML.  Splitting on double
# quotes, the attribute value is field 1 normally, field 3 on "last" lines.
# Returns undef when the line carries no quoted attribute at all.
# NOTE(review): fixes the original bug where the non-"last" assignment was
# commented out, leaving $bob stale from the previous iteration.
sub href_from_line {
    my ($line) = @_;
    my @field = split /"/, $line;
    return $line =~ /last/ ? $field[3] : $field[1];
}

# Starting point: volume 339, issue 6116; walk backwards through issues.
my $issue  = 6116;
my $volume = 339;

for (; $issue >= 1; $issue--) {
    sleep 5;    # let us not DDoS them

    my $toc_url  = "http://www.sciencemag.org/content/$volume/$issue.toc";
    my $toc_file = "$issue.toc";
    fetch($toc_url, $toc_file, '--retry', '3');
    my @toc = read_lines($toc_file);

    # Last issue in a volume: step down to the previous volume and retry.
    if (grep { /Content not found/ } @toc) {
        $volume--;
        $toc_url  = "http://www.sciencemag.org/content/$volume/$issue.toc";
        $toc_file = "$issue.$volume";
        fetch($toc_url, $toc_file, '--retry', '3');
        @toc = read_lines($toc_file);
    }

    # Collect supplementary-material links: the line before each
    # "Supporting Online Material" marker (the original's `grep -B 1`)
    # plus any line mentioning "suppl".
    my @supp;
    for my $i (0 .. $#toc) {
        if ($toc[$i] =~ /Supporting Online Material/) {
            push @supp, $toc[$i - 1] if $i > 0;
            push @supp, $toc[$i];
        }
        elsif ($toc[$i] =~ /suppl/) {
            push @supp, $toc[$i];
        }
    }
    my @papers = grep { /Full Text \(PDF\)/ } @toc;

    make_path("$volume/$issue");

    # Download every full-text PDF into ./volume/issue/.
    for my $line (@papers) {
        my $href = href_from_line($line);
        next unless defined $href && length $href;
        my @part = split m{/}, $href;    # part[4] is the article file name
        fetch("http://www.sciencemag.org$href", "$volume/$issue/$part[4]");
    }

    # Visit each supplementary-material page and grab its files.
    for my $line (@supp) {
        my $href = href_from_line($line);
        next unless defined $href && length $href;
        make_parent_dirs($href);
        fetch("http://www.sciencemag.org$href", ".$href");

        my @page      = read_lines(".$href");
        my @downloads = grep { /Download Supplement/ } @page;
        my @media     = grep { /Movie S/ || /Table S/ } @page;

        # The main supplement PDF goes next to the paper itself.
        for my $dl (@downloads) {
            my @field = split /"/, $dl;
            my $path  = $field[3];
            next unless defined $path && length $path;
            my $out;
            if ($path =~ /science/) {
                my @part = split m{/}, $path;
                $out = $part[7];
            }
            else {
                my @part = split /\./, $path;
                $out = "Supp.$part[2].pdf";
            }
            fetch("http://www.sciencemag.org$path", "$volume/$issue/$out");
        }

        # Movies and supplementary tables mirror the site's path layout
        # locally (./content/vol/issue/suppl/...).
        for my $item (@media) {
            my @field = split /"/, $item;
            my $path  = $field[3];
            next unless defined $path && length $path;
            make_parent_dirs($path);
            fetch("http://www.sciencemag.org$path", ".$path");
        }
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment