Created
March 30, 2014 21:33
-
-
Save cpsievert/9880212 to your computer and use it in GitHub Desktop.
Obtain all abstracts for the DOIs scraped from https://github.com/elifesciences/elife-articles
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(XML) | |
library(RCurl) | |
library(stringr) | |
library(elife) | |
# Obtain all the dois! Well, at least every one on GitHub.
# Should/could this be an option in searchelife? It doesn't take that long...
# Download the repository landing page and parse the returned text as HTML.
con <- getURL("https://github.com/elifesciences/elife-articles")
doc <- htmlParse(con, asText = TRUE)
# Select the anchors carrying the 'js-directory-link' class and read their text.
nodes <- getNodeSet(doc, path = "//a[@class='js-directory-link']")
# vapply() always yields a character vector, even when no node matched
# (sapply() would hand back an empty list in that case).
files <- vapply(nodes, xmlValue, character(1))
# Keep only the article XML files; the dot is escaped so it matches a
# literal "." rather than any character.
files <- files[grepl("elife[0-9]+\\.xml", files)]
# paste0() recycles a zero-length argument to "", so an empty `files` would
# silently produce the single bogus DOI "10.7554/eLife.". Fail loudly instead.
if (length(files) == 0L) {
  stop(
    "No elife article files found on the repository page; ",
    "the page markup may have changed.",
    call. = FALSE
  )
}
# The first run of digits in each file name is used as the article number.
nums <- str_extract(files, "[0-9]+")
dois <- paste0("10.7554/eLife.", nums)
# Passing all these dois at once to elife_doi returns error...
# For now, just query in chunks of at most `chunk_size` DOIs.
chunk_size <- 100L
# Group the DOIs into consecutive batches: elements 1-100 fall in batch 1,
# 101-200 in batch 2, and so on. An empty `dois` yields no batches, whereas
# seq(1, 0, by = 100) would raise an error.
batches <- split(dois, ceiling(seq_along(dois) / chunk_size))
# One elife_doi() call per batch; results are collected in a list instead of
# being grown with c() inside a loop.
batch_abstracts <- lapply(batches, elife_doi, ret = "abstract")
# Concatenate the per-batch results in batch order. unname() keeps c() from
# prefixing element names with the batch labels; with no batches the result
# is NULL.
# NOTE(review): `abs` masks base::abs(); the name is kept so that any code
# reading this variable keeps working.
abs <- do.call(c, unname(batch_abstracts))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment