## Modified from http://www.r-bloggers.com/how-to-download-complete-xml-records-from-pubmed-and-extract-data/
# Requires the XML and RCurl packages (install them first if necessary).
library(XML)
library(RCurl)
searchPubMed <- function(query.term) {
  # Change spaces to + in the query
  query.gsub <- gsub(" ", "+", query.term)
  # Change single quotes to %22, the URL-encoded double quote
  # (PubMed uses double quotes for phrase searches)
  query.gsub <- gsub("'", "%22", query.gsub)
  # Perform the search with usehistory=y, which stores the matching PMIDs server-side
  pub.esearch <- getURL(paste("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=",
                              query.gsub, "&usehistory=y", sep = ""))
  # Parse the esearch XML
  pub.esearch <- xmlTreeParse(pub.esearch, asText = TRUE)
  # Count the number of hits (superassigned so it remains visible to the caller)
  pub.count <<- as.numeric(xmlValue(pub.esearch[["doc"]][["eSearchResult"]][["Count"]]))
  # Save the WebEnv string; it identifies all articles found by the search
  pub.esearch <- xmlValue(pub.esearch[["doc"]][["eSearchResult"]][["WebEnv"]])
  # Show how many articles are being downloaded
  cat("Searching (downloading", pub.count, "articles)\n")
  ## Download in batches, since efetch caps each request at 10,000 articles ##
  # Start at record 0 and fetch at most 10,000 records per request
  RetStart <- 0
  RetMax <- 10000
  # Calculate how many iterations are needed
  Runs <- (pub.count %/% RetMax) + 1
  # Create an empty object to accumulate the batches
  pub.efetch <- NULL
  # Loop to batch-download
  for (i in 1:Runs) {
    # Download one batch of XML for the hits saved in pub.esearch (WebEnv)
    x <- getURL(paste("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&WebEnv=",
                      pub.esearch, "&query_key=1&retmode=xml&retstart=", RetStart, "&retmax=", RetMax, sep = ""))
    # Strip the per-batch XML declaration, DOCTYPE and root element so the
    # batches can be concatenated into one well-formed document
    x <- gsub("<\\?xml[^>]*\\?>|<!DOCTYPE[^>]*>|</?PubmedArticleSet[^>]*>", "", x)
    # Append this batch to the previous downloads
    pub.efetch <- paste(pub.efetch, x, sep = "")
    # Advance the offset for the next batch (retmax itself stays at 10,000)
    RetStart <- RetStart + RetMax
  }
  # Re-wrap the combined records in a single root element
  pub.efetch <- paste("<PubmedArticleSet>", pub.efetch, "</PubmedArticleSet>", sep = "")
  # Report that the download is complete
  cat("Completed download from PubMed.\n")
  # Parse and return the combined XML
  return(xmlTreeParse(pub.efetch, useInternalNodes = TRUE, asText = TRUE))
}
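
## A minimal usage sketch (the query term below is a made-up example; any
## valid PubMed search string should work, given network access):
# pub.xml <- searchPubMed("influenza vaccine effectiveness")
# pub.count  # number of records found, superassigned by searchPubMed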
Count1st <- function(xmldata, matchstr) {
  # Return the PMIDs of the articles whose first author's first listed
  # affiliation matches matchstr (case-insensitive); articles without
  # affiliation information are skipped
  return(unlist(xpathSApply(xmldata, "//PubmedArticle/MedlineCitation", function(node) {
    affiliation <- getNodeSet(node, "./Article/AuthorList/Author/AffiliationInfo")
    if (length(affiliation) > 0 &&
        length(grep(matchstr, xmlValue(affiliation[[1]]), ignore.case = TRUE)) != 0)
      return(xmlValue(getNodeSet(node, "./PMID")[[1]]))
  })))
}
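
## Usage sketch combining the two functions: download the records for a
## query, then list the PMIDs whose first author's affiliation mentions a
## pattern. Both the query and the pattern "japan" are hypothetical examples:
# pub.xml <- searchPubMed("rheumatoid arthritis AND biologics")
# matched.pmids <- Count1st(pub.xml, "japan")
# cat(length(matched.pmids), "of", pub.count, "articles matched.\n")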