Skip to content

Instantly share code, notes, and snippets.

@aurora-mareviv
Last active September 22, 2016 23:10
Show Gist options
  • Save aurora-mareviv/64e711b5516355f82fb8 to your computer and use it in GitHub Desktop.
PMIDs database, semi-automatized
#########################################################
#### CAPTURE ARTICLE INFO, IMPACT FACTORS FROM PMIDs ####
#########################################################
# Script to retrieve and organise publications data from PubMed (http://www.ncbi.nlm.nih.gov/pubmed)
# Uses the function ReadPubMed from the package RefManageR. It reads the PubMed API similarly to the search engine in PubMed's page.
# First, automagically install needed libraries:
# Determine which of the required packages are not yet installed, and install only those.
required.pkgs <- c("RCurl", "RefManageR", "devtools", "plyr", "XML", "data.table")
missing.pkgs <- setdiff(required.pkgs, installed.packages()[, "Package"])
if (length(missing.pkgs) > 0) {
  install.packages(missing.pkgs, repos = 'https://cran.rstudio.com/')
}
# 1. # Import PMIDs
library(RCurl)
# Read the PMID list from the remote service (plain CSV output). Expects a column "pmId".
pmids <- read.csv("http://atriumkm.idisantiago.es/bin/ICT/PMIDs?xpage=plain&outputSyntax=plain")
# head(pmids)
# Fallback mirror of the same list (kept for reference):
# urli <- getURL("https://gist.githubusercontent.com/aurora-mareviv/14e5837814a8d8d47c20/raw/90b198bae82154688dcd9a2596af798612e6619f/pmids.csv", ssl.verifypeer = FALSE)
# pmids <- read.csv(textConnection(urli))
# 2. Loop several queries to PubMed and return in a data.frame
index <- pmids$pmId                 # full vector of PMIDs (subsetting with 1:length(x) was a redundant no-op)
index10 <- pmids$pmId[seq_len(10)]  # small sample, handy for quick test runs
library(RefManageR)
# FIX: devtools must be attached BEFORE source_url() is called below; the original
# called source_url() first and would fail with "could not find function" in a fresh session.
library(devtools)
library(plyr)
library(XML)
# my modified ReadPubMed function; it removes both "language" and "DOI" fields, to make easier to fit results in a dataframe.
source_url("https://gist.github.com/aurora-mareviv/8d5ee4c34d27cf084165/raw/4e66503e14ab767075fd743589ea797c01becd51/ReadPubMed.R") # workaround to source_gist error/bug?
# Query PubMed once per PMID and stack the per-article results into one data.frame.
auth.pm <- ldply(index, function(x){
  tmp <- unlist(ReadPubMed(x, database = "PubMed", mindate = 1950))
  # Author lists come back as "person" objects: flatten each to a single comma-separated string
  # so every field fits in one data.frame cell.
  tmp <- lapply(tmp, function(z) if(is(z, "person")) paste0(z, collapse = ",") else z)
  data.frame(tmp, stringsAsFactors = FALSE)
})
# 3. Change weird symbols to make possible journal correspondences:
# Replacement table: accented/garbled characters -> plain ASCII equivalents.
# NOTE(review): the "Ä­" pattern looks like mojibake (UTF-8 "i-breve" read as Latin-1);
# kept byte-identical on purpose since it must match the source data as downloaded.
char.map <- c("ñ" = "n", "í" = "i", "Ä­" = "i", "é" = "e",
              "á" = "a", "ã" = "a", "ä" = "a", "â" = "a",
              "ó" = "o", "ò" = "o", "ú" = "u", "ü" = "u")
# Apply each substitution to every column, exactly as the original twelve repeated
# as.data.frame(sapply(..., gsub, ...)) lines did (each pass coerces columns via sapply).
for (i in seq_along(char.map)) {
  auth.pm <- as.data.frame(sapply(auth.pm, gsub,
                                  pattern = names(char.map)[i],
                                  replacement = char.map[[i]]))
}
# 4. Capture Impact factor and Journal abbreviations
# Import WOS data -- SEE THE SCRIPT: ifactors2014.R
source_gist("https://gist.github.com/aurora-mareviv/459401dc782e5f054767") # ifactors database
# Import journal abbreviations -- SEE THE SCRIPT: journaltab.R
source_gist("https://gist.github.com/aurora-mareviv/7680180cbe0283f87854") # journaltab database
# Capture journal abbreviations.
# FIX: merge.data.frame() has no ignore.case argument -- the original passed one and it
# was silently swallowed via `...`. Removed to avoid implying case-insensitive matching:
# this join on "journal" is (and always was) case-sensitive.
dum <- merge(auth.pm, journaltab, by = "journal", all.x = TRUE)
# Refill empty ISSN.Print cells with info in ISSN.Online
dum$ISSN.Print <- as.character(dum$ISSN.Print)
dum$ISSN.Online <- as.character(dum$ISSN.Online)
dum$ISSN.Print.Online <- ifelse(dum$ISSN.Print=="",dum$ISSN.Online,dum$ISSN.Print) # Only ISSN.Online if Print is empty
dum$ISSN.Online.Print <- ifelse(dum$ISSN.Online=="",dum$ISSN.Print,dum$ISSN.Online) # Only ISSN.Print if Online is empty
dum$ISSN <- dum$ISSN.Print.Online # our preference to search for IFs: match with ISSN.Print, and Online if Print is empty
# Merge with impact factors:
auth.if <- merge(dum, ifactors, by="ISSN", all.x=TRUE)
# Correct ISSN to capture IFs driven by ISSN.Online:
# (presumably journal.bad comes from ifactors, so NA marks rows the first merge
#  failed to match -- those rows retry with the Online-first ISSN; verify against ifactors2014.R)
auth.if$ISSN <- ifelse(is.na(auth.if$journal.bad), auth.if$ISSN.Online.Print, auth.if$ISSN)
# Re-merge with ifactors. Now we will have some junk-variables... but we capture as many IFs as we can!
auth.pmif <- merge(auth.if, ifactors, by="ISSN", all.x=TRUE) # The field we are searching for is "auth.pmif$2013.2014.y"
# 5. Split auth.pmif$author in the authors:
library(data.table)
source_gist("https://gist.github.com/mrdwab/11380733") # function cSplit
# One column per author (author_1, author_2, ...), split on the commas inserted in step 2.
auth.pmiff <- cSplit(auth.pmif, "author", ",")
# 6. Display data:
View(auth.pmiff)
# Coerce the 2013/2014 impact-factor column (second ifactors merge, hence the .y suffix)
# to integer so it can be summarised and plotted; non-numeric entries become NA.
auth.pmiff$X2013.2014.y <- as.integer(auth.pmiff$X2013.2014.y)
mean(auth.pmiff$X2013.2014.y, na.rm = TRUE)
hist(auth.pmiff$X2013.2014.y, breaks=60)
# 7. Write output to .csv
# Uncomment to save each intermediate stage to the working directory:
#write.csv(auth.pm, file = "auth.pm.csv")
#write.csv(auth.pmif, file = "auth.pmif.csv")
#write.csv(auth.pmiff, file = "auth.pmiff.csv")
@aurora-mareviv
Copy link
Author

Para que este script funcione correctamente, es necesario que los 3 ficheros fuente (lista de PMIDs pmids, tabla de Abreviaturas journaltab y tabla de Factores de impacto ifactors) estén con los datos correctos y el formato adecuado.
Los links de los que se descargan estos datos (NCBI, CiteFactor) pueden cambiar.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment