Last active
September 22, 2016 23:10
-
-
Save aurora-mareviv/64e711b5516355f82fb8 to your computer and use it in GitHub Desktop.
PMIDs database, semi-automated
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#########################################################
#### CAPTURE ARTICLE INFO, IMPACT FACTORS FROM PMIDs ####
#########################################################
# Script to retrieve and organise publications data from PubMed (http://www.ncbi.nlm.nih.gov/pubmed)
# Uses the function ReadPubMed from the package RefManageR. It reads the PubMed API similarly to the search engine in PubMed's page.
# First, automagically install needed libraries:
list.of.packages <- c("RCurl", "RefManageR", "devtools", "plyr", "XML", "data.table")
# Keep only the packages that are not installed yet.
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[, "Package"])]
# Explicit length test (length() used as a bare truthy value is fragile style).
if (length(new.packages) > 0) {
  install.packages(new.packages, repos = "https://cran.rstudio.com/")
}
# 1. # Import PMIDs
library(RCurl)
# Source of PMIDs: plain-text CSV served by the IDIS wiki.
pmids.url <- "http://atriumkm.idisantiago.es/bin/ICT/PMIDs?xpage=plain&outputSyntax=plain"
pmids <- read.csv(pmids.url)
# head(pmids)
# Alternative source (gist mirror), kept for reference:
# urli <- getURL("https://gist.githubusercontent.com/aurora-mareviv/14e5837814a8d8d47c20/raw/90b198bae82154688dcd9a2596af798612e6619f/pmids.csv", ssl.verifypeer = FALSE)
# pmids <- read.csv(textConnection(urli))
# 2. Loop several queries to PubMed and return in a data.frame
# devtools must be attached BEFORE the source_url() call below
# (source_url is a devtools function; the original script attached it one
# line too late, which fails in a fresh session).
library(devtools)
library(RefManageR)
library(plyr)
library(XML)
index <- pmids$pmId          # all PMIDs (the original [1:length(...)] subset was a no-op)
index10 <- pmids$pmId[1:10]  # small sample, handy for quick tests
# my modified ReadPubMed function; it removes both "language" and "DOI" fields, to make easier to fit results in a dataframe.
source_url("https://gist.github.com/aurora-mareviv/8d5ee4c34d27cf084165/raw/4e66503e14ab767075fd743589ea797c01becd51/ReadPubMed.R") # workaround to source_gist error/bug?
# Query PubMed once per PMID and stack the results row-wise into a data.frame.
auth.pm <- ldply(index, function(x) {
  tmp <- unlist(ReadPubMed(x, database = "PubMed", mindate = 1950))
  # "person" objects (author lists) are collapsed to one comma-separated string
  # so each record fits in a single data.frame row.
  tmp <- lapply(tmp, function(z) if (is(z, "person")) paste0(z, collapse = ",") else z)
  data.frame(tmp, stringsAsFactors = FALSE)
})
# 3. Change weird symbols to make possible journal correspondences:
# Transliterate accented / mis-encoded characters to plain ASCII in every
# column so journal names match the lookup tables merged later on.
# All twelve original substitutions were single characters, so one chartr()
# pass per column is exactly equivalent to the previous stack of twelve
# as.data.frame(sapply(..., gsub, ...)) calls (which copied the whole data
# frame on every pass).
auth.pm[] <- lapply(auth.pm, function(col) {
  chartr("ñÃÄéáãäâóòúü", "niieaaaaoouu", as.character(col))
})
# 4. Capture Impact factor and Journal abbreviations
# Both lookup tables are built by external gist scripts.
# Import WOS data -- SEE THE SCRIPT: ifactors2014.R
ifactors.gist <- "https://gist.github.com/aurora-mareviv/459401dc782e5f054767"
source_gist(ifactors.gist) # ifactors database
# Import journal abbreviations -- SEE THE SCRIPT: journaltab.R
journaltab.gist <- "https://gist.github.com/aurora-mareviv/7680180cbe0283f87854"
source_gist(journaltab.gist) # journaltab database
# Capture journal abbreviations
# NOTE(review): the original call passed ignore.case = TRUE, but
# merge.data.frame has no such argument -- it was silently swallowed by `...`
# and the join has always been case-sensitive. The argument is removed here to
# stop misleading readers; for a truly case-insensitive join, lowercase the
# "journal" key in both tables before merging.
dum <- merge(auth.pm, journaltab, by = "journal", all.x = TRUE)
# Refill empty ISSN.Print cells with info in ISSN.Online
dum$ISSN.Print <- as.character(dum$ISSN.Print)
dum$ISSN.Online <- as.character(dum$ISSN.Online)
dum$ISSN.Print.Online <- ifelse(dum$ISSN.Print == "", dum$ISSN.Online, dum$ISSN.Print) # Only ISSN.Online if Print is empty
dum$ISSN.Online.Print <- ifelse(dum$ISSN.Online == "", dum$ISSN.Print, dum$ISSN.Online) # Only ISSN.Print if Online is empty
dum$ISSN <- dum$ISSN.Print.Online # our preference to search for IFs: match with ISSN.Print, and Online if Print is empty
# Merge with impact factors:
auth.if <- merge(dum, ifactors, by = "ISSN", all.x = TRUE)
# Correct ISSN to capture IFs driven by ISSN.Online:
auth.if$ISSN <- ifelse(is.na(auth.if$journal.bad), auth.if$ISSN.Online.Print, auth.if$ISSN)
# Re-merge with ifactors. Now we will have some junk-variables... but we capture as many IFs as we can!
auth.pmif <- merge(auth.if, ifactors, by = "ISSN", all.x = TRUE) # The field we are searching for is "auth.pmif$2013.2014.y"
# 5. Split auth.pmif$author in the authors:
library(data.table)
source_gist("https://gist.github.com/mrdwab/11380733") # function cSplit
auth.pmiff <- cSplit(auth.pmif, "author", ",")
# 6. Display data:
View(auth.pmiff)
# Impact factors are decimal numbers, and after the merges the column may be a
# factor: as.integer() on a factor returns the internal level codes (garbage),
# and on plain numbers it truncates the decimals. Convert via character ->
# numeric instead to recover the real values.
auth.pmiff$X2013.2014.y <- as.numeric(as.character(auth.pmiff$X2013.2014.y))
mean(auth.pmiff$X2013.2014.y, na.rm = TRUE)
hist(auth.pmiff$X2013.2014.y, breaks = 60)
# 7. Write output to .csv
#write.csv(auth.pm, file = "auth.pm.csv")
#write.csv(auth.pmif, file = "auth.pmif.csv")
#write.csv(auth.pmiff, file = "auth.pmiff.csv")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Para que este script funcione correctamente, es necesario que los 3 ficheros fuente (lista de PMIDs pmids, tabla de Abreviaturas journaltab y tabla de Factores de impacto ifactors) estén con los datos correctos y el formato adecuado.
Los links de los que se descargan estos datos (NCBI, CiteFactor) pueden cambiar.