Skip to content

Instantly share code, notes, and snippets.

@aurora-mareviv
Last active September 22, 2016 23:10
Show Gist options
  • Save aurora-mareviv/64e711b5516355f82fb8 to your computer and use it in GitHub Desktop.
PMIDs database, semi-automatized
#########################################################
#### CAPTURE ARTICLE INFO, IMPACT FACTORS FROM PMIDs ####
#########################################################
# Script to retrieve and organise publications data from PubMed (http://www.ncbi.nlm.nih.gov/pubmed)
# Uses the function ReadPubMed from the package RefManageR. It reads the PubMed API similarly to the search engine in PubMed's page.
# First, automagically install needed libraries:
# Determine which of the required packages are not yet installed, and install only those.
required.pkgs <- c("RCurl", "RefManageR", "devtools", "plyr", "XML", "data.table")
missing.pkgs <- setdiff(required.pkgs, installed.packages()[, "Package"])
if (length(missing.pkgs) > 0) {
  install.packages(missing.pkgs, repos = 'https://cran.rstudio.com/')
}
# 1. # Import PMIDs
library(RCurl)
# Read the PMID list from the remote service (plain CSV output). Expects a column "pmId".
pmids <- read.csv("http://atriumkm.idisantiago.es/bin/ICT/PMIDs?xpage=plain&outputSyntax=plain")
# head(pmids)
# Fallback mirror of the same list (kept for reference):
# urli <- getURL("https://gist.githubusercontent.com/aurora-mareviv/14e5837814a8d8d47c20/raw/90b198bae82154688dcd9a2596af798612e6619f/pmids.csv", ssl.verifypeer = FALSE)
# pmids <- read.csv(textConnection(urli))
# 2. Loop several queries to PubMed and return in a data.frame
index <- pmids$pmId                 # full vector of PMIDs (subsetting with 1:length(x) was a redundant no-op)
index10 <- pmids$pmId[seq_len(10)]  # small sample, handy for quick test runs
library(RefManageR)
# FIX: devtools must be attached BEFORE source_url() is called below; the original
# called source_url() first and would fail with "could not find function" in a fresh session.
library(devtools)
library(plyr)
library(XML)
# my modified ReadPubMed function; it removes both "language" and "DOI" fields, to make easier to fit results in a dataframe.
source_url("https://gist.github.com/aurora-mareviv/8d5ee4c34d27cf084165/raw/4e66503e14ab767075fd743589ea797c01becd51/ReadPubMed.R") # workaround to source_gist error/bug?
# Query PubMed once per PMID and stack the per-article results into one data.frame.
auth.pm <- ldply(index, function(x){
  tmp <- unlist(ReadPubMed(x, database = "PubMed", mindate = 1950))
  # Author lists come back as "person" objects: flatten each to a single comma-separated string
  # so every field fits in one data.frame cell.
  tmp <- lapply(tmp, function(z) if(is(z, "person")) paste0(z, collapse = ",") else z)
  data.frame(tmp, stringsAsFactors = FALSE)
})
# 3. Change weird symbols to make possible journal correspondences:
# Replacement table: accented/garbled characters -> plain ASCII equivalents.
# NOTE(review): the "Ä­" pattern looks like mojibake (UTF-8 "i-breve" read as Latin-1);
# kept byte-identical on purpose since it must match the source data as downloaded.
char.map <- c("ñ" = "n", "í" = "i", "Ä­" = "i", "é" = "e",
              "á" = "a", "ã" = "a", "ä" = "a", "â" = "a",
              "ó" = "o", "ò" = "o", "ú" = "u", "ü" = "u")
# Apply each substitution to every column, exactly as the original twelve repeated
# as.data.frame(sapply(..., gsub, ...)) lines did (each pass coerces columns via sapply).
for (i in seq_along(char.map)) {
  auth.pm <- as.data.frame(sapply(auth.pm, gsub,
                                  pattern = names(char.map)[i],
                                  replacement = char.map[[i]]))
}
# 4. Capture Impact factor and Journal abbreviations
# Import WOS data -- SEE THE SCRIPT: ifactors2014.R
source_gist("https://gist.github.com/aurora-mareviv/459401dc782e5f054767") # ifactors database
# Import journal abbreviations -- SEE THE SCRIPT: journaltab.R
source_gist("https://gist.github.com/aurora-mareviv/7680180cbe0283f87854") # journaltab database
# Capture journal abbreviations.
# FIX: merge.data.frame() has no ignore.case argument -- the original passed one and it
# was silently swallowed via `...`. Removed to avoid implying case-insensitive matching:
# this join on "journal" is (and always was) case-sensitive.
dum <- merge(auth.pm, journaltab, by = "journal", all.x = TRUE)
# Refill empty ISSN.Print cells with info in ISSN.Online
dum$ISSN.Print <- as.character(dum$ISSN.Print)
dum$ISSN.Online <- as.character(dum$ISSN.Online)
dum$ISSN.Print.Online <- ifelse(dum$ISSN.Print=="",dum$ISSN.Online,dum$ISSN.Print) # Only ISSN.Online if Print is empty
dum$ISSN.Online.Print <- ifelse(dum$ISSN.Online=="",dum$ISSN.Print,dum$ISSN.Online) # Only ISSN.Print if Online is empty
dum$ISSN <- dum$ISSN.Print.Online # our preference to search for IFs: match with ISSN.Print, and Online if Print is empty
# Merge with impact factors:
auth.if <- merge(dum, ifactors, by="ISSN", all.x=TRUE)
# Correct ISSN to capture IFs driven by ISSN.Online:
# (presumably journal.bad comes from ifactors, so NA marks rows the first merge
#  failed to match -- those rows retry with the Online-first ISSN; verify against ifactors2014.R)
auth.if$ISSN <- ifelse(is.na(auth.if$journal.bad), auth.if$ISSN.Online.Print, auth.if$ISSN)
# Re-merge with ifactors. Now we will have some junk-variables... but we capture as many IFs as we can!
auth.pmif <- merge(auth.if, ifactors, by="ISSN", all.x=TRUE) # The field we are searching for is "auth.pmif$2013.2014.y"
# 5. Split auth.pmif$author in the authors:
library(data.table)
source_gist("https://gist.github.com/mrdwab/11380733") # function cSplit
# One column per author (author_1, author_2, ...), split on the commas inserted in step 2.
auth.pmiff <- cSplit(auth.pmif, "author", ",")
# 6. Display data:
View(auth.pmiff)
# Coerce the 2013/2014 impact-factor column (second ifactors merge, hence the .y suffix)
# to integer so it can be summarised and plotted; non-numeric entries become NA.
auth.pmiff$X2013.2014.y <- as.integer(auth.pmiff$X2013.2014.y)
mean(auth.pmiff$X2013.2014.y, na.rm = TRUE)
hist(auth.pmiff$X2013.2014.y, breaks=60)
# 7. Write output to .csv
# Uncomment to save each intermediate stage to the working directory:
#write.csv(auth.pm, file = "auth.pm.csv")
#write.csv(auth.pmif, file = "auth.pmif.csv")
#write.csv(auth.pmiff, file = "auth.pmiff.csv")
@aurora-mareviv
Copy link
Author

Para que este script funcione correctamente, es necesario que los 3 ficheros fuente (lista de PMIDs pmids, tabla de Abreviaturas journaltab y tabla de Factores de impacto ifactors) estén con los datos correctos y el formato adecuado.
Los links de los que se descargan estos datos (NCBI, CiteFactor) pueden cambiar.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment