aurora-mareviv · March 11, 2021 13:29 · aurora-mareviv · Oct 6, 2016
diff --git a/pmid.tagcloud.R b/pmid.tagcloud.R
 #########################################################
 #### CAPTURE ABSTRACTS FROM PMIDs & MAKE A WORDCLOUD ####
 #########################################################

 # GNU-GPL license
 # Author: Mareviv (https://talesofr.wordpress.com)

 # Script to retrieve and mine abstracts from PubMed (http://www.ncbi.nlm.nih.gov/pubmed)
 # Uses the function ReadPubMed from the package RefManageR, which queries the PubMed API in much the same way as the search box on the PubMed website.
 # This script creates two directories in your working directory: 'corpus1' for the abstracts file, and 'wordcloud' to store the plot.

 # First, automagically install any needed libraries that are not yet installed.
 # 'slam' is handled separately: installing it gives an error in OSX
 # Yosemite/El Capitan, so a binary build is requested where one is available.
 # On platforms whose native package type is "source" (e.g. Linux),
 # install.packages(type = "binary") stops with an error, so fall back to
 # "source" there instead of aborting the whole script.
 cran.repo <- "https://cran.rstudio.com/"
 slam.type <- if (identical(.Platform$pkgType, "source")) "source" else "binary"

 list.of.packages <- c("slam")
 new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[, "Package"])]
 if (length(new.packages) > 0) {
   install.packages(new.packages, repos = cran.repo, type = slam.type)
 }

 # Remaining dependencies are installed with the platform's default package type.
 list.of.packages <- c("RCurl", "RefManageR", "plyr", "tm", "wordcloud", "SnowballC")
 new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[, "Package"])]
 if (length(new.packages) > 0) {
   install.packages(new.packages, repos = cran.repo)
 }

 # Get and store the working directory (it is restored at the end of the script)
 wdir <- getwd()

            
 # 1. Import PMIDs
 # Downloads the CSV of PubMed IDs from the gist and parses it into 'pmids'.
 message("retrieving PMIDs info...")
 library(RCurl)
 # NOTE(review): ssl.verifypeer = FALSE switches off TLS certificate verification
 # for this download. It is kept here for backward compatibility; confirm whether
 # it is still required on your platform and remove it if not.
 urli <- getURL("https://gist.githubusercontent.com/aurora-mareviv/14e5837814a8d8d47c20/raw/90b198bae82154688dcd9a2596af798612e6619f/pmids.csv", ssl.verifypeer = FALSE)
 # Parse the downloaded string directly: read.csv(text = ...) opens and closes
 # its own connection, whereas a bare textConnection() was never closed.
 pmids <- read.csv(text = urli)
 # Fail early, with a clear message, if the download is not the expected table.
 if (!("pmId" %in% names(pmids))) {
   stop("downloaded PMID table has no 'pmId' column", call. = FALSE)
 }
 message("PMID info succesfully retrieved")
            
            
 # 2. Loop several queries to PubMed and return in a data.frame
 # Full vector of PubMed IDs (available for runs against the complete list).
 index <- pmids$pmId
 # The PubMed (free) API may give problems with large queries, so we'll prefer a shorter vector for this test.
 # head() returns at most 50 IDs; indexing with [1:50] would pad a shorter list with NA.
 index50 <- head(pmids$pmId, 50)

 library(RefManageR)
 library(plyr)
 message("connecting to the free PubMed API...")
 # One ReadPubMed() query per ID; each result is converted to a data.frame and
 # ldply() combines the per-ID data.frames into 'auth.pm'.
 auth.pm <- ldply(index50, function(x){
   tmp <- unlist(ReadPubMed(x, database = "PubMed", mindate = 1950))
   # Fields of class 'person' are collapsed into one comma-separated string.
   # inherits() is used so the check does not rely on the 'methods' package
   # being attached (is() lives there).
   tmp <- lapply(tmp, function(z) if (inherits(z, "person")) paste0(z, collapse = ",") else z)
   data.frame(tmp, stringsAsFactors = FALSE)
 })
 message("abstract data successfully downloaded!")

                        
 # 3. Create a directory to write the abstracts.txt file into: (this folder can only contain this .txt file!)
 corpus.dir <- file.path(wdir, "corpus1")
 message(paste("creating new directory: ", corpus.dir, sep=""))
 # showWarnings = FALSE: the folder already existing (e.g. on a re-run) is fine.
 dir.create(corpus.dir, showWarnings = FALSE)


 # 4. Extract abstracts to a .txt
 # Entries with a missing abstract (NA) are dropped; pasting them would write
 # the literal word "NA" into the corpus and, from there, into the tagcloud.
 text <- as.character(auth.pm$abstract)
 text <- text[!is.na(text)]
 message(paste("writing file: ", corpus.dir, "/abstracts.txt", sep=""))
 # Write through the full path rather than changing the working directory, so
 # a failure later in the script does not leave the session inside corpus1.
 writeLines(text, file.path(corpus.dir, "abstracts.txt"))

                        
 # 5. Create tagcloud
 library(tm)
 library(wordcloud)
 library(SnowballC)

 message("constructing the tagcloud...")
 # Build the corpus from the files found in corpus.dir
 abstract <- Corpus(DirSource(corpus.dir))

 # Clean-up passes, applied in this order
 abstract <- tm_map(abstract, stripWhitespace)
 abstract <- tm_map(abstract, content_transformer(tolower))
 abstract <- tm_map(abstract, removeWords, stopwords("english"))
 # abstract <- tm_map(abstract, stemDocument) # optional in this case
 abstract <- tm_map(abstract, removeNumbers) # optional in this case
 abstract <- tm_map(abstract, removePunctuation)

 # Tuning: a few extra words are removed one at a time, in the listed order
 for (extra.word in c("methods", "results", "conclusions", "conclusion", "whether", "due")) {
   abstract <- tm_map(abstract, removeWords, extra.word)
 }


 # 6. Print image in a new folder: wordcloud
 plot.dir <- file.path(wdir, "wordcloud")
 message(paste("creating new directory: ", plot.dir, sep=""))
 # showWarnings = FALSE: the folder already existing (e.g. on a re-run) is fine.
 dir.create(plot.dir, showWarnings = FALSE)
 message(paste("printing file: ", plot.dir, "/wordcloud.png", sep=""))
 # The device is opened on the full output path, so no setwd() is needed here.
 png(filename = file.path(plot.dir, "wordcloud.png"), width = 1500, height = 1500, units = "px", res = 300, bg = "transparent")
 # 'finally' closes the png device even when wordcloud() fails, so an error
 # does not leave a dangling graphics device (and a locked/partial file) behind.
 tryCatch(
   wordcloud(abstract, scale=c(5,0.5), max.words=150, random.order=FALSE, rot.per=0.35, use.r.layout=FALSE, colors=brewer.pal(8, "Dark2")),
   finally = dev.off()
 )


 # 7. Reset the working directory
 # Kept because an earlier step of the script may have changed it.
 setwd(wdir)
	#########################################################
	#### CAPTURE ABSTRACTS FROM PMIDs & MAKE A WORDCLOUD ####
	#########################################################

	# GNU-GPL license
	# Author: Mareviv (https://talesofr.wordpress.com)

	# Script to retrieve and mine abstracts from PubMed (http://www.ncbi.nlm.nih.gov/pubmed)
	# Uses the function ReadPubMed from the package RefManageR, which queries the PubMed API in much the same way as the search box on the PubMed website.
	# This script creates two directories in your working directory: 'corpus1' for the abstracts file, and 'wordcloud' to store the plot.

	# First, automagically install any needed libraries that are not yet installed.
	# 'slam' is handled separately: installing it gives an error in OSX
	# Yosemite/El Capitan, so a binary build is requested where one is available.
	# On platforms whose native package type is "source" (e.g. Linux),
	# install.packages(type = "binary") stops with an error, so fall back to
	# "source" there instead of aborting the whole script.
	cran.repo <- "https://cran.rstudio.com/"
	slam.type <- if (identical(.Platform$pkgType, "source")) "source" else "binary"

	list.of.packages <- c("slam")
	new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[, "Package"])]
	if (length(new.packages) > 0) {
	  install.packages(new.packages, repos = cran.repo, type = slam.type)
	}

	# Remaining dependencies are installed with the platform's default package type.
	list.of.packages <- c("RCurl", "RefManageR", "plyr", "tm", "wordcloud", "SnowballC")
	new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[, "Package"])]
	if (length(new.packages) > 0) {
	  install.packages(new.packages, repos = cran.repo)
	}

	# Get and store the working directory (it is restored at the end of the script)
	wdir <- getwd()


	# 1. Import PMIDs
	# Downloads the CSV of PubMed IDs from the gist and parses it into 'pmids'.
	message("retrieving PMIDs info...")
	library(RCurl)
	# NOTE(review): ssl.verifypeer = FALSE switches off TLS certificate verification
	# for this download. It is kept here for backward compatibility; confirm whether
	# it is still required on your platform and remove it if not.
	urli <- getURL("https://gist.githubusercontent.com/aurora-mareviv/14e5837814a8d8d47c20/raw/90b198bae82154688dcd9a2596af798612e6619f/pmids.csv", ssl.verifypeer = FALSE)
	# Parse the downloaded string directly: read.csv(text = ...) opens and closes
	# its own connection, whereas a bare textConnection() was never closed.
	pmids <- read.csv(text = urli)
	# Fail early, with a clear message, if the download is not the expected table.
	if (!("pmId" %in% names(pmids))) {
	  stop("downloaded PMID table has no 'pmId' column", call. = FALSE)
	}
	message("PMID info succesfully retrieved")


	# 2. Loop several queries to PubMed and return in a data.frame
	# Full vector of PubMed IDs (available for runs against the complete list).
	index <- pmids$pmId
	# The PubMed (free) API may give problems with large queries, so we'll prefer a shorter vector for this test.
	# head() returns at most 50 IDs; indexing with [1:50] would pad a shorter list with NA.
	index50 <- head(pmids$pmId, 50)

	library(RefManageR)
	library(plyr)
	message("connecting to the free PubMed API...")
	# One ReadPubMed() query per ID; each result is converted to a data.frame and
	# ldply() combines the per-ID data.frames into 'auth.pm'.
	auth.pm <- ldply(index50, function(x){
	  tmp <- unlist(ReadPubMed(x, database = "PubMed", mindate = 1950))
	  # Fields of class 'person' are collapsed into one comma-separated string.
	  # inherits() is used so the check does not rely on the 'methods' package
	  # being attached (is() lives there).
	  tmp <- lapply(tmp, function(z) if (inherits(z, "person")) paste0(z, collapse = ",") else z)
	  data.frame(tmp, stringsAsFactors = FALSE)
	})
	message("abstract data successfully downloaded!")


	# 3. Create a directory to write the abstracts.txt file into: (this folder can only contain this .txt file!)
	corpus.dir <- file.path(wdir, "corpus1")
	message(paste("creating new directory: ", corpus.dir, sep=""))
	# showWarnings = FALSE: the folder already existing (e.g. on a re-run) is fine.
	dir.create(corpus.dir, showWarnings = FALSE)


	# 4. Extract abstracts to a .txt
	# Entries with a missing abstract (NA) are dropped; pasting them would write
	# the literal word "NA" into the corpus and, from there, into the tagcloud.
	text <- as.character(auth.pm$abstract)
	text <- text[!is.na(text)]
	message(paste("writing file: ", corpus.dir, "/abstracts.txt", sep=""))
	# Write through the full path rather than changing the working directory, so
	# a failure later in the script does not leave the session inside corpus1.
	writeLines(text, file.path(corpus.dir, "abstracts.txt"))


	# 5. Create tagcloud
	library(tm)
	library(wordcloud)
	library(SnowballC)

	message("constructing the tagcloud...")
	# Build the corpus from the files found in corpus.dir
	abstract <- Corpus(DirSource(corpus.dir))

	# Clean-up passes, applied in this order
	abstract <- tm_map(abstract, stripWhitespace)
	abstract <- tm_map(abstract, content_transformer(tolower))
	abstract <- tm_map(abstract, removeWords, stopwords("english"))
	# abstract <- tm_map(abstract, stemDocument) # optional in this case
	abstract <- tm_map(abstract, removeNumbers) # optional in this case
	abstract <- tm_map(abstract, removePunctuation)

	# Tuning: fold the extra words over the corpus, removing one word per pass
	# in the listed order
	abstract <- Reduce(
	  function(corp, extra.word) tm_map(corp, removeWords, extra.word),
	  c("methods", "results", "conclusions", "conclusion", "whether", "due"),
	  abstract
	)


	# 6. Print image in a new folder: wordcloud
	plot.dir <- file.path(wdir, "wordcloud")
	message(paste("creating new directory: ", plot.dir, sep=""))
	# showWarnings = FALSE: the folder already existing (e.g. on a re-run) is fine.
	dir.create(plot.dir, showWarnings = FALSE)
	message(paste("printing file: ", plot.dir, "/wordcloud.png", sep=""))
	# The device is opened on the full output path, so no setwd() is needed here.
	png(filename = file.path(plot.dir, "wordcloud.png"), width = 1500, height = 1500, units = "px", res = 300, bg = "transparent")
	# 'finally' closes the png device even when wordcloud() fails, so an error
	# does not leave a dangling graphics device (and a locked/partial file) behind.
	tryCatch(
	  wordcloud(abstract, scale=c(5,0.5), max.words=150, random.order=FALSE, rot.per=0.35, use.r.layout=FALSE, colors=brewer.pal(8, "Dark2")),
	  finally = dev.off()
	)


	# 7. Reset the working directory
	# Kept because an earlier step of the script may have changed it.
	setwd(wdir)