Last active
March 11, 2021 13:29
-
-
Save aurora-mareviv/697cbb505189591648224ed640e70fb1 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#########################################################
#### CAPTURE ABSTRACTS FROM PMIDs & MAKE A WORDCLOUD ####
#########################################################
# GNU-GPL license
# Author: Mareviv (https://talesofr.wordpress.com)
# Script to retrieve and mine abstracts from PubMed (http://www.ncbi.nlm.nih.gov/pubmed)
# Uses the function ReadPubMed from the package RefManageR. It reads the PubMed API similarly to the search engine in PubMed's page.
# This script creates two directories in your working directory: 'corpus1' for the abstracts file, and 'wordcloud' to store the plot.
# First, automagically install needed libraries.
# Helper: return the subset of `pkgs` that is not installed yet.
packages.to.install <- function(pkgs) {
  pkgs[!(pkgs %in% rownames(installed.packages()))]
}
cran.repo <- "https://cran.rstudio.com/"

# Installing 'slam' from source gives an error in OSX Yosemite/El Capitan, so a
# binary build is requested where the platform offers one. On platforms whose
# native package type is "source" (e.g. Linux), type = "binary" is rejected by
# install.packages(), so the source type is used there instead.
slam.type <- if (identical(.Platform$pkgType, "source")) "source" else "binary"
list.of.packages <- c("slam")
new.packages <- packages.to.install(list.of.packages)
if (length(new.packages) > 0) {
  install.packages(new.packages, repos = cran.repo, type = slam.type)
}

# Remaining dependencies are installed with the platform's default type.
list.of.packages <- c("RCurl", "RefManageR", "plyr", "tm", "wordcloud", "SnowballC")
new.packages <- packages.to.install(list.of.packages)
if (length(new.packages) > 0) {
  install.packages(new.packages, repos = cran.repo)
}
# Get and store the working directory
wdir <- getwd()
# 1. Import PMIDs
message("retrieving PMIDs info...")
library(RCurl)
# NOTE(review): ssl.verifypeer = FALSE disables TLS certificate verification
# for this download. It is kept to preserve existing behaviour; consider
# enabling verification where a CA bundle is available to RCurl.
urli <- getURL(
  "https://gist.githubusercontent.com/aurora-mareviv/14e5837814a8d8d47c20/raw/90b198bae82154688dcd9a2596af798612e6619f/pmids.csv",
  ssl.verifypeer = FALSE
)
# read.csv(text = ) manages its own text connection and closes it on exit,
# unlike an explicit textConnection() that would be left open.
pmids <- read.csv(text = urli)
message("PMID info succesfully retrieved")
# 2. Loop several queries to PubMed and return in a data.frame
index <- pmids$pmId
# The PubMed (free) API may give problems with large queries, so we'll prefer a
# shorter vector for this test. head() returns fewer elements when fewer than
# 50 PMIDs are available, instead of padding the vector with NA as [1:50] does.
index50 <- head(pmids$pmId, 50)
library(RefManageR)
library(plyr)
message("connecting to the free PubMed API...")
# One ReadPubMed() call per PMID; ldply() row-binds the per-PMID data.frames.
auth.pm <- ldply(index50, function(x) {
  tmp <- unlist(ReadPubMed(x, database = "PubMed", mindate = 1950))
  # Fields of class 'person' are collapsed into one comma-separated string;
  # inherits() is used because it lives in base and needs no 'methods' attach.
  tmp <- lapply(tmp, function(z) {
    if (inherits(z, "person")) paste0(z, collapse = ",") else z
  })
  data.frame(tmp, stringsAsFactors = FALSE)
})
message("abstract data successfully downloaded!")
# 3. Create a directory to write the abstracts.txt file into: (this folder can only contain this .txt file!)
corpus.dir <- file.path(wdir, "corpus1")
message("creating new directory: ", corpus.dir)
# Skip creation when the directory is already there (e.g. on a re-run), so no
# "already exists" warning is raised.
if (!dir.exists(corpus.dir)) {
  dir.create(corpus.dir)
}
# 4. Extract abstracts to a .txt
# Records without an abstract are NA; they are dropped so that the literal
# string "NA" is not written to the file and later counted as a word.
text <- as.character(auth.pm$abstract)
text <- text[!is.na(text)]
# The file is written through its full path, so the working directory does not
# need to be changed here.
abstracts.file <- file.path(corpus.dir, "abstracts.txt")
message("writing file: ", abstracts.file)
writeLines(text, abstracts.file)
# 5. Create tagcloud
library(tm)
library(wordcloud)
library(SnowballC)
message("constructing the tagcloud...")
# Import only abstracts.txt from the corpus directory, so that any other file
# that happens to be in that folder is not mixed into the corpus.
abstract <- Corpus(DirSource(corpus.dir, pattern = "^abstracts\\.txt$"))
# Transformations
abstract <- tm_map(abstract, stripWhitespace)
abstract <- tm_map(abstract, content_transformer(tolower))
abstract <- tm_map(abstract, removeWords, stopwords("english"))
# abstract <- tm_map(abstract, stemDocument) # optional in this case
abstract <- tm_map(abstract, removeNumbers) # optional in this case
abstract <- tm_map(abstract, removePunctuation)
# Tuning: drop section headings and filler words in a single pass
tuning.words <- c("methods", "results", "conclusions", "conclusion", "whether", "due")
abstract <- tm_map(abstract, removeWords, tuning.words)
# 6. Print image in a new folder: wordcloud
plot.dir <- file.path(wdir, "wordcloud")
message("creating new directory: ", plot.dir)
# Skip creation when the directory is already there (e.g. on a re-run).
if (!dir.exists(plot.dir)) {
  dir.create(plot.dir)
}
# The PNG is opened through its full path, so no setwd() is needed to place it.
plot.file <- file.path(plot.dir, "wordcloud.png")
message("printing file: ", plot.file)
png(filename = plot.file, width = 1500, height = 1500, units = "px", res = 300, bg = "transparent")
# 'finally' closes the graphics device even when wordcloud() fails, so a failed
# run does not leave an open device behind.
invisible(tryCatch(
  wordcloud(
    abstract,
    scale = c(5, 0.5),
    max.words = 150,
    random.order = FALSE,
    rot.per = 0.35,
    use.r.layout = FALSE,
    colors = brewer.pal(8, "Dark2")
  ),
  finally = dev.off()
))
# 7. Reset the working directory
setwd(wdir)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
GitHub development
This script has been moved to GitHub for further development