#########################################################
#### CAPTURE ABSTRACTS FROM PMIDs & MAKE A WORDCLOUD ####
#########################################################
# GNU-GPL license
# Author: Mareviv (https://talesofr.wordpress.com)
# Script to retrieve and mine abstracts from PubMed (http://www.ncbi.nlm.nih.gov/pubmed)
# Uses the function ReadPubMed from the package RefManageR. It queries the PubMed API much like the search engine on PubMed's page.
# This script creates two directories in your working directory: 'corpus1' for the abstracts file, and 'wordcloud' to store the plot.
# First, automagically install the needed libraries:
list.of.packages <- c("slam") # installing 'slam' gives error in OSX Yosemite/El Capitan. This is an attempt to fix it, specifying 'type="binary"'. | |
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])] | |
if(length(new.packages)) install.packages(new.packages, repos='https://cran.rstudio.com/', type="binary") | |
list.of.packages <- c("RCurl", "RefManageR", "plyr", "tm", "wordcloud", "SnowballC") | |
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])] | |
if(length(new.packages)) install.packages(new.packages, repos='https://cran.rstudio.com/') | |
# Get and store the working directory | |
wdir <- getwd() | |
# 1. Import PMIDs
message("retrieving PMIDs info...")
library(RCurl)
urli <- getURL("https://gist.githubusercontent.com/aurora-mareviv/14e5837814a8d8d47c20/raw/90b198bae82154688dcd9a2596af798612e6619f/pmids.csv", ssl.verifypeer = FALSE)
pmids <- read.csv(textConnection(urli))
message("PMID info successfully retrieved")
# 2. Loop several queries to PubMed and return them in a data.frame
index <- pmids$pmId[1:length(pmids$pmId)]
# The (free) PubMed API may give problems with large queries, so we'll prefer a shorter vector for this test:
index50 <- pmids$pmId[1:50]
library(RefManageR)
library(plyr)
message("connecting to the free PubMed API...")
auth.pm <- ldply(index50, function(x){
  tmp <- unlist(ReadPubMed(x, database = "PubMed", mindate = 1950)) # ReadPubMed returns a BibEntry object; unlist() flattens it to named fields
  tmp <- lapply(tmp, function(z) if(is(z, "person")) paste0(z, collapse = ",") else z) # collapse author lists into a single string
  data.frame(tmp, stringsAsFactors = FALSE)
})
message("abstract data successfully downloaded!")
# 3. Create a directory to write the abstracts.txt file into (this folder can only contain this .txt file!):
corpus.dir <- paste(wdir, "corpus1", sep="/")
message(paste("creating new directory: ", corpus.dir, sep=""))
dir.create(corpus.dir)
setwd(corpus.dir)
# 4. Extract the abstracts to a .txt file
text <- paste(auth.pm$abstract)
message(paste("writing file: ", corpus.dir, "/abstracts.txt", sep=""))
writeLines(text, "abstracts.txt")
# 5. Create the tagcloud
library(tm)
library(wordcloud)
library(SnowballC)
message("constructing the tagcloud...")
abstract <- Corpus(DirSource(corpus.dir)) # import the text file in this directory
abstract <- tm_map(abstract, stripWhitespace) # transformations
abstract <- tm_map(abstract, content_transformer(tolower))
abstract <- tm_map(abstract, removeWords, stopwords("english"))
# abstract <- tm_map(abstract, stemDocument) # optional in this case
abstract <- tm_map(abstract, removeNumbers) # optional in this case
abstract <- tm_map(abstract, removePunctuation)
# tuning: drop section headers and filler words that are frequent in abstracts but carry no meaning
abstract <- tm_map(abstract, removeWords, c("methods", "results", "conclusions", "conclusion", "whether", "due"))
# 6. Print the image into a new folder: wordcloud
plot.dir <- paste(wdir, "wordcloud", sep="/")
message(paste("creating new directory: ", plot.dir, sep=""))
dir.create(plot.dir)
setwd(plot.dir)
message(paste("printing file: ", plot.dir, "/wordcloud.png", sep=""))
png(file = "wordcloud.png", width = 1500, height = 1500, units = "px", res = 300, bg = "transparent")
wordcloud(abstract, scale=c(5,0.5), max.words=150, random.order=FALSE, rot.per=0.35, use.r.layout=FALSE, colors=brewer.pal(8, "Dark2"))
dev.off()
# 7. Reset the working directory
setwd(wdir)
Note: This script has been moved to GitHub for further development.
Quick wordclouds from PubMed abstracts - using PMID lists in R
Wordclouds are one of the most visually straightforward and compelling ways of displaying text information in a graph. Of course, there are plenty of web pages (and even apps) that, given an input text, will plot some nice tagclouds for you. However, when you need reproducible results, or have to get complex tasks done (like combined wordclouds from several files), a programming environment may be the best option.
In R there are (as always) several alternatives to get this done, such as tagcloud and wordcloud.
For this script I used the following packages: RCurl to retrieve a PMID list, stored in this GitHub account as a .csv file; RefManageR and plyr to retrieve and arrange the PubMed records (to fetch the info from the inets, we'll be using the PubMed API, free version, with some limitations); and tm and SnowballC to prepare the data, with wordcloud to plot the wordcloud. This part of the script is based on this one from Georeferenced.
One of the advantages of using RefManageR is that you can easily change the field you are importing from, and it usually works flawlessly with the PubMed API.
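As a quick illustration of that field flexibility, fetching a single record and switching fields looks roughly like this (the PMID below is an arbitrary placeholder of mine, not one from the list):
# Fetch one PubMed record; ReadPubMed returns a BibEntry object
# (the PMID here is just an illustrative placeholder):
library(RefManageR)
entry <- ReadPubMed("26502924", database = "PubMed")
entry$title # change the field at will: entry$abstract, entry$author, entry$year...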
My biggest problem sources when running this script: download caps, busy hours, and firewalls!
At the beginning of the gist, there is also a handy function that automagically downloads all needed packages for you.
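Wrapped up as a reusable helper, that install-if-missing pattern looks roughly like this (the helper name install_if_missing is my own, not from the gist):
# A reusable sketch of the gist's auto-install pattern
# (the name 'install_if_missing' is my own, not from the gist):
install_if_missing <- function(pkgs, repos = "https://cran.rstudio.com/") {
  new <- pkgs[!(pkgs %in% installed.packages()[, "Package"])]
  if (length(new)) install.packages(new, repos = repos)
  invisible(lapply(pkgs, require, character.only = TRUE))
}
install_if_missing(c("RCurl", "RefManageR", "plyr", "tm", "wordcloud", "SnowballC"))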
To source the script, simply type in the R console:
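(Something like the following should work; the raw-file name after /raw/ is an assumption here, so copy the exact URL from the gist's "Raw" button if it differs.)
# Source the gist straight from its raw URL
# (the file name 'pubmed.wordcloud.R' is assumed, not confirmed):
source("https://gist.githubusercontent.com/aurora-mareviv/697cbb505189591648224ed640e70fb1/raw/pubmed.wordcloud.R")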
This script creates two directories in your working directory: 'corpus1' for the abstracts file, and 'wordcloud' to store the plot.