-
-
Save knbknb/1638899 to your computer and use it in GitHub Desktop.
Pubmed Queries: last paper on microarray bioinformatics, when to appear?
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Tidyverse-R-code from 2020 | |
# (this code is also artificially slowed down, with purrr::slowly(), | |
# to avoid HTTP 429 errors (too many requests) | |
library(tidyverse) | |
library(httr) | |
library(xml2) | |
library(lubridate) | |
theme_set(theme_bw()) | |
httr::set_config(httr::config(http_version = 0)) | |
# time frame we are considering: | |
# year 1997 - today | |
start_year <- 1995 | |
#start_year <- 2015 | |
end_year <- lubridate::year( today()) | |
end_prediction <- 2023 # a few years into the future | |
y_max <- 1000 | |
delay_seconds <- 1 | |
loess_smoothness <- 0.5 # 1 - smooth, 0 - wiggly | |
years <- start_year:end_year | |
#terms <- c("term1"="microarray[title]", "term2"="((next generation sequencing[title]) OR (high throughput sequencing[title]))") | |
#term_labels <- c("term1"="microarray", "term2"="next generation sequencing (NGS)") | |
#terms <- c("term1"="COVID-19[title]", "term2"="((MERS[title]) OR (SARS[title]))", "term3"="(influenza[title])") | |
#term_labels <- c("COVID-19"="blue", "MERS/SARS"="red", "Influenza"="brown") | |
#terms <- c("term1"="Zika[title]", "term2"="(Chikungunya[title])", "term3"="(ebola[title])") | |
#term_labels <- c("Zika"="blue", "Chikungunya"="red", "Ebola"="brown") | |
#terms <- c("term1"="Influenza[title]", "term2"="(Malaria[title])", "term3"="(Dengue[title])") | |
#term_labels <- c("Influenza"="blue", "Malaria"="red", "Dengue"="brown") | |
terms <- c("term1"="COVID-19[title] AND superspread*[title]", | |
"term2"="((virus[title]) AND (infectious[title]))", | |
"term3"="(COVID-19[title] AND infecti*[title])") | |
term_labels <- c("COVID-19/superspread*"="blue", "virus/infectious"="red", "COVID-19/infecti*"="brown") | |
base_url <- "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi" | |
query_params <- list("db" = "pubmed") | |
dfr_lbl <- data.frame( | |
year = c(end_year), | |
value = c(y_max), | |
text = c("current year") | |
) | |
# optional function: show NCPI endpoint url in warnings | |
show_warning <- function(base_url, query_params) { | |
l <- purrr::reduce(map2(names(query_params), query_params, str_c, sep = "="), str_c, sep = "&") | |
l <- str_c(base_url, l, sep = "?") | |
warning(l) | |
} | |
# call purrr::slowly() avoid HTTP error 429 - too many requests | |
throttled_get <- slowly(~ GET(url = .x, query = .y), rate = rate_delay(delay_seconds), quiet = FALSE) | |
get_count <- function(year, x) { | |
# add to list | |
query_params <- x[["query_params"]] | |
term <- x[["term"]] | |
query_params["term"] = sprintf("%s AND %s[pdat]", term, year) | |
resp <- throttled_get(url = x[["base_url"]], query= query_params) | |
xml <- read_html(resp) | |
err <- xml_contents(xml) %>% | |
xml2::xml_find_first(x = ., xpath = ".//error/text()") %>% | |
as.character() %>% str_length() %>% coalesce(., 0) | |
if(err) return(-1) | |
rv <- xml_contents(xml) %>% | |
xml2::xml_find_first(x = ., xpath = ".//count/text()") %>% | |
as.character() %>% | |
as.integer() | |
rv | |
} | |
#show_warning(base_url, query_params) | |
argslist <- map(c("term1", "term2", "term3"), function(x, terms){ | |
list("base_url" = base_url, | |
"term" = terms[[x]], | |
"query_params" = query_params) | |
}, terms) | |
ncbi_data <- tibble( | |
type = "observed", `year` = years, | |
term1 = map(years, get_count, argslist[[1]]), | |
term2 = map(years, get_count, argslist[[2]]), | |
term3 = map(years, get_count, argslist[[3]]) | |
) %>% | |
unnest(c(term1, term2, term3)) | |
dfr <- ncbi_data %>% | |
filter(year >= start_year) | |
mdf <- pivot_longer(dfr, cols=c("term1", "term2", "term3"), names_to = "search_term") | |
mdf | |
format_my_plot <- function(gg, maxval_y, to_year = end_year){ | |
# some constant values are defined out of the scope of this function | |
gg + | |
geom_vline(xintercept = end_year, linetype = "dashed", size = 0.5, color ="dodgerblue") + | |
geom_text(data = dfr_lbl, aes(x = year, y = value, label= text), color = "dodgerblue", hjust = 1.1) + | |
scale_y_continuous(breaks = seq(from = 0, to = maxval_y %/% 200 * 200 + 200, by = 200), | |
limits = c(0, y_max)) + | |
scale_x_continuous(breaks = seq(from = start_year, to = to_year, by = 2), | |
limits = c(start_year, end_prediction)) + | |
stat_smooth(aes(y = value, color = search_term), | |
data = mdf, #subset(mdf, search_term == "term3"), | |
method = "loess", se = FALSE, span = loess_smoothness) + | |
theme(axis.text.x = element_text(size = 7), | |
legend.position = "bottom") + | |
scale_color_manual(labels = names(term_labels), values = as.character(term_labels)) + | |
guides(color=guide_legend("Search terms")) + # add guide properties by aesthetic | |
labs(y = "# papers / year", | |
title = "Pubmed Search: Comparing search terms: ", | |
subtitle = glue::glue("{ names(term_labels[1]) }, { names(term_labels[2]) }, { names(term_labels[3]) }\nNumbers of articles published, { min(years) }-{ max(years) }"), | |
caption = "Source: NCBI/PubMed") | |
} | |
p <- ggplot(mdf, aes(x = year)) | |
p <- p + geom_point(aes(y = value, color = search_term), size = 3) | |
mx <- max(mdf$value) | |
mx <- min(y_max, mx) | |
p <- format_my_plot(p, mx) | |
print(p) | |
# let's peek into the future | |
mdl_loess_term1 <- loess(term1 ~ year, data = dfr, span = loess_smoothness, | |
control = loess.control(surface = "direct")) | |
# when will it stop? | |
term1_predict <- predict( | |
mdl_loess_term1, | |
newdata = tibble( | |
year = (max(years) + 1):end_prediction) | |
) %>% | |
as.integer() | |
# predict term2 growth | |
extrap_years <- tibble(year = (max(years) + 1):end_prediction) | |
mdl_loess_term2 <- loess(term2 ~ year, dfr, control = loess.control(surface = "direct"), span = loess_smoothness) | |
term2_predict <- predict(mdl_loess_term2, | |
newdata = extrap_years) %>% | |
as.integer() | |
mdl_loess_term3 <- loess(term3 ~ year, dfr, control = loess.control(surface = "direct"), span = loess_smoothness) | |
term3_predict <- predict(mdl_loess_term3, | |
newdata = extrap_years) %>% | |
as.integer() | |
pred_df <- tibble(type = "predicted", | |
year = extrap_years$year, | |
term1 = term1_predict, | |
term2 = term2_predict, | |
term3 = term3_predict) | |
df2 <- bind_rows(dfr, pred_df) | |
n_curr <- dfr %>% filter(year == end_year) %>% pull(term1) | |
dfr_lbl <- dfr_lbl %>% | |
mutate(text = glue::glue("{text} ({end_year}):\nn={n_curr} papers about {names(term_labels[1])}")) | |
mdf2 <- pivot_longer(df2, cols=c("term1", "term2", "term3"), names_to = "search_term") | |
mx2 <- max(mdf2$value) | |
p2 <- ggplot(mdf2, aes(x = year)) | |
p2 <- p2 + geom_point(aes(y = value, color = search_term, shape = type), size = 3) | |
p2 <- format_my_plot(p2, mx2, to_year = end_prediction) | |
print(p2) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library("plyr") | |
library("reshape2") | |
library("XML") | |
library("ggplot2") | |
#Concatenate SQL-style | |
concat<-function(...,sep="",collapse=NULL){ | |
strings<-list(...) | |
#NULL, NA | |
if( | |
all(unlist(llply(strings,length))>0) | |
&& | |
all(!is.na(unlist(strings))) | |
){ | |
do.call("paste", c(strings, list(sep = sep, collapse = collapse))) | |
}else{ | |
NULL | |
} | |
} | |
getCount<-function(term){function(year){ | |
nihUrl<-concat("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=",term," AND ",year,"[pdat]") | |
#cleanurl<-gsub('\\]','%5D',gsub('\\[','%5B',x=url)) | |
#http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=microarray%5btitle%5d+2003%5bpdat%5d | |
#xml<-xmlTreeParse(URLencode(nihUrl),isURL=TRUE) | |
xml<-xmlTreeParse(nihUrl,isURL=TRUE) | |
#Data Mashups in R, pg17 | |
as.numeric(xmlValue(xml$doc$children$eSearchResult$children$Count$children$text)) | |
}} | |
years<-1995:2015 | |
df<-data.frame(type="obs",year=years, | |
mic=sapply(years,function(x){do.call(getCount('microarray[title]'),list(x))}), | |
ngs=sapply(years,function(x){do.call(getCount('((next generation sequencing[title]) OR (high throughput sequencing[title]))'),list(x))}) | |
) | |
df | |
#97 is a fair start | |
df<-subset(df,year>=1997) | |
mdf<-melt(df,id.vars=c("type","year"),variable_name="citation") | |
mdf | |
c<-ggplot(mdf,aes(x=year)) | |
p<-c+geom_point(aes(y=value,color=variable),size=3) + | |
ylab("# papers / year") + | |
stat_smooth(aes(y=value,color=variable),data=subset(mdf,variable=="mic"),method="loess") + | |
scale_x_continuous(breaks=seq(from=1997,to=max(years),by=2)) | |
print(p) | |
#Return 0 for negative elements | |
# noNeg(c(3,2,1,0,-1,-2,2)) | |
# [1] 3 2 1 0 0 0 2 | |
noNeg<-function(v){sapply(v,function(x){max(x,0)})} | |
#Return up to the first negative/zero element inclusive | |
# toZeroNoNeg(c(3,2,1,0,-1,-2,2)) | |
# [1] 3 2 1 0 | |
toZeroNoNeg<-function(v){noNeg(v)[1:firstZero(noNeg(v))]} | |
#return index of first zero | |
firstZero<-function(v){which(noNeg(v)==0)[1]} | |
#let's peer into the future | |
#df.lo.mic<-loess(mic ~ year,df,control=loess.control(surface="direct")) | |
df.lo.mic<-loess(mic ~ year,data=df,control=loess.control(surface="direct")) | |
#when will it stop? | |
mic_predict<-as.integer(predict(df.lo.mic,data.frame(year=(max(years)+1):2035),se=FALSE)) | |
zero_year<-max(years)+firstZero(mic_predict) | |
cat(concat("LOESS projects ",sum(toZeroNoNeg(mic_predict))," more damn microarray papers.")) | |
cat(concat("The last damn microarray paper is projected to be in ",(zero_year-1),".")) | |
#predict ngs growth | |
df.lo.ngs<-loess(ngs ~ year,df,control=loess.control(surface="direct")) | |
ngs_predict<-as.integer(predict(df.lo.ngs,data.frame(year=(max(years)+1):zero_year),se=FALSE)) | |
head(df,1) | |
pred_df<-data.frame(type="pred",year=c((max(years)+1):zero_year),mic=c(toZeroNoNeg(mic_predict)),ngs=ngs_predict) | |
df2<-rbind(df,pred_df) | |
mdf2<-melt(df2,id.vars=c("type","year"),variable_name="citation") | |
c2<-ggplot(mdf2,aes(x=year)) | |
p2 <- c2+geom_point(aes(y=value,color=variable,shape=type),size=3) + | |
ylab("# papers / year") + | |
scale_y_continuous(breaks=seq(from=0,to=max(mdf2$value) %/% 200 * 200 + 200, by=200))+ | |
scale_x_continuous(breaks=seq(from=1997,to=zero_year,by=2)) | |
print(p2) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment