Skip to content

Instantly share code, notes, and snippets.

@blmoore
Last active June 28, 2018 15:00
Show Gist options
  • Save blmoore/9998523 to your computer and use it in GitHub Desktop.
Save blmoore/9998523 to your computer and use it in GitHub Desktop.
For blog post
options(PlosApiKey = "<insert your API key here!>")
#install_github("rplos", "ropensci")
library("rplos")
library("ggplot2")
require("dplyr")
# Convert author strings to counts
countAuths <- function(cell)
length(unlist(strsplit(cell, ";")))
countAuths <- Vectorize(countAuths)
# Query PLoS API for 1k papers per journal per year,
# count the number of authors and return a data.frame
getAuths <- function(j, lim=1000, start.year=2006){
cat("Getting results for journal: ", j, "\n")
# seem to be in reverse order by year?
results <- sapply(start.year:2013, function(i) data.frame(year = i,
auths = searchplos(
q = paste0('publication_date:[', i,
'-01-01T00:00:00Z TO ', i,
'-12-31T23:59:59Z]'),
fl = "author",
fq = list("doc_type:full",
paste0("cross_published_journal_key:", j)),
start=0, limit=lim, sleep=6),
year=i), simplify=F)
results <- do.call(rbind, results)
results$counts <- countAuths(results$author)
results$journal <- j
results
}
journals <- journalnamekey()
plos.all <- sapply(journals[c(1:5, 7)], getAuths, simplify=F)
plos <- do.call(rbind, plos.all)
# Fig. 1: Bean plot showing distribution of author counts
# per journal overall
ggplot(plos, aes(x=journal, y=counts, fill=journal)) +
geom_violin(scale="width") +
geom_boxplot(width=.12, fill=I("black"), notch=T,
outlier.size=NA, col="grey40") +
stat_summary(fun.y="median", geom="point", shape=20, col="white") +
scale_y_log10(breaks=c(1:5, seq(10, 50, by=10), 100, 200, 300)) +
coord_flip() + labs(x="", y="Number of authors per paper") +
theme_classic() + theme(legend.position="none") +
scale_fill_brewer()
# Fig 2. ECDFs of the author count distributions
# 5in x 5in
ggplot(plos, aes(x=counts, col=journal)) +
stat_ecdf(geom="smooth", se=F, size=1.2) + theme_bw() +
scale_x_log10(breaks=c(1:5, seq(10, 50, by=10), 100, 200, 300)) +
theme(legend.position=c(.75,.33)) +
labs(x="Number of authors per paper", y="ECDF",
col="") + coord_cartesian(xlim=c(1,300)) +
scale_color_brewer(type="qual", palette=6)
# Fig 3. Trends in author counts over time with
# confidence limits on the means
# 7 x 7
ggplot(plos, aes(x=year, y=counts, col=journal, fill=journal)) +
stat_summary(fun.data="mean_cl_boot", geom="ribbon",
width=.2, alpha=I(.5)) +
stat_summary(fun.y="mean", geom="line") +
labs(list(x="Year", y="Mean number of authors per paper")) +
theme_bw() + theme(legend.position=c(.2,.85)) +
scale_fill_brewer(type="qual", palette=2,
guide=guide_legend(direction="vertical",
label.position="bottom",
title=NULL, ncol=2,
label.hjust=0.5)) +
scale_color_brewer(type="qual", palette=2, guide="none")
# from http://stackoverflow.com/a/17024184/1274516
# show regression equation on each graph facet
lm_eqn <- function(df){
m <- summary(lm(counts ~ year, df))
eq <- substitute(~~y~"="~beta*x+i~(R^2==r2),
list(beta = format(m$coefficients[2,"Estimate"],
digits = 3),
i = format(m$coefficients[1,"Estimate"], digits=3),
r2 = format(m$r.squared, digits=2)))
as.character(as.expression(eq))
}
means <- group_by(plos, journal, year) %.% summarise(counts=mean(counts))
b <- by(means, means$journal, lm_eqn)
df <- data.frame(beta=unclass(b), journal=names(b))
summary(lm(counts ~ year + journal, data=means))
means <- group_by(means, journal) %.% summarise(m=max(counts))
df$top <- means$m * 1.2
# Fig 4. Facetted linear regression of author inflation per journal
# 6 x 8.5
ggplot(plos, aes(x=year, y=counts, col=journal, fill=journal)) +
stat_summary(fun.data="mean_cl_boot", geom="errorbar",
width=.2, alpha=I(.5)) +
stat_summary(fun.y="mean", geom="point") +
#stat_summary(fun.y="median", geom="point", shape=4) +
facet_wrap(~journal, scales="free_y") +
geom_smooth(method="lm") +
scale_x_continuous(breaks=2006:2013) +
labs(list(x="", y="Mean number of authors per paper")) +
theme_bw() + theme(axis.text.x=element_text(angle=45, hjust=1)) +
scale_fill_brewer(type="qual", palette=2, guide="none") +
scale_color_brewer(type="qual", palette=2, guide="none") +
geom_text(data=df, aes(x=2009.5, y=top, label=beta), size=3, parse=T)
# Overall estimate of author inflation?
# .21 extra authors per paper per year, on average
s <- summary(lm(counts ~ year + journal, data=plos))
# Summary barchart data:
bc <- data.frame(journal = unique(means$journal),
trend = c(0.2490979,
0.1211823,
0.5201688,
0.4088536,
0.05894102,
0.1828939),
std.err = c(0.08224567,
0.02213142,
0.1493662,
0.06361849,
0.03891493,
0.03798822),
IF = c(12.690,
4.867,
8.517,
15.253,
3.730,
8.136))
bc$journal <- factor(bc$journal, levels=bc$journal[order(bc$trend)])
# Fig 5. Barchart of author inflation estimate per journal.
# 7 x 5
ggplot(bc, aes(x=journal, y=trend, fill=journal, ymin=trend-std.err,
ymax=trend+std.err)) +
geom_bar(stat="identity") +
geom_errorbar(width=.2) +
scale_y_continuous(expand=c(0,0)) +
theme_classic() +
labs(x="",
y="Estimate of annual author inflation (additional mean authors per paper)") +
theme(axis.text.x=element_text(angle=45, hjust=1)) +
scale_fill_brewer(palette="Blues", guide="none")
pcc <- cor(bc$trend, bc$IF)
# Fig 6. Correlation of author inflation and journal impact factors.
# 5 x 5
ggplot(bc, aes(x=trend, y=IF, col=journal)) +
geom_text(aes(label=journal)) + xlim(0,.6) +
labs(x="Author inflation estimate",
y="Journal impact factor (2012)") +
scale_color_brewer(type="qual", palette=2, guide="none") +
annotate("text", x=.05, y=15,
label=paste0("rho == ", format(pcc, digits=2)), parse=T)
# N.S. (p = 0.18)
cor.test(bc$trend, bc$IF)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment