Last active
June 28, 2018 15:00
-
-
Save blmoore/9998523 to your computer and use it in GitHub Desktop.
For blog post
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
options(PlosApiKey = "<insert your API key here!>") | |
#install_github("rplos", "ropensci") | |
library("rplos") | |
library("ggplot2") | |
require("dplyr") | |
# Convert author strings to counts | |
countAuths <- function(cell) | |
length(unlist(strsplit(cell, ";"))) | |
countAuths <- Vectorize(countAuths) | |
# Query PLoS API for 1k papers per journal per year, | |
# count the number of authors and return a data.frame | |
getAuths <- function(j, lim=1000, start.year=2006){ | |
cat("Getting results for journal: ", j, "\n") | |
# seem to be in reverse order by year? | |
results <- sapply(start.year:2013, function(i) data.frame(year = i, | |
auths = searchplos( | |
q = paste0('publication_date:[', i, | |
'-01-01T00:00:00Z TO ', i, | |
'-12-31T23:59:59Z]'), | |
fl = "author", | |
fq = list("doc_type:full", | |
paste0("cross_published_journal_key:", j)), | |
start=0, limit=lim, sleep=6), | |
year=i), simplify=F) | |
results <- do.call(rbind, results) | |
results$counts <- countAuths(results$author) | |
results$journal <- j | |
results | |
} | |
journals <- journalnamekey() | |
plos.all <- sapply(journals[c(1:5, 7)], getAuths, simplify=F) | |
plos <- do.call(rbind, plos.all) | |
# Fig. 1: Bean plot showing distribution of author counts | |
# per journal overall | |
ggplot(plos, aes(x=journal, y=counts, fill=journal)) + | |
geom_violin(scale="width") + | |
geom_boxplot(width=.12, fill=I("black"), notch=T, | |
outlier.size=NA, col="grey40") + | |
stat_summary(fun.y="median", geom="point", shape=20, col="white") + | |
scale_y_log10(breaks=c(1:5, seq(10, 50, by=10), 100, 200, 300)) + | |
coord_flip() + labs(x="", y="Number of authors per paper") + | |
theme_classic() + theme(legend.position="none") + | |
scale_fill_brewer() | |
# Fig 2. ECDFs of the author count distributions | |
# 5in x 5in | |
ggplot(plos, aes(x=counts, col=journal)) + | |
stat_ecdf(geom="smooth", se=F, size=1.2) + theme_bw() + | |
scale_x_log10(breaks=c(1:5, seq(10, 50, by=10), 100, 200, 300)) + | |
theme(legend.position=c(.75,.33)) + | |
labs(x="Number of authors per paper", y="ECDF", | |
col="") + coord_cartesian(xlim=c(1,300)) + | |
scale_color_brewer(type="qual", palette=6) | |
# Fig 3. Trends in author counts over time with | |
# confidence limits on the means | |
# 7 x 7 | |
ggplot(plos, aes(x=year, y=counts, col=journal, fill=journal)) + | |
stat_summary(fun.data="mean_cl_boot", geom="ribbon", | |
width=.2, alpha=I(.5)) + | |
stat_summary(fun.y="mean", geom="line") + | |
labs(list(x="Year", y="Mean number of authors per paper")) + | |
theme_bw() + theme(legend.position=c(.2,.85)) + | |
scale_fill_brewer(type="qual", palette=2, | |
guide=guide_legend(direction="vertical", | |
label.position="bottom", | |
title=NULL, ncol=2, | |
label.hjust=0.5)) + | |
scale_color_brewer(type="qual", palette=2, guide="none") | |
# from http://stackoverflow.com/a/17024184/1274516 | |
# show regression equation on each graph facet | |
lm_eqn <- function(df){ | |
m <- summary(lm(counts ~ year, df)) | |
eq <- substitute(~~y~"="~beta*x+i~(R^2==r2), | |
list(beta = format(m$coefficients[2,"Estimate"], | |
digits = 3), | |
i = format(m$coefficients[1,"Estimate"], digits=3), | |
r2 = format(m$r.squared, digits=2))) | |
as.character(as.expression(eq)) | |
} | |
means <- group_by(plos, journal, year) %.% summarise(counts=mean(counts)) | |
b <- by(means, means$journal, lm_eqn) | |
df <- data.frame(beta=unclass(b), journal=names(b)) | |
summary(lm(counts ~ year + journal, data=means)) | |
means <- group_by(means, journal) %.% summarise(m=max(counts)) | |
df$top <- means$m * 1.2 | |
# Fig 4. Facetted linear regression of author inflation per journal | |
# 6 x 8.5 | |
ggplot(plos, aes(x=year, y=counts, col=journal, fill=journal)) + | |
stat_summary(fun.data="mean_cl_boot", geom="errorbar", | |
width=.2, alpha=I(.5)) + | |
stat_summary(fun.y="mean", geom="point") + | |
#stat_summary(fun.y="median", geom="point", shape=4) + | |
facet_wrap(~journal, scales="free_y") + | |
geom_smooth(method="lm") + | |
scale_x_continuous(breaks=2006:2013) + | |
labs(list(x="", y="Mean number of authors per paper")) + | |
theme_bw() + theme(axis.text.x=element_text(angle=45, hjust=1)) + | |
scale_fill_brewer(type="qual", palette=2, guide="none") + | |
scale_color_brewer(type="qual", palette=2, guide="none") + | |
geom_text(data=df, aes(x=2009.5, y=top, label=beta), size=3, parse=T) | |
# Overall estimate of author inflation? | |
# .21 extra authors per paper per year, on average | |
s <- summary(lm(counts ~ year + journal, data=plos)) | |
# Summary barchart data: | |
bc <- data.frame(journal = unique(means$journal), | |
trend = c(0.2490979, | |
0.1211823, | |
0.5201688, | |
0.4088536, | |
0.05894102, | |
0.1828939), | |
std.err = c(0.08224567, | |
0.02213142, | |
0.1493662, | |
0.06361849, | |
0.03891493, | |
0.03798822), | |
IF = c(12.690, | |
4.867, | |
8.517, | |
15.253, | |
3.730, | |
8.136)) | |
bc$journal <- factor(bc$journal, levels=bc$journal[order(bc$trend)]) | |
# Fig 5. Barchart of author inflation estimate per journal. | |
# 7 x 5 | |
ggplot(bc, aes(x=journal, y=trend, fill=journal, ymin=trend-std.err, | |
ymax=trend+std.err)) + | |
geom_bar(stat="identity") + | |
geom_errorbar(width=.2) + | |
scale_y_continuous(expand=c(0,0)) + | |
theme_classic() + | |
labs(x="", | |
y="Estimate of annual author inflation (additional mean authors per paper)") + | |
theme(axis.text.x=element_text(angle=45, hjust=1)) + | |
scale_fill_brewer(palette="Blues", guide="none") | |
pcc <- cor(bc$trend, bc$IF) | |
# Fig 6. Correlation of author inflation and journal impact factors. | |
# 5 x 5 | |
ggplot(bc, aes(x=trend, y=IF, col=journal)) + | |
geom_text(aes(label=journal)) + xlim(0,.6) + | |
labs(x="Author inflation estimate", | |
y="Journal impact factor (2012)") + | |
scale_color_brewer(type="qual", palette=2, guide="none") + | |
annotate("text", x=.05, y=15, | |
label=paste0("rho == ", format(pcc, digits=2)), parse=T) | |
# N.S. (p = 0.18) | |
cor.test(bc$trend, bc$IF) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment