jrherr · August 29, 2015 14:27
diff --git a/genome_plots.r b/genome_plots.r
 source("http://bioconductor.org/biocLite.R")
 biocLite("genomes")

 library(genomes)
 library(ggplot2)

 valid <- c("released", "created", "submitted")

 data(proks)
 update(proks)
 pn <- which(names(proks) %in% valid)[1]
 pt <- table(c(proks[, pn], Sys.Date()))
 pd <- data.frame("date" = as.Date(names(pt)), "genomes"=as.numeric(pt))
 pd$clade = 'prokaryotes'

 data(euks)
 update(euks)
 en <- which(names(euks) %in% valid)[1]
 et <- table(c(euks[, en], Sys.Date()))
 ed <- data.frame("date" = as.Date(names(et)), "genomes"=as.numeric(et))
 ed$clade = 'eukaryotes'

 data(virus)
 update(virus)
 vn <- which(names(virus) %in% valid)[1]
 vt <- table(c(virus[, vn], Sys.Date()))
 vd <- data.frame("date" = as.Date(names(vt)), "genomes"=as.numeric(vt))
 vd$clade = 'viruses'

 theme_set(theme_gray(base_size = 28))

 # plots of the number of sequenced genomes accumulating over time
 ggplot(vd, aes(x = date, y=cumsum(genomes))) + geom_line(size = 3, colour='red') + xlab("Year") + ylab("Number of sequenced genomes") + ggtitle("Viruses")
 ggplot(ed, aes(x = date, y=cumsum(genomes))) + geom_line(size = 3, colour='red') + xlab("Year") + ylab("Number of sequenced genomes") + ggtitle("Eukaryotes")
 ggplot(pd, aes(x = date, y=cumsum(genomes))) + geom_line(size = 3, colour='red') + xlab("Year") + ylab("Number of sequenced genomes") + ggtitle("Prokaryotes")

 # plots of genome size against number of protein coding genes
 ggplot(euks, aes(x = proteins, y = size)) + geom_point(alpha = 0.5, size=4) + ylim(c(0,3500)) + xlim(c(0,70000)) + ggtitle("Eukaryotes") + xlab("Number of protein coding genes") + ylab("Genome size (MB)")
 ggplot(proks, aes(x = proteins, y = size)) + geom_point(alpha = 0.33, size=4) + ylim(c(0,17)) + ggtitle("Prokaryotes") + xlab("Number of protein coding genes") + ylab("Genome size (MB)")
 ggplot(virus, aes(x = proteins, y = size)) + geom_point(alpha = 0.33, size=4)  + ylim(c(0,300)) + xlim(c(0,400)) + ggtitle("Viruses") + xlab("Number of protein coding genes") + ylab("Genome size (MB)")


 # to answer a twitter question (https://twitter.com/potatodoctor/status/630281782315909120), we need to look more closely at animals
 ggplot(euks[which(euks$group=="Animals"),], aes(x = proteins, y = size)) + geom_point(alpha = 0.95, size=4, aes(colour=subgroup)) + ylim(c(0,3500)) + xlim(c(0,70000)) + ggtitle("Animals") + xlab("Number of protein coding genes") + ylab("Genome size (MB)") + scale_colour_brewer(palette="Set1")

 # another question was what the R-squared on the Prokaryote plot is (https://twitter.com/razibkhan/status/630278982521323520)
 # note that to do this properly we'd need to control for non-independence due to relatedness, but we can get a rough idea like this
 m1 = lm(proks$size~proks$genes)
 summary(m1)

 # that R-squared looks high for the plot. However, if you re-draw the plot with smaller points, you can see that most of the points cluster RIGHT on the line
 ggplot(proks, aes(x = proteins, y = size)) + geom_point(alpha = 0.05, size=1.0) + ylim(c(0,17)) + ggtitle("Prokaryotes") + xlab("Number of protein coding genes") + ylab("Genome size (MB)")
	source("http://bioconductor.org/biocLite.R")
	biocLite("genomes")

	library(genomes)
	library(ggplot2)

	valid <- c("released", "created", "submitted")

	data(proks)
	update(proks)
	pn <- which(names(proks) %in% valid)[1]
	pt <- table(c(proks[, pn], Sys.Date()))
	pd <- data.frame("date" = as.Date(names(pt)), "genomes"=as.numeric(pt))
	pd$clade = 'prokaryotes'

	data(euks)
	update(euks)
	en <- which(names(euks) %in% valid)[1]
	et <- table(c(euks[, en], Sys.Date()))
	ed <- data.frame("date" = as.Date(names(et)), "genomes"=as.numeric(et))
	ed$clade = 'eukaryotes'

	data(virus)
	update(virus)
	vn <- which(names(virus) %in% valid)[1]
	vt <- table(c(virus[, vn], Sys.Date()))
	vd <- data.frame("date" = as.Date(names(vt)), "genomes"=as.numeric(vt))
	vd$clade = 'viruses'

	theme_set(theme_gray(base_size = 28))

	# plots of the number of sequenced genomes accumulating over time
	ggplot(vd, aes(x = date, y=cumsum(genomes))) + geom_line(size = 3, colour='red') + xlab("Year") + ylab("Number of sequenced genomes") + ggtitle("Viruses")
	ggplot(ed, aes(x = date, y=cumsum(genomes))) + geom_line(size = 3, colour='red') + xlab("Year") + ylab("Number of sequenced genomes") + ggtitle("Eukaryotes")
	ggplot(pd, aes(x = date, y=cumsum(genomes))) + geom_line(size = 3, colour='red') + xlab("Year") + ylab("Number of sequenced genomes") + ggtitle("Prokaryotes")

	# plots of genome size against number of protein coding genes
	ggplot(euks, aes(x = proteins, y = size)) + geom_point(alpha = 0.5, size=4) + ylim(c(0,3500)) + xlim(c(0,70000)) + ggtitle("Eukaryotes") + xlab("Number of protein coding genes") + ylab("Genome size (MB)")
	ggplot(proks, aes(x = proteins, y = size)) + geom_point(alpha = 0.33, size=4) + ylim(c(0,17)) + ggtitle("Prokaryotes") + xlab("Number of protein coding genes") + ylab("Genome size (MB)")
	ggplot(virus, aes(x = proteins, y = size)) + geom_point(alpha = 0.33, size=4) + ylim(c(0,300)) + xlim(c(0,400)) + ggtitle("Viruses") + xlab("Number of protein coding genes") + ylab("Genome size (MB)")


	# to answer a twitter question (https://twitter.com/potatodoctor/status/630281782315909120), we need to look more closely at animals
	ggplot(euks[which(euks$group=="Animals"),], aes(x = proteins, y = size)) + geom_point(alpha = 0.95, size=4, aes(colour=subgroup)) + ylim(c(0,3500)) + xlim(c(0,70000)) + ggtitle("Animals") + xlab("Number of protein coding genes") + ylab("Genome size (MB)") + scale_colour_brewer(palette="Set1")

	# another question was what the R-squared on the Prokaryote plot is (https://twitter.com/razibkhan/status/630278982521323520)
	# note that to do this properly we'd need to control for non-independence due to relatedness, but we can get a rough idea like this
	m1 = lm(proks$size~proks$genes)
	summary(m1)

	# that R-squared looks high for the plot. However, if you re-draw the plot with smaller points, you can see that most of the points cluster RIGHT on the line
	ggplot(proks, aes(x = proteins, y = size)) + geom_point(alpha = 0.05, size=1.0) + ylim(c(0,17)) + ggtitle("Prokaryotes") + xlab("Number of protein coding genes") + ylab("Genome size (MB)")