Skip to content

Instantly share code, notes, and snippets.

@cldotdev
Last active December 11, 2015 18:29
Show Gist options
  • Select an option

  • Save cldotdev/4642041 to your computer and use it in GitHub Desktop.

Select an option

Save cldotdev/4642041 to your computer and use it in GitHub Desktop.
###########################################################################################
## Compute N50 values for Assembly Results and Plot their Cumulative Length Distribution ##
###########################################################################################
## Author: Thomas Girke
## Last update: January 28, 2010
## Utilities: Compute N50 and plot cumulative length distribution.
## More details can be found here: http://manuals.bioinformatics.ucr.edu/home/ht-seq#TOC-Analyzing-Assembly-Results
## Overview:
## The N50 is a weighted median statistic such that 50% of the entire assembly
## is contained in contigs equal to or larger than this value. It can be calculated
## as follows: store the lengths of all contigs in a vector L of positive integers.
## Create a vector Lr where each integer is repeated as many times as defined by
## the sum its values in L. After this the N50 value is computed as the median of Lr.
## For example: if L = c(2, 2, 3, 3, 5, 7), then Lr consists of four 2's, six 3's,
## five 5's, and seven 7's; the N50 of L is the median of Lr, which is 5.
## In R the N50 according to this definition can be computed like this:
## L <- c(2, 2, 3, 3, 5, 7)
## Lr <- unlist(tapply(L, L, function(x) rep(x[1], sum(x))))
## median(Lr)
##
## With very large contig sizes the above implementation for computing the N50
## value can become inefficient. A much more efficient approach is to sort L
## decreasingly, compute for it the cumulative sum and then identify the length
## value for which the cumulative sum covers at least 50% of the combined length
## of all contigs or the length of a known target/reference sequence. Note,
## comparisons among N50 values from different assemblies are only meaningful
## when using for their calculation the same combined length value. Thus, a known
## target length value can be often a good solution for comparing assembly results.
## The following contigStats function calculates the N50 values according to the more
## efficient second method. In addition, it creates a distribution plot of the
## cumulative contig lengths, which is a very effective way for comparing assembly
## results.
contigStats <- function(N=N,
reflength,
style="base",
pch=20,
cex=1,
xlab="Percentage of Assembly Covered by Contigs of Size >=Y",
ylab="Contig Size [bp]",
main="Cumulative Length of Contigs",
sizetitle=14,
sizex=12,
sizey=12,
sizelegend=9,
poslegend="topright",
xlim,
ylim) {
## Compute cumulative length vectors for contig sets
Nl <- lapply(names(N), function(x) rev(sort(N[[x]]))); names(Nl) <- names(N)
Nlcum <- lapply(names(Nl), function(x) cumsum(Nl[[x]])); names(Nlcum) <- names(Nl)
## Compute N50 values
N50 <- sapply(seq(along=N), function(x) Nl[[x]][which(Nlcum[[x]] - reflength[x]/2 >= 0)[1]]); names(N50) <- names(N)
## Return only data (no plot)
if(style=="data") {
N75 <- sapply(seq(along=N), function(x) Nl[[x]][which(Nlcum[[x]] - reflength[x] * 0.75 >= 0)[1]]); names(N50) <- names(N)
N25 <- sapply(seq(along=N), function(x) Nl[[x]][which(Nlcum[[x]] - reflength[x] * 0.25 >= 0)[1]]); names(N50) <- names(N)
stats <- cbind(N25, N50, N75, Longest=sapply(N, max), Mean=sapply(N, mean), Median=sapply(N, median), Shortest=sapply(N, min), N_Contigs=sapply(N, length))
return(c(Nlcum, Contig_Stats=list(stats)))
}
## Plot cumulative contig length with base graphics
if(style=="base") {
if(missing(xlim)) xlim <- c(0, 100)
if(missing(ylim)) ylim <- c(0, max(unlist(N)))
split.screen(c(1,1))
for(i in seq(along=Nl)) {
if(i==1) {
plot(Nlcum[[i]]/reflength[[i]] * 100, Nl[[i]], col=i, pch=pch, cex=cex, xlim=xlim, ylim=ylim, xlab=xlab, ylab=ylab, main=main)
}
screen(1, new=FALSE)
plot(Nlcum[[i]]/reflength[[i]] * 100, Nl[[i]], col=i, pch=pch, cex=cex, xlim=xlim, ylim=ylim, xaxt="n", yaxt="n", ylab="", xlab="", main="", bty="n")
}
legend(poslegend, legend=paste(names(N50), ": N50 = ", N50, sep=""), cex=1.2, bty="n", pch=15, pt.cex=1.8, col=seq(along=Nl))
close.screen(all=TRUE)
}
## Plot cumulative contig length with ggplot2
## Note: ggplot2 plotting options can be looked up with theme_get()
if(style=="ggplot2") {
require("ggplot2")
plotdf <- data.frame(Samples=rep(names(Nlcum), sapply(Nlcum, length)), length=unlist(Nl), perc=unlist(lapply(names(Nlcum), function(x) Nlcum[[x]]/reflength[[x]]*100)))
counts <- table(plotdf[,1]); counts <- counts[names(N50)]
N50rep <- paste(plotdf[,1], ": N50=", unlist(lapply(as.character(unique(plotdf[,1])), function(x) rep(N50[x], counts[names(N50[x])]))), sep="")
plotdf[,1] <- N50rep
ggplot(plotdf, aes(perc, length, color=Samples)) +
geom_point() +
scale_x_continuous(xlab) +
scale_y_continuous(ylab) +
labs(title = main) +
labs(plot.title = element_text(size = sizetitle)) +
labs(axis.title.x = element_text(size = sizex)) +
labs(axis.title.y = element_text(size = sizey, angle = 90)) +
labs(legend.text = element_text(size = sizelegend)) +
labs(legend.title = element_text(size = 0))
}
}
## Usage of contigStats function
## Test sample of simulated contig length values provided in list
# N1 <- sample(1:500, 100)
# N2 <- sample(1:480, 50)
# N <- list(N1=N1, N2=N2)
## Run contigStats function
# reflength <- sapply(N, sum)
# N50 <- contigStats(N=N, reflength=reflength, pch=20, xlab="Percentage of Assembly Covered by Contigs of Size >=Y", ylab="Contig Size", main="Cumulative Plot of N Statistic")
# Cumulative plot
# Derived from contigStats.R (Author: Thomas Girke)
# Created: 2013.1.27
#
# Usage:
# > N1 <- sample(1:500, 100)
# > N2 <- sample(1:480, 50)
# > N <- list(N1=N1, N2=N2)
# > reflength <- sapply(N, sum)
# > cmplot <- cmplots(N=N, reflength=reflength, pch=20, xlab="", ylab="", main="")
cmplots <- function(N=N,
reflength,
pch=20,
cex=1,
xlab="Percentage of Assembly Covered by Value >= Y",
ylab="Value",
main="Cumulative Length of Contigs",
sizetitle=14,
sizex=12,
sizey=12,
sizelegend=9,
poslegend="topright",
xlim,
ylim) {
## Compute cumulative length vectors for contig sets
Nl <- lapply(names(N), function(x) rev(sort(N[[x]]))); names(Nl) <- names(N)
Nlcum <- lapply(names(Nl), function(x) cumsum(Nl[[x]])); names(Nlcum) <- names(Nl)
## Plot cumulative contig length with base graphics
if(missing(xlim)) xlim <- c(0, 100)
if(missing(ylim)) ylim <- c(0, max(unlist(N)))
split.screen(c(1,1))
for(i in seq(along=Nl)) {
if(i==1) {
plot(Nlcum[[i]]/reflength[[i]] * 100, Nl[[i]], col=i, pch=pch, xlim=xlim, ylim=ylim, xlab=xlab, ylab=ylab, main=main, cex=cex)
}
screen(1, new=FALSE)
plot(Nlcum[[i]]/reflength[[i]] * 100, Nl[[i]], col=i, pch=pch, xlim=xlim, ylim=ylim, xaxt="n", yaxt="n", ylab="", xlab="", main="", bty="n", cex=cex)
}
legend(poslegend, legend=paste(names(N)), cex=1.2, bty="n", pch=15, pt.cex=1.8, col=seq(along=Nl))
close.screen(all=TRUE)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment