Created
March 16, 2016 15:25
-
-
Save andrewparkermorgan/286f245ef5f5701a8b6e to your computer and use it in GitHub Desktop.
Make coverage plots with splice-junction arcs in R using Bioconductor packages and ggplot2/grid.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# rnaseq.R | |
# plots related to RNAseq data | |
library(GenomicAlignments) | |
library(GenomicRanges) | |
library(ggplot2) | |
library(grid) | |
#library(gtable) # for combining plots vertically | |
#' Make a 'sashimi plot': coverage plus splice events | |
#' | |
#' @param aln a named list of \code{GAlignments} objects | |
#' @param tx a \code{GRanges} object, preferably as returned by \code{import.gff3()} | |
#' @param meta a dataframe of extra metadata; linked to samples by column \code{"iid"}, and column | |
#' \code{"reads"} is used to normalize coverage, if available | |
#' @param smooth integer; if >0, size of windows in which to calculate smoothed coverage estimate | |
#' @param min.coverage only show regions with coverage strictly greater than this | |
#' @param min.splices only show splice events with multiplicity strictly greater than this | |
#' @param max.coverage truncate coverage at this value | |
#' @param log.coverage logical; if \code{TRUE}, show coverage in log10 scale | |
#' @param splice.scale numeric vector of length 2 used for drawing splice-junction arcs; just play with it to find good values | |
#' @param colours a named vector of colours used for coverage plots; grey/black is default | |
#' | |
#' @value a \code{grid} object with the completed plot | |
#' | |
sashimiplot <- function(aln, tx, meta = NULL, smooth = 0, min.coverage = 0, min.splices = 0, max.coverage = Inf, log.coverage = FALSE, | |
splice.scale = c(1.5e3, 5), colours = NULL, colour.by = NULL, ...) { | |
if (!inherits(aln[[1]], "GAlignments")) | |
stop("Must supply a list of GAlignments objects.") | |
## set boundaries of region to plot: approx. the range of all transcripts | |
message("Calculating boundaries of region of interest...") | |
zoom <- reduce(tx, min.gapwidth = 100e4) | |
if (smooth > 0) { | |
## calculated smoothed coverage | |
message("Smoothing coverage in windows of size ", smooth, " bp...") | |
bins <- seq( start(zoom), end(zoom), smooth ) | |
bins.gr <- GRanges(seqnames = as.vector(seqnames(zoom))[1], | |
ranges = IRanges(start = bins[ -length(bins) ], end = bins[-1]-1)) | |
cvg <- ldply(aln, function(x) { | |
transform(as.data.frame(bins.gr), score = countOverlaps(bins.gr, x)) | |
}) | |
#cvg <- ldply(bins, as.data.frame) | |
colnames(cvg)[1] <- "iid" | |
cvg$panel <- cvg$iid | |
} | |
else { | |
## calculate raw coverage | |
message("Estimating read coverage...") | |
cvg <- lapply(lapply(aln, coverage), as, "GRanges") | |
cvg <- ldply(cvg, as.data.frame) | |
colnames(cvg)[1] <- "iid" | |
cvg$panel <- cvg$iid | |
} | |
## discover splice junctions | |
message("Discovering splice junctions...") | |
sj <- lapply(aln, summarizeJunctions) | |
## ignore splices to out-of-range places | |
sj <- lapply(sj, subsetByOverlaps, zoom, type = "within") | |
## make swoop shapes representing splicing events | |
sj <- ldply(sj, as.data.frame) | |
swoops <- make.swoop(sj, scale = splice.scale) | |
swoops$panel <- swoops$.id | |
## add metadata, if any | |
if (!is.null(meta)) | |
cvg <- merge(cvg, meta, all.x = TRUE) | |
## rescale by total coverage, if provided | |
if (!is.null(meta$reads)) | |
cvg$depth <- with(cvg, score/(reads/1e7)) | |
else | |
cvg$depth <- cvg$score | |
## set colours for coverage panels, if not provided | |
if (is.null(colours)) { | |
colours <- rep("darkgrey", length(aln)) | |
names(colours) <- names(aln) | |
} | |
## set x-axis limits to cover region of interest | |
xlims <- range(as.vector(ranges(zoom))) | |
## draw coverage plot with splice events | |
cvg <- subset(cvg, depth > min.coverage) | |
cvg$depth <- pmin(cvg$depth, max.coverage) | |
swoops <- subset(swoops, score > min.splices) | |
message("Rendering plots...") | |
p0 <- ggplot(cvg) + | |
geom_rect(aes(xmin = start, xmax = end, ymin = 0, ymax = depth, fill = iid)) + | |
geom_line(data = swoops, aes(x = x, y = y, group = row, colour = .id)) + | |
scale_x_continuous(limits = xlims) + | |
scale_fill_manual(values = colours) + | |
scale_colour_manual(values = colours) + | |
guides(fill = FALSE, colour = FALSE) + | |
facet_grid(panel ~ .) + | |
ylab("coverage (reads per 10 million)\n") + | |
theme_gbrowse() + theme(axis.text.x = element_blank(), axis.title.x = element_blank()) | |
if (log.coverage) | |
p0 <- p0 + scale_y_continuous("coverage", trans = "log10") | |
## draw transcript ideograms | |
p1 <- ggplot() + | |
geom_tx(tx[ tx$type == "exon" ], at = 0, height = 5, fill = "grey50", colour.by = colour.by, ...) + | |
scale_x_continuous("\nposition (Mb)", limits = xlims, labels = function(x) x/1e6) + | |
scale_fill_manual(values = c("grey50", "black", "blue")) + | |
facet_grid(panel ~ .) + | |
guides(fill = FALSE) + | |
ylab("") + | |
theme_gbrowse() + theme(axis.ticks.y = element_blank(), axis.text.y = element_blank()) | |
## combine plots | |
rez <- gtable:::rbind_gtable( ggplotGrob(p0), ggplotGrob(p1), size = "first" ) | |
## done | |
return(rez) | |
} | |
#' Use Bezier curves to make 'swoop' shapes represnting splice junctions | |
make.swoop <- function(df, scale = c(1.5e3, 5), ...) { | |
df$row <- seq_len(nrow(df)) | |
ddply(df, .(row), function(d) { | |
w <- with(d, end-start)[1] | |
mid <- c(0.2,0.8)*w+d$start[1] | |
x <- c(d$start[1], mid, d$end[1]) | |
y <- c(0, -1*rep(pmax(w/scale[1], scale[2]), 2), 0) | |
rez <- as.data.frame(Hmisc::bezier(x, y)) | |
for (c in colnames(d)) { | |
rez[,c] <- d[1,c] | |
} | |
return(rez) | |
}) | |
} | |
#' Render a GRanges of exons into a transcript ideogram | |
#' | |
#' @param exons a \code{GRanges} containing exons; metadata column \code{"Parent"} groups them into distinct transcripts | |
#' @param at vertical offset for first transcript | |
#' @param height vertical size of each transcript | |
#' @param arrows if arrows desired along intron connectors (to show strand orientation), this should be a call | |
#' to \code{grid::arrow()} returning the desired sort of arrow | |
#' @param do.introns if \code{TRUE}, draw lines through introns to connect exons | |
#' @param colour line colour for intron connectors | |
#' @param colour.by name of metadata column to use for exon boxes | |
#' @param fill fill colour for exon boxes, if \code{colour.by} not specified | |
#' @param stroke line colour for borders of exon boxes | |
#' | |
#' @value a list of \code{ggplot2} geoms, which can be added to an existing plot with \code{`+`}. | |
#' | |
#' @details Designed for use with the output of \code{rtracklayer::import.gff3()} with GFFs from Ensembl. For | |
#' other use cases, mileage may vary. | |
#' | |
geom_tx <- function(exons, at = 0, height = 1, arrows = grid::arrow(length = unit(4, "pt"), type = "closed"), | |
do.introns = TRUE, colour.by = NULL, stroke = NA, colour = "black", fill = "black", | |
label = FALSE, label.size = 3, drop = TRUE, ...) { | |
.make.tx.df <- function(ex) { | |
if (!length(ex)) | |
return(NULL) | |
if (drop) | |
values(ex)$at <- as.numeric(ex$parent)*height + at | |
else | |
values(ex)$at <- as.numeric(ex$parent) | |
values(ex)$height <- height | |
if (drop) | |
exx <- droplevels(as.data.frame(ex)) | |
else | |
exx <- as.data.frame(ex) | |
exx$panel <- "transcripts" | |
if (is.null(colour.by) || is.na(colour.by)) { | |
rez <- list( geom_rect(data = exx, aes(xmin = start, xmax = end, ymin = at-height*0.4, ymax = at+height*0.4), | |
fill = fill, colour = stroke, ...) ) | |
} | |
else { | |
exx$fill <- exx[ ,colour.by ] | |
rez <- list( geom_rect(data = exx, aes(xmin = start, xmax = end, ymin = at-height*0.4, ymax = at+height*0.4, fill = fill), | |
colour = stroke, ...) ) | |
} | |
if (label) { | |
labels <- data.frame(xpos = max(exx$end) + 1000, ypos = exx$at[1]+exx$height[1]*0.4, label = exx$parent[1], panel = exx$panel[1]) | |
rez[[3]] <- geom_text(data = labels, aes(x = xpos, y = ypos, label = label), size = label.size, hjust = 0) | |
} | |
if (length(ex) > 1) { | |
## get exons from introns | |
if (min(start(ex)) > 0) | |
.introns <- GenomicRanges::gaps(ex)[-1] | |
else | |
.introns <- GenomicRanges::gaps(ex) | |
if (drop) | |
introns <- droplevels(as.data.frame(.introns)) | |
else | |
introns <- as.data.frame(.introns) | |
#message(nrow(introns), " introns...") | |
if (any(strand(ex) == "-")) { | |
x <- introns$start | |
introns$start <- introns$end | |
introns$end <- x | |
} | |
#print(introns) | |
introns$at <- ex$at[1] | |
introns$height <- height | |
introns$panel <- "transcripts" | |
#if (drop) | |
# introns$middle <- with(introns, at+height*0.8/2) | |
#else | |
# introns$middle <- with(introns, at + as.numeric(parent)) | |
rez <- c(rez, geom_segment(data = introns, aes(x = start, xend = end, y = at, yend = at), | |
arrow = arrows, colour = colour, ...)) | |
} | |
return(rez) | |
} | |
if (!inherits(exons, "GRanges")) | |
exons <- makeGRangesFromDataFrame(exons, keep.extra.columns = TRUE) | |
## if input was GFF3, do everything on a per-tx basis | |
if (!is.null(exons$Parent)) { | |
if (is.null(exons$parent)) { | |
## rtracklayer::import.gff3() uses list format for Parent field; assume 1 parent, listed first | |
if (inherits(exons$Parent,"List")) | |
exons$parent <- sapply(exons$Parent, "[", 1) | |
else | |
exons$parent <- as.character(exons$parent) | |
} | |
else { | |
## respect existing 'parent' column | |
} | |
## 'parent' needs to be a factor; force this to be so | |
if (!is.factor(exons$parent)) | |
exons$parent <- reorder(factor(exons$parent), start(exons), min) | |
} | |
else { | |
exons$parent <- 1 | |
} | |
exons <- exons[ order(exons$parent) ] | |
exl <- split(exons, exons$parent) | |
lapply(exl, .make.tx.df) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment