andrewparkermorgan · March 16, 2016 15:25
diff --git a/rnaseq.R b/rnaseq.R
 # rnaseq.R
 # plots related to RNAseq data

 library(GenomicAlignments)
 library(GenomicRanges)
 library(ggplot2)
 library(grid)
 #library(gtable) # for combining plots vertically

 #' Make a 'sashimi plot': coverage plus splice events
 #'
 #' @param aln a non-empty, named list of \code{GAlignments} objects; the list names are used as sample ids
 #' @param tx a \code{GRanges} object, preferably as returned by \code{import.gff3()}
 #' @param meta a dataframe of extra metadata; linked to samples by column \code{"iid"}, and column
 #' 	\code{"reads"} is used to normalize coverage, if available
 #' @param smooth integer; if >0, size of windows in which to calculate smoothed coverage estimate
 #' @param min.coverage only show regions with coverage strictly greater than this
 #' @param min.splices only show splice events with multiplicity strictly greater than this
 #' @param max.coverage truncate coverage at this value
 #' @param log.coverage logical; if \code{TRUE}, show coverage in log10 scale
 #' @param splice.scale numeric vector of length 2 used for drawing splice-junction arcs; just play with it to find good values
 #' @param colours a named vector of colours (names = sample ids) used for coverage plots; dark grey is default
 #' @param colour.by passed on to \code{geom_tx()}: name of the metadata column of \code{tx} used to fill exon boxes
 #' @param ... further arguments passed on to \code{geom_tx()}
 #'
 #' @return a \code{gtable} (a \code{grid} graphical object) with the coverage panels stacked above the transcript panel
 #'
 sashimiplot <- function(aln, tx, meta = NULL, smooth = 0, min.coverage = 0, min.splices = 0, max.coverage = Inf, log.coverage = FALSE,
 						splice.scale = c(1.5e3, 5), colours = NULL, colour.by = NULL, ...) {

 	if (length(aln) == 0 || !inherits(aln[[1]], "GAlignments"))
 		stop("Must supply a list of GAlignments objects.")
 	## the list names become the sample-id column below; without them the first
 	## genomic column would be silently renamed to 'iid' instead
 	if (is.null(names(aln)))
 		stop("'aln' must be a named list; the names are used as sample identifiers.")

 	## set boundaries of region to plot: approx. the range of all transcripts
 	message("Calculating boundaries of region of interest...")
 	zoom <- reduce(tx, min.gapwidth = 100e4)

 	if (smooth > 0) {
 		## calculate smoothed coverage
 		## the windows are laid out along one interval, so 'zoom' must be a single range
 		if (length(zoom) != 1)
 			stop("Smoothed coverage requires transcripts that reduce to exactly one contiguous region.")
 		message("Smoothing coverage in windows of size ", smooth, " bp...")
 		## window boundaries; a trailing partial window (if any) is not covered
 		bins <- seq( start(zoom), end(zoom), smooth )
 		bins.gr <- GRanges(seqnames = as.vector(seqnames(zoom))[1],
 						ranges = IRanges(start = bins[ -length(bins) ], end = bins[-1]-1))
 		## plyr is called via '::' because this file never attaches it
 		cvg <- plyr::ldply(aln, function(x) {
 			transform(as.data.frame(bins.gr), score = countOverlaps(bins.gr, x))
 		})
 	}
 	else {
 		## calculate raw coverage
 		message("Estimating read coverage...")
 		cvg <- lapply(lapply(aln, coverage), as, "GRanges")
 		cvg <- plyr::ldply(cvg, as.data.frame)
 	}
 	## first column is the '.id' column that ldply() builds from the list names
 	colnames(cvg)[1] <- "iid"
 	cvg$panel <- cvg$iid

 	## discover splice junctions
 	message("Discovering splice junctions...")
 	sj <- lapply(aln, summarizeJunctions)
 	## ignore splices to out-of-range places
 	sj <- lapply(sj, subsetByOverlaps, zoom, type = "within")
 	## make swoop shapes representing splicing events
 	sj <- plyr::ldply(sj, as.data.frame)
 	swoops <- make.swoop(sj, scale = splice.scale)
 	## a region without spliced reads gives an empty table that may lack the
 	## columns used below, so only touch it when there is something to draw
 	if (isTRUE(nrow(swoops) > 0)) {
 		swoops$panel <- swoops$.id
 		swoops <- subset(swoops, score > min.splices)
 	}
 	draw.swoops <- isTRUE(nrow(swoops) > 0)

 	## add metadata, if any; merge() joins on all shared column names (normally just 'iid')
 	if (!is.null(meta))
 		cvg <- merge(cvg, meta, all.x = TRUE)

 	## rescale by total coverage, if provided
 	normalised <- !is.null(meta$reads)
 	if (normalised)
 		cvg$depth <- with(cvg, score/(reads/1e7))
 	else
 		cvg$depth <- cvg$score

 	## set colours for coverage panels, if not provided
 	if (is.null(colours)) {
 		colours <- rep("darkgrey", length(aln))
 		names(colours) <- names(aln)
 	}

 	## set x-axis limits to cover region of interest
 	xlims <- c(min(start(zoom)), max(end(zoom)))

 	## draw coverage plot with splice events
 	cvg <- subset(cvg, depth > min.coverage)
 	cvg$depth <- pmin(cvg$depth, max.coverage)

 	message("Rendering plots...")
 	p0 <- ggplot(cvg) +
 		geom_rect(aes(xmin = start, xmax = end, ymin = 0, ymax = depth, fill = iid))
 	if (draw.swoops)
 		p0 <- p0 + geom_line(data = swoops, aes(x = x, y = y, group = row, colour = .id))
 	p0 <- p0 +
 		scale_x_continuous(limits = xlims) +
 		scale_fill_manual(values = colours) +
 		scale_colour_manual(values = colours) +
 		guides(fill = "none", colour = "none") +
 		facet_grid(panel ~ .) +
 		## only claim a per-10-million scale when the rescaling above was applied
 		ylab(if (normalised) "coverage (reads per 10 million)\n" else "coverage (reads)\n") +
 		theme_gbrowse() + theme(axis.text.x = element_blank(), axis.title.x = element_blank())

 	if (log.coverage)
 		p0 <- p0 + scale_y_continuous("coverage", trans = "log10")

 	## draw transcript ideograms
 	p1 <- ggplot() +
 		geom_tx(tx[ tx$type == "exon" ], at = 0, height = 5, fill = "grey50", colour.by = colour.by, ...) +
 		scale_x_continuous("\nposition (Mb)", limits = xlims, labels = function(x) x/1e6) +
 		scale_fill_manual(values = c("grey50", "black", "blue")) +
 		facet_grid(panel ~ .) +
 		guides(fill = "none") +
 		ylab("") +
 		theme_gbrowse() + theme(axis.ticks.y = element_blank(), axis.text.y = element_blank())

 	## combine plots vertically, taking column widths from the coverage plot
 	## NOTE(review): rbind_gtable() is reached via ':::', i.e. it is not part of gtable's exported API
 	rez <- gtable:::rbind_gtable( ggplotGrob(p0), ggplotGrob(p1), size = "first" )

 	## done
 	return(rez)

 }

 #' Use Bezier curves to make 'swoop' shapes representing splice junctions
 #'
 #' @param df a dataframe of splice junctions, one per row, with (at least) columns \code{start} and \code{end}
 #' @param scale numeric vector of length 2: arc depth is the junction width divided by \code{scale[1]},
 #' 	but never less than \code{scale[2]}
 #' @param ... ignored
 #'
 #' @return a dataframe with the points of each arc in columns \code{x} and \code{y} (arcs hang below zero),
 #' 	the columns of \code{df} repeated along each arc, and column \code{row} identifying the junction
 #'
 make.swoop <- function(df, scale = c(1.5e3, 5), ...) {

 	df$row <- seq_len(nrow(df))
 	## plyr is called via '::' because this file never attaches it
 	plyr::ddply(df, "row", function(d) {
 		w <- with(d, end-start)[1]
 		## control points: both ends on the baseline, two interior points pulled below it
 		mid <- c(0.2,0.8)*w+d$start[1]
 		x <- c(d$start[1], mid, d$end[1])
 		y <- c(0, -1*rep(pmax(w/scale[1], scale[2]), 2), 0)
 		rez <- as.data.frame(Hmisc::bezier(x, y))
 		## carry the junction's own columns along every point of its arc
 		for (col in colnames(d)) {
 			rez[,col] <- d[1,col]
 		}
 		return(rez)
 	})

 }

 #' Render a GRanges of exons into a transcript ideogram
 #'
 #' @param exons a \code{GRanges} containing exons (or a dataframe convertible to one); metadata column
 #' 	\code{"Parent"} groups them into distinct transcripts
 #' @param at vertical offset for first transcript
 #' @param height vertical size of each transcript
 #' @param arrows if arrows desired along intron connectors (to show strand orientation), this should be a call
 #' 	to \code{grid::arrow()} returning the desired sort of arrow
 #' @param do.introns if \code{TRUE}, draw lines through introns to connect exons
 #' @param colour line colour for intron connectors
 #' @param colour.by name of metadata column to use for exon boxes
 #' @param fill fill colour for exon boxes, if \code{colour.by} not specified
 #' @param stroke line colour for borders of exon boxes
 #' @param label if \code{TRUE}, write the transcript id (\code{parent}) to the right of each transcript
 #' @param label.size text size for transcript labels
 #' @param drop if \code{TRUE}, unused factor levels are dropped from the per-transcript tables and the vertical
 #' 	position is (transcript index)*\code{height} + \code{at}; if \code{FALSE}, the vertical position is the
 #' 	bare transcript index
 #' @param ... further arguments passed on to \code{geom_rect()} and \code{geom_segment()}
 #'
 #' @return a list with one element per transcript, each a list of \code{ggplot2} layers; the whole thing
 #' 	can be added to an existing plot with \code{`+`}.
 #'
 #' @details Designed for use with the output of \code{rtracklayer::import.gff3()} with GFFs from Ensembl.  For
 #' 	other use cases, mileage may vary.
 #'
 geom_tx <- function(exons, at = 0, height = 1, arrows = grid::arrow(length = unit(4, "pt"), type = "closed"),
 					do.introns = TRUE, colour.by = NULL, stroke = NA, colour = "black", fill = "black",
 					label = FALSE, label.size = 3, drop = TRUE, ...) {

 	## build the layers for one transcript, i.e. all exons sharing one 'parent'
 	.make.tx.df <- function(ex) {

 		if (length(ex) == 0)
 			return(NULL)

 		## vertical position is driven by the integer code of the 'parent' factor
 		if (drop)
 			values(ex)$at <- as.numeric(ex$parent)*height + at
 		else
 			values(ex)$at <- as.numeric(ex$parent)
 		values(ex)$height <- height
 		exx <- as.data.frame(ex)
 		if (drop)
 			exx <- droplevels(exx)
 		exx$panel <- "transcripts"

 		## exon boxes: constant fill, or fill mapped to the 'colour.by' column
 		if (is.null(colour.by) || is.na(colour.by)) {
 			rez <- list( geom_rect(data = exx, aes(xmin = start, xmax = end, ymin = at-height*0.4, ymax = at+height*0.4),
 								   fill = fill, colour = stroke, ...) )
 		}
 		else {
 			exx$fill <- exx[ ,colour.by ]
 			rez <- list( geom_rect(data = exx, aes(xmin = start, xmax = end, ymin = at-height*0.4, ymax = at+height*0.4, fill = fill),
 								   colour = stroke, ...) )
 		}

 		if (label) {
 			labels <- data.frame(xpos = max(exx$end) + 1000, ypos = exx$at[1]+exx$height[1]*0.4, label = exx$parent[1], panel = exx$panel[1])
 			## append, rather than assign to a fixed index, so the list has no NULL holes
 			rez <- c(rez, list(geom_text(data = labels, aes(x = xpos, y = ypos, label = label), size = label.size, hjust = 0)))
 		}

 		if (do.introns && length(ex) > 1) {
 			## get introns from exons: the gaps between exons, restricted to the
 			## transcript's own span so that no leading/trailing or other-strand
 			## ranges have to be stripped off afterwards
 			introns <- as.data.frame(IRanges::gaps(ranges(ex), start = min(start(ex)), end = max(end(ex))))

 			## abutting or overlapping exons leave nothing to connect
 			if (nrow(introns) > 0) {

 				## reverse the segments on the minus strand so that arrows follow transcription
 				if (any(strand(ex) == "-")) {
 					x <- introns$start
 					introns$start <- introns$end
 					introns$end <- x
 				}

 				introns$at <- ex$at[1]
 				introns$height <- height
 				introns$panel <- "transcripts"
 				rez <- c(rez, list(geom_segment(data = introns, aes(x = start, xend = end, y = at, yend = at),
 												arrow = arrows, colour = colour, ...)))
 			}

 		}

 		return(rez)

 	}

 	if (!inherits(exons, "GRanges"))
 		exons <- makeGRangesFromDataFrame(exons, keep.extra.columns = TRUE)

 	## if input was GFF3, do everything on a per-tx basis
 	if (!is.null(exons$Parent)) {

 		if (is.null(exons$parent)) {
 			## rtracklayer::import.gff3() uses list format for Parent field; assume 1 parent, listed first
 			if (inherits(exons$Parent,"List"))
 				exons$parent <- vapply(as.list(exons$Parent), function(p) as.character(p)[1], character(1), USE.NAMES = FALSE)
 			else
 				exons$parent <- as.character(exons$Parent)
 		}
 		## (an existing 'parent' column is respected as-is)

 		## 'parent' needs to be a factor; force this to be so, ordering transcripts by leftmost start
 		if (!is.factor(exons$parent))
 			exons$parent <- reorder(factor(exons$parent), start(exons), min)

 	}
 	else {
 		## no grouping information: treat all exons as one transcript
 		exons$parent <- 1
 	}

 	exons <- exons[ order(exons$parent) ]
 	exl <- split(exons, exons$parent)
 	lapply(exl, .make.tx.df)

 }
	# rnaseq.R
	# plots related to RNAseq data

	library(GenomicAlignments)
	library(GenomicRanges)
	library(ggplot2)
	library(grid)
	#library(gtable) # for combining plots vertically

	#' Make a 'sashimi plot': coverage plus splice events
	#'
	#' @param aln a non-empty, named list of \code{GAlignments} objects; the list names are used as sample ids
	#' @param tx a \code{GRanges} object, preferably as returned by \code{import.gff3()}
	#' @param meta a dataframe of extra metadata; linked to samples by column \code{"iid"}, and column
	#' \code{"reads"} is used to normalize coverage, if available
	#' @param smooth integer; if >0, size of windows in which to calculate smoothed coverage estimate
	#' @param min.coverage only show regions with coverage strictly greater than this
	#' @param min.splices only show splice events with multiplicity strictly greater than this
	#' @param max.coverage truncate coverage at this value
	#' @param log.coverage logical; if \code{TRUE}, show coverage in log10 scale
	#' @param splice.scale numeric vector of length 2 used for drawing splice-junction arcs; just play with it to find good values
	#' @param colours a named vector of colours (names = sample ids) used for coverage plots; dark grey is default
	#' @param colour.by passed on to \code{geom_tx()}: name of the metadata column of \code{tx} used to fill exon boxes
	#' @param ... further arguments passed on to \code{geom_tx()}
	#'
	#' @return a \code{gtable} (a \code{grid} graphical object) with the coverage panels stacked above the transcript panel
	#'
	sashimiplot <- function(aln, tx, meta = NULL, smooth = 0, min.coverage = 0, min.splices = 0, max.coverage = Inf, log.coverage = FALSE,
							splice.scale = c(1.5e3, 5), colours = NULL, colour.by = NULL, ...) {

		if (length(aln) == 0 || !inherits(aln[[1]], "GAlignments"))
			stop("Must supply a list of GAlignments objects.")
		## the list names become the sample-id column below; without them the first
		## genomic column would be silently renamed to 'iid' instead
		if (is.null(names(aln)))
			stop("'aln' must be a named list; the names are used as sample identifiers.")

		## set boundaries of region to plot: approx. the range of all transcripts
		message("Calculating boundaries of region of interest...")
		zoom <- reduce(tx, min.gapwidth = 100e4)

		if (smooth > 0) {
			## calculate smoothed coverage
			## the windows are laid out along one interval, so 'zoom' must be a single range
			if (length(zoom) != 1)
				stop("Smoothed coverage requires transcripts that reduce to exactly one contiguous region.")
			message("Smoothing coverage in windows of size ", smooth, " bp...")
			## window boundaries; a trailing partial window (if any) is not covered
			bins <- seq( start(zoom), end(zoom), smooth )
			bins.gr <- GRanges(seqnames = as.vector(seqnames(zoom))[1],
							ranges = IRanges(start = bins[ -length(bins) ], end = bins[-1]-1))
			## plyr is called via '::' because this file never attaches it
			cvg <- plyr::ldply(aln, function(x) {
				transform(as.data.frame(bins.gr), score = countOverlaps(bins.gr, x))
			})
		}
		else {
			## calculate raw coverage
			message("Estimating read coverage...")
			cvg <- lapply(lapply(aln, coverage), as, "GRanges")
			cvg <- plyr::ldply(cvg, as.data.frame)
		}
		## first column is the '.id' column that ldply() builds from the list names
		colnames(cvg)[1] <- "iid"
		cvg$panel <- cvg$iid

		## discover splice junctions
		message("Discovering splice junctions...")
		sj <- lapply(aln, summarizeJunctions)
		## ignore splices to out-of-range places
		sj <- lapply(sj, subsetByOverlaps, zoom, type = "within")
		## make swoop shapes representing splicing events
		sj <- plyr::ldply(sj, as.data.frame)
		swoops <- make.swoop(sj, scale = splice.scale)
		## a region without spliced reads gives an empty table that may lack the
		## columns used below, so only touch it when there is something to draw
		if (isTRUE(nrow(swoops) > 0)) {
			swoops$panel <- swoops$.id
			swoops <- subset(swoops, score > min.splices)
		}
		draw.swoops <- isTRUE(nrow(swoops) > 0)

		## add metadata, if any; merge() joins on all shared column names (normally just 'iid')
		if (!is.null(meta))
			cvg <- merge(cvg, meta, all.x = TRUE)

		## rescale by total coverage, if provided
		normalised <- !is.null(meta$reads)
		if (normalised)
			cvg$depth <- with(cvg, score/(reads/1e7))
		else
			cvg$depth <- cvg$score

		## set colours for coverage panels, if not provided
		if (is.null(colours)) {
			colours <- rep("darkgrey", length(aln))
			names(colours) <- names(aln)
		}

		## set x-axis limits to cover region of interest
		xlims <- c(min(start(zoom)), max(end(zoom)))

		## draw coverage plot with splice events
		cvg <- subset(cvg, depth > min.coverage)
		cvg$depth <- pmin(cvg$depth, max.coverage)

		message("Rendering plots...")
		p0 <- ggplot(cvg) +
			geom_rect(aes(xmin = start, xmax = end, ymin = 0, ymax = depth, fill = iid))
		if (draw.swoops)
			p0 <- p0 + geom_line(data = swoops, aes(x = x, y = y, group = row, colour = .id))
		p0 <- p0 +
			scale_x_continuous(limits = xlims) +
			scale_fill_manual(values = colours) +
			scale_colour_manual(values = colours) +
			guides(fill = "none", colour = "none") +
			facet_grid(panel ~ .) +
			## only claim a per-10-million scale when the rescaling above was applied
			ylab(if (normalised) "coverage (reads per 10 million)\n" else "coverage (reads)\n") +
			theme_gbrowse() + theme(axis.text.x = element_blank(), axis.title.x = element_blank())

		if (log.coverage)
			p0 <- p0 + scale_y_continuous("coverage", trans = "log10")

		## draw transcript ideograms
		p1 <- ggplot() +
			geom_tx(tx[ tx$type == "exon" ], at = 0, height = 5, fill = "grey50", colour.by = colour.by, ...) +
			scale_x_continuous("\nposition (Mb)", limits = xlims, labels = function(x) x/1e6) +
			scale_fill_manual(values = c("grey50", "black", "blue")) +
			facet_grid(panel ~ .) +
			guides(fill = "none") +
			ylab("") +
			theme_gbrowse() + theme(axis.ticks.y = element_blank(), axis.text.y = element_blank())

		## combine plots vertically, taking column widths from the coverage plot
		## NOTE(review): rbind_gtable() is reached via ':::', i.e. it is not part of gtable's exported API
		rez <- gtable:::rbind_gtable( ggplotGrob(p0), ggplotGrob(p1), size = "first" )

		## done
		return(rez)

	}

	#' Use Bezier curves to make 'swoop' shapes representing splice junctions
	#'
	#' @param df a dataframe of splice junctions, one per row, with (at least) columns \code{start} and \code{end}
	#' @param scale numeric vector of length 2: arc depth is the junction width divided by \code{scale[1]},
	#' but never less than \code{scale[2]}
	#' @param ... ignored
	#'
	#' @return a dataframe with the points of each arc in columns \code{x} and \code{y} (arcs hang below zero),
	#' the columns of \code{df} repeated along each arc, and column \code{row} identifying the junction
	#'
	make.swoop <- function(df, scale = c(1.5e3, 5), ...) {

		df$row <- seq_len(nrow(df))
		## plyr is called via '::' because this file never attaches it
		plyr::ddply(df, "row", function(d) {
			w <- with(d, end-start)[1]
			## control points: both ends on the baseline, two interior points pulled below it
			mid <- c(0.2,0.8)*w+d$start[1]
			x <- c(d$start[1], mid, d$end[1])
			y <- c(0, -1*rep(pmax(w/scale[1], scale[2]), 2), 0)
			rez <- as.data.frame(Hmisc::bezier(x, y))
			## carry the junction's own columns along every point of its arc
			for (col in colnames(d)) {
				rez[,col] <- d[1,col]
			}
			return(rez)
		})

	}

	#' Render a GRanges of exons into a transcript ideogram
	#'
	#' @param exons a \code{GRanges} containing exons (or a dataframe convertible to one); metadata column
	#' \code{"Parent"} groups them into distinct transcripts
	#' @param at vertical offset for first transcript
	#' @param height vertical size of each transcript
	#' @param arrows if arrows desired along intron connectors (to show strand orientation), this should be a call
	#' to \code{grid::arrow()} returning the desired sort of arrow
	#' @param do.introns if \code{TRUE}, draw lines through introns to connect exons
	#' @param colour line colour for intron connectors
	#' @param colour.by name of metadata column to use for exon boxes
	#' @param fill fill colour for exon boxes, if \code{colour.by} not specified
	#' @param stroke line colour for borders of exon boxes
	#' @param label if \code{TRUE}, write the transcript id (\code{parent}) to the right of each transcript
	#' @param label.size text size for transcript labels
	#' @param drop if \code{TRUE}, unused factor levels are dropped from the per-transcript tables and the vertical
	#' position is (transcript index)*\code{height} + \code{at}; if \code{FALSE}, the vertical position is the
	#' bare transcript index
	#' @param ... further arguments passed on to \code{geom_rect()} and \code{geom_segment()}
	#'
	#' @return a list with one element per transcript, each a list of \code{ggplot2} layers; the whole thing
	#' can be added to an existing plot with \code{`+`}.
	#'
	#' @details Designed for use with the output of \code{rtracklayer::import.gff3()} with GFFs from Ensembl. For
	#' other use cases, mileage may vary.
	#'
	geom_tx <- function(exons, at = 0, height = 1, arrows = grid::arrow(length = unit(4, "pt"), type = "closed"),
						do.introns = TRUE, colour.by = NULL, stroke = NA, colour = "black", fill = "black",
						label = FALSE, label.size = 3, drop = TRUE, ...) {

		## build the layers for one transcript, i.e. all exons sharing one 'parent'
		.make.tx.df <- function(ex) {

			if (length(ex) == 0)
				return(NULL)

			## vertical position is driven by the integer code of the 'parent' factor
			if (drop)
				values(ex)$at <- as.numeric(ex$parent)*height + at
			else
				values(ex)$at <- as.numeric(ex$parent)
			values(ex)$height <- height
			exx <- as.data.frame(ex)
			if (drop)
				exx <- droplevels(exx)
			exx$panel <- "transcripts"

			## exon boxes: constant fill, or fill mapped to the 'colour.by' column
			if (is.null(colour.by) || is.na(colour.by)) {
				rez <- list( geom_rect(data = exx, aes(xmin = start, xmax = end, ymin = at-height*0.4, ymax = at+height*0.4),
									   fill = fill, colour = stroke, ...) )
			}
			else {
				exx$fill <- exx[ ,colour.by ]
				rez <- list( geom_rect(data = exx, aes(xmin = start, xmax = end, ymin = at-height*0.4, ymax = at+height*0.4, fill = fill),
									   colour = stroke, ...) )
			}

			if (label) {
				labels <- data.frame(xpos = max(exx$end) + 1000, ypos = exx$at[1]+exx$height[1]*0.4, label = exx$parent[1], panel = exx$panel[1])
				## append, rather than assign to a fixed index, so the list has no NULL holes
				rez <- c(rez, list(geom_text(data = labels, aes(x = xpos, y = ypos, label = label), size = label.size, hjust = 0)))
			}

			if (do.introns && length(ex) > 1) {
				## get introns from exons: the gaps between exons, restricted to the
				## transcript's own span so that no leading/trailing or other-strand
				## ranges have to be stripped off afterwards
				introns <- as.data.frame(IRanges::gaps(ranges(ex), start = min(start(ex)), end = max(end(ex))))

				## abutting or overlapping exons leave nothing to connect
				if (nrow(introns) > 0) {

					## reverse the segments on the minus strand so that arrows follow transcription
					if (any(strand(ex) == "-")) {
						x <- introns$start
						introns$start <- introns$end
						introns$end <- x
					}

					introns$at <- ex$at[1]
					introns$height <- height
					introns$panel <- "transcripts"
					rez <- c(rez, list(geom_segment(data = introns, aes(x = start, xend = end, y = at, yend = at),
													arrow = arrows, colour = colour, ...)))
				}

			}

			return(rez)

		}

		if (!inherits(exons, "GRanges"))
			exons <- makeGRangesFromDataFrame(exons, keep.extra.columns = TRUE)

		## if input was GFF3, do everything on a per-tx basis
		if (!is.null(exons$Parent)) {

			if (is.null(exons$parent)) {
				## rtracklayer::import.gff3() uses list format for Parent field; assume 1 parent, listed first
				if (inherits(exons$Parent,"List"))
					exons$parent <- vapply(as.list(exons$Parent), function(p) as.character(p)[1], character(1), USE.NAMES = FALSE)
				else
					exons$parent <- as.character(exons$Parent)
			}
			## (an existing 'parent' column is respected as-is)

			## 'parent' needs to be a factor; force this to be so, ordering transcripts by leftmost start
			if (!is.factor(exons$parent))
				exons$parent <- reorder(factor(exons$parent), start(exons), min)

		}
		else {
			## no grouping information: treat all exons as one transcript
			exons$parent <- 1
		}

		exons <- exons[ order(exons$parent) ]
		exl <- split(exons, exons$parent)
		lapply(exl, .make.tx.df)

	}
No results found