Skip to content

Instantly share code, notes, and snippets.

@mlin
Last active July 26, 2023 06:42
Show Gist options
  • Save mlin/4aa03921f5336f2fa78c to your computer and use it in GitHub Desktop.
Save mlin/4aa03921f5336f2fa78c to your computer and use it in GitHub Desktop.
whole-genome alignment dotplots
require(ggplot2)
require(data.table)
require(compiler)
# First run LASTZ to produce MAF output. Options for matching near-identical assemblies: --notransition --seed=match15 --step=10 --hspthresh=100000 --gapped --ydrop=3400 --gappedthresh=400000 --ambiguous=iupac --allocate:traceback=200M
# Then we postprocess the MAF with: cat alignments.maf | awk 'NF' | tr -s " " | cut -d ' ' -f1-6 | paste - - - | tr " " "\t" | tr "=" "\t" | cut -f 3,5-9,11-15 | gzip -c > alignments.gz
# read and filter hits
fn <- "alignments.gz"
all.hits <- read.delim(gzfile(fn),header=FALSE)
colnames(all.hits) <- c("score","target","tpos","thitlen","tstrand","tlen","query","qpos","qhitlen","qstrand","qlen")
hits <- all.hits[all.hits$thitlen >= 10000,]
# sanity checks
stopifnot(length(unique(hits$tstrand)) == 1)
stopifnot(length(unique(hits$qstrand)) == 2)
quantile(hits$thitlen,probs=seq(0,1,0.1))
quantile(hits$qhitlen,probs=seq(0,1,0.1))
hits$tendpos <- hits$tpos + hits$thitlen
hits$qendpos <- hits$qpos + hits$qhitlen
# make all query positions relative to the query + strand (reverse of MAF for - strand hits)
rhits <- hits$qstrand == '-'
tmp <- hits$qpos[rhits]
hits$qpos[rhits] <- hits$qlen[rhits] - hits$qendpos[rhits]
hits$qendpos[rhits] <- hits$qlen[rhits] - tmp
stopifnot(hits$qendpos >= hits$qpos)
# collect information about the sequences
targets <- unique(data.frame(target=hits$target,length=hits$tlen))
stopifnot(nrow(targets) == length(unique(hits$target)))
row.names(targets) <- targets$target
queries <- unique(data.frame(query=hits$query, length=hits$qlen))
stopifnot(nrow(queries) == length(unique(hits$query)))
row.names(queries) <- queries$query
# order the target sequences: autosomes, X, Y, then other contigs by decreasing length
chromosomes <- c(as.character(1:22),"X","Y")
chromosomes <- as.character(sapply(chromosomes, function(n) paste("chr",n,sep="")))
other.contigs <- setdiff(rownames(targets), chromosomes)
other.contigs <- other.contigs[order(targets[other.contigs,]$length,decreasing=TRUE)]
stopifnot(length(chromosomes) + length(other.contigs) == nrow(targets))
targets$target <- factor(targets$target, levels=c(chromosomes,other.contigs))
targets <- targets[order(targets$target),]
targets <- cbind(targets,data.frame(ordinal=1:nrow(targets)))
# order the query sequences by position along target sequence axis
hits.by.query <- data.table(hits[,c("query","qstrand","target","tpos","thitlen")])
hits.by.query$mscore <- 0 - hits$score
setkey(hits.by.query,"query","mscore")
query.ordinal <- function(q) {
qhits <- hits.by.query[q]
setkey(qhits,"target","mscore")
# find the target with the most coverage by this query
qtargets <- unique(qhits$target)
qcoverage <- sapply(qtargets, function(qtarget) sum(as.numeric(qhits[as.character(qtarget)]$thitlen)))
qtarget <- as.character(qtargets[which.max(qcoverage)])
tofs <- targets[qtarget,'ordinal']*1e10
stopifnot(!is.na(tofs))
# find the target position of the highest-scoring hit
qhits <- qhits[qtarget]
qtpos <- qhits$tpos[1]
stopifnot(!is.na(qtpos))
# formulate an ordinal value for this query sequence to order it on the target,tpos axis
return (tofs + qtpos)
}
queries$ordinal <- as.numeric(sapply(rownames(queries),cmpfun(query.ordinal)))
queries <- queries[order(queries$ordinal),]
# Ordering the target and query contigs has effectively defined two continuous axes.
# Join these with the hits, to facilitate positioning the hits on the axes.
targets$axis_offset <- c(0,cumsum(as.numeric(targets$length))[1:(nrow(targets)-1)])
queries$axis_offset <- c(0,cumsum(as.numeric(queries$length))[1:(nrow(queries)-1)])
hits <- merge(hits,data.table(target=rownames(targets), tofs=targets$axis_offset, key="target"))
hits <- merge(hits,data.table(query=rownames(queries), qofs=queries$axis_offset, key="query"))
# flip orientation of query contigs whose highest-scoring hit is - strand wrt reference
queries$reorient <- as.logical(sapply(rownames(queries), cmpfun(function(query) {
return (hits.by.query[query]$qstrand[1] == '-')
})))
hits <- merge(hits,data.table(query=queries$query,reorient=queries$reorient,key="query"))
tmp <- hits$qpos[hits$reorient]
hits$qpos[hits$reorient] <- hits$qlen[hits$reorient] - hits$qendpos[hits$reorient]
hits$qendpos[hits$reorient] <- hits$qlen[hits$reorient] - tmp
tmp <- hits$qstrand == '-'
hits$qstrand[hits$reorient & tmp] <- '+'
hits$qstrand[hits$reorient & !tmp] <- '-'
# master plotting function
plot.hits <- function(selector=TRUE,target.boundaries=TRUE,query.boundaries=FALSE,xlab="de novo assembly",ylab="GRCh38") {
selected.hits <- hits[selector,]
x <- selected.hits$qofs + ifelse(selected.hits$qstrand == '+', selected.hits$qpos, selected.hits$qendpos)
xend <- selected.hits$qofs + ifelse(selected.hits$qstrand == '+', selected.hits$qendpos, selected.hits$qpos)
y <- selected.hits$tofs + selected.hits$tpos
yend <- selected.hits$tofs + selected.hits$tendpos
contig_orientation <- ifelse(selected.hits$reorient,'-','+')
p <- ggplot(data.frame(x=x,xend=xend,y=y,yend=yend,contig_orientation=contig_orientation),
aes(x=x,xend=xend,y=y,yend=yend,color=contig_orientation)) + geom_segment(size=I(1.75))
x_min <- min(x)
x_max <- max(xend)
p <- p + xlim(x_min,x_max)
y_min <- min(min(y),min(yend))
y_max <- max(max(y),max(yend))
p <- p + ylim(y_min,y_max)
if (target.boundaries) {
p <- p + geom_segment(data=data.frame(contig=targets$target, x=rep(x_min,nrow(targets)), xend=rep(x_max,nrow(targets)),
y=targets$axis_offset, yend=targets$axis_offset),
aes(x=x,xend=xend,y=y,yend=yend), group="contig", linetype="dashed", color="darkgrey",size=I(0.4))
for (chr in c("chr1","chr2","chr3","chrX")) {
p <- p + annotate("text",1e6,targets[chr,]$axis_offset+targets[chr,]$length/2, label=substring(chr,4), size=4)
}
}
if (query.boundaries) {
p <- p + geom_segment(data=data.frame(contig=queries$query, y=rep(y_min,nrow(queries)), yend=rep(y_max,nrow(queries)),
x=queries$axis_offset,xend=queries$axis_offset),
aes(x=x,xend=xend,y=y,yend=yend), group="contig", linetype="dashed", color="darkgrey",size=I(0.4))
}
p + xlab(xlab) + ylab(ylab) +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black"),
text = element_text(size=12),
axis.text = element_text(size=12),
legend.position="bottom")
}
plot.hits(xlab="J. Craig Venter - PacBio + FALCON assembly (2015)")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment