Last active
July 26, 2023 06:42
-
-
Save mlin/4aa03921f5336f2fa78c to your computer and use it in GitHub Desktop.
whole-genome alignment dotplots
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require(ggplot2) | |
require(data.table) | |
require(compiler) | |
# First run LASTZ to produce MAF output. Options for matching near-identical assemblies: --notransition --seed=match15 --step=10 --hspthresh=100000 --gapped --ydrop=3400 --gappedthresh=400000 --ambiguous=iupac --allocate:traceback=200M | |
# Then we postprocess the MAF with: cat alignments.maf | awk 'NF' | tr -s " " | cut -d ' ' -f1-6 | paste - - - | tr " " "\t" | tr "=" "\t" | cut -f 3,5-9,11-15 | gzip -c > alignments.gz | |
# read and filter hits | |
fn <- "alignments.gz" | |
all.hits <- read.delim(gzfile(fn),header=FALSE) | |
colnames(all.hits) <- c("score","target","tpos","thitlen","tstrand","tlen","query","qpos","qhitlen","qstrand","qlen") | |
hits <- all.hits[all.hits$thitlen >= 10000,] | |
# sanity checks | |
stopifnot(length(unique(hits$tstrand)) == 1) | |
stopifnot(length(unique(hits$qstrand)) == 2) | |
quantile(hits$thitlen,probs=seq(0,1,0.1)) | |
quantile(hits$qhitlen,probs=seq(0,1,0.1)) | |
hits$tendpos <- hits$tpos + hits$thitlen | |
hits$qendpos <- hits$qpos + hits$qhitlen | |
# make all query positions relative to the query + strand (reverse of MAF for - strand hits) | |
rhits <- hits$qstrand == '-' | |
tmp <- hits$qpos[rhits] | |
hits$qpos[rhits] <- hits$qlen[rhits] - hits$qendpos[rhits] | |
hits$qendpos[rhits] <- hits$qlen[rhits] - tmp | |
stopifnot(hits$qendpos >= hits$qpos) | |
# collect information about the sequences | |
targets <- unique(data.frame(target=hits$target,length=hits$tlen)) | |
stopifnot(nrow(targets) == length(unique(hits$target))) | |
row.names(targets) <- targets$target | |
queries <- unique(data.frame(query=hits$query, length=hits$qlen)) | |
stopifnot(nrow(queries) == length(unique(hits$query))) | |
row.names(queries) <- queries$query | |
# order the target sequences: autosomes, X, Y, then other contigs by decreasing length | |
chromosomes <- c(as.character(1:22),"X","Y") | |
chromosomes <- as.character(sapply(chromosomes, function(n) paste("chr",n,sep=""))) | |
other.contigs <- setdiff(rownames(targets), chromosomes) | |
other.contigs <- other.contigs[order(targets[other.contigs,]$length,decreasing=TRUE)] | |
stopifnot(length(chromosomes) + length(other.contigs) == nrow(targets)) | |
targets$target <- factor(targets$target, levels=c(chromosomes,other.contigs)) | |
targets <- targets[order(targets$target),] | |
targets <- cbind(targets,data.frame(ordinal=1:nrow(targets))) | |
# order the query sequences by position along target sequence axis | |
hits.by.query <- data.table(hits[,c("query","qstrand","target","tpos","thitlen")]) | |
hits.by.query$mscore <- 0 - hits$score | |
setkey(hits.by.query,"query","mscore") | |
query.ordinal <- function(q) { | |
qhits <- hits.by.query[q] | |
setkey(qhits,"target","mscore") | |
# find the target with the most coverage by this query | |
qtargets <- unique(qhits$target) | |
qcoverage <- sapply(qtargets, function(qtarget) sum(as.numeric(qhits[as.character(qtarget)]$thitlen))) | |
qtarget <- as.character(qtargets[which.max(qcoverage)]) | |
tofs <- targets[qtarget,'ordinal']*1e10 | |
stopifnot(!is.na(tofs)) | |
# find the target position of the highest-scoring hit | |
qhits <- qhits[qtarget] | |
qtpos <- qhits$tpos[1] | |
stopifnot(!is.na(qtpos)) | |
# formulate an ordinal value for this query sequence to order it on the target,tpos axis | |
return (tofs + qtpos) | |
} | |
queries$ordinal <- as.numeric(sapply(rownames(queries),cmpfun(query.ordinal))) | |
queries <- queries[order(queries$ordinal),] | |
# Ordering the target and query contigs has effectively defined two continuous axes. | |
# Join these with the hits, to facilitate positioning the hits on the axes. | |
targets$axis_offset <- c(0,cumsum(as.numeric(targets$length))[1:(nrow(targets)-1)]) | |
queries$axis_offset <- c(0,cumsum(as.numeric(queries$length))[1:(nrow(queries)-1)]) | |
hits <- merge(hits,data.table(target=rownames(targets), tofs=targets$axis_offset, key="target")) | |
hits <- merge(hits,data.table(query=rownames(queries), qofs=queries$axis_offset, key="query")) | |
# flip orientation of query contigs whose highest-scoring hit is - strand wrt reference | |
queries$reorient <- as.logical(sapply(rownames(queries), cmpfun(function(query) { | |
return (hits.by.query[query]$qstrand[1] == '-') | |
}))) | |
hits <- merge(hits,data.table(query=queries$query,reorient=queries$reorient,key="query")) | |
tmp <- hits$qpos[hits$reorient] | |
hits$qpos[hits$reorient] <- hits$qlen[hits$reorient] - hits$qendpos[hits$reorient] | |
hits$qendpos[hits$reorient] <- hits$qlen[hits$reorient] - tmp | |
tmp <- hits$qstrand == '-' | |
hits$qstrand[hits$reorient & tmp] <- '+' | |
hits$qstrand[hits$reorient & !tmp] <- '-' | |
# master plotting function | |
plot.hits <- function(selector=TRUE,target.boundaries=TRUE,query.boundaries=FALSE,xlab="de novo assembly",ylab="GRCh38") { | |
selected.hits <- hits[selector,] | |
x <- selected.hits$qofs + ifelse(selected.hits$qstrand == '+', selected.hits$qpos, selected.hits$qendpos) | |
xend <- selected.hits$qofs + ifelse(selected.hits$qstrand == '+', selected.hits$qendpos, selected.hits$qpos) | |
y <- selected.hits$tofs + selected.hits$tpos | |
yend <- selected.hits$tofs + selected.hits$tendpos | |
contig_orientation <- ifelse(selected.hits$reorient,'-','+') | |
p <- ggplot(data.frame(x=x,xend=xend,y=y,yend=yend,contig_orientation=contig_orientation), | |
aes(x=x,xend=xend,y=y,yend=yend,color=contig_orientation)) + geom_segment(size=I(1.75)) | |
x_min <- min(x) | |
x_max <- max(xend) | |
p <- p + xlim(x_min,x_max) | |
y_min <- min(min(y),min(yend)) | |
y_max <- max(max(y),max(yend)) | |
p <- p + ylim(y_min,y_max) | |
if (target.boundaries) { | |
p <- p + geom_segment(data=data.frame(contig=targets$target, x=rep(x_min,nrow(targets)), xend=rep(x_max,nrow(targets)), | |
y=targets$axis_offset, yend=targets$axis_offset), | |
aes(x=x,xend=xend,y=y,yend=yend), group="contig", linetype="dashed", color="darkgrey",size=I(0.4)) | |
for (chr in c("chr1","chr2","chr3","chrX")) { | |
p <- p + annotate("text",1e6,targets[chr,]$axis_offset+targets[chr,]$length/2, label=substring(chr,4), size=4) | |
} | |
} | |
if (query.boundaries) { | |
p <- p + geom_segment(data=data.frame(contig=queries$query, y=rep(y_min,nrow(queries)), yend=rep(y_max,nrow(queries)), | |
x=queries$axis_offset,xend=queries$axis_offset), | |
aes(x=x,xend=xend,y=y,yend=yend), group="contig", linetype="dashed", color="darkgrey",size=I(0.4)) | |
} | |
p + xlab(xlab) + ylab(ylab) + | |
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), | |
panel.background = element_blank(), axis.line = element_line(colour = "black"), | |
text = element_text(size=12), | |
axis.text = element_text(size=12), | |
legend.position="bottom") | |
} | |
plot.hits(xlab="J. Craig Venter - PacBio + FALCON assembly (2015)") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment