Skip to content

Instantly share code, notes, and snippets.

@drio
Created May 4, 2010 20:40
Show Gist options
  • Save drio/389944 to your computer and use it in GitHub Desktop.
Save drio/389944 to your computer and use it in GitHub Desktop.
r
R version 2.9.2 (2009-08-24)
Copyright (C) 2009 The R Foundation for Statistical Computing
ISBN 3-900051-07-0
R is free software and comes with ABSOLUTELY NO WARRANTY.
You are welcome to redistribute it under certain conditions.
Type 'license()' or 'licence()' for distribution details.
R is a collaborative project with many contributors.
Type 'contributors()' for more information and
'citation()' on how to cite R or R packages in publications.
Type 'demo()' for some demos, 'help()' for on-line help, or
'help.start()' for an HTML browser interface to help.
Type 'q()' to quit R.
> ## Arguments
> ## in.file=STRING
> ## values: input file from executing `dqc postalignqc`
> ##
> ## out.type=STRING
> ## values: pdf, png
> ##
> ## space=INT
> ## values: 0 (NT Space), 1 (Color Space)
> ##
> ## expid=STRING
> ## values: experiment name
> ##
> ## Run this with:
> ## R CMD BATCH '--args in.file="" out.type="" space="" expid=""' plot_dqc_preqc.R
> ## where you add in the appropriate parameters
>
> ## TODO
> ## move legends outside the plot area
> ## what about mapping quality vs. alignment score ?
> ## include genome length as a parameter
>
> my.colors=c("green", "blue", "red", "orange", "black");
> my.lwd=2;
> my.cex.main=1.5;
> my.cex.axis=1.5;
> my.cex.lab=1.5
>
> plot.err.dist.by.position <- function(x,
+ y,
+ main,
+ xlab,
+ ylab,
+ paired.end,
+ space,
+ expid)
+ {
+ color=c("green", "blue");
+ legend=c("first end", "second end");
+
+ # plot
+ plot(x=range(x),
+ y=c(0, max(y)+0.5),
+ xaxt = "n",
+ main=paste(expid, "\n", main, sep=""),
+ xlab=xlab,
+ ylab=ylab,
+ cex.main=my.cex.main,
+ cex.axis=my.cex.axis,
+ cex.lab=my.cex.lab,
+ type="n");
+
+ axis(side=1, # x-axis,
+ at=seq(from=min(x), to=max(x), by=5),
+ cex.axis=my.cex.axis,
+ cex.lab=my.cex.lab,
+ label=TRUE);
+
+ if(1==paired.end) {
+ for(i in 1:ncol(y)) {
+ lines(x=x,
+ y=y[,i],
+ type="l",
+ lwd=my.lwd,
+ col=color[i]);
+ }
+ } else {
+ lines(x=x,
+ y=y,
+ type="l",
+ lwd=my.lwd,
+ col=color[1]);
+ }
+
+ # draw horizontal lines
+ for(i in 0:max(y)) {
+ abline(h=i, col="grey");
+ }
+
+ # draw a legend
+ if(1==paired.end) {
+ legend(x="bottomright",
+ legend=legend[1:ncol(y)],
+ col=color[1:ncol(y)],
+ lty = 1:ncol(y),
+ pch = 1:ncol(y),
+ bg='gray');
+ } else {
+ legend(x="bottomright",
+ legend=legend[1],
+ col=color[1],
+ lty = 1,
+ pch = 1,
+ lwd=my.lwd,
+ bg='gray');
+ }
+ }
>
> plot.err.dist.across.reads <- function(x,
+ y,
+ main,
+ xlab,
+ ylab,
+ paired.end,
+ expid)
+ {
+ color=c("green", "blue", "red", "purple");
+ legend=c("first end",
+ "second end",
+ "first end (cumulative)",
+ "second end (cumulative)");
+
+ # plot
+ plot(x=range(x),
+ y=c(0,100),
+ main=paste(expid, "\n", main, sep=""),
+ xlab=xlab,
+ ylab=ylab,
+ cex.main=my.cex.main,
+ cex.axis=my.cex.axis,
+ cex.lab=my.cex.lab,
+ type="n"
+ );
+
+ if(1==paired.end) {
+ for(i in 1:ncol(y)) {
+ # individual
+ lines(x=x,
+ y=y[,i],
+ col=color[i],
+ lwd=my.lwd,
+ type="l");
+ # cumulative
+ y.temp = rep(0, nrow(y));
+ for(j in 1:nrow(y)) {
+ y.temp[j] = sum(as.numeric(y[1:j,i]));
+ }
+
+ lines(x=x,
+ y=y.temp,
+ col=color[i+ncol(y)],
+ lwd=my.lwd,
+ type="l");
+ }
+ } else {
+ # individual
+ lines(x=x,
+ y=y,
+ col=color[1],
+ lwd=my.lwd,
+ type="l");
+ # cumulative
+ y.temp = rep(0, length(y));
+ for(j in 1:length(y)) {
+ y.temp[j] = sum(as.numeric(y[1:j]));
+ }
+
+ lines(x=x,
+ y=y.temp,
+ col=color[3],
+ lwd=my.lwd,
+ type="l");
+ }
+
+ # draw horizontal lines
+ for(i in seq(from=0,to=100,by=5)) {
+ abline(h=i, col="grey");
+ }
+
+ # draw vertical lines
+ for(i in 0:max(x)) {
+ abline(v=i, col="grey");
+ }
+
+ # draw a legend
+ if(1==paired.end) {
+ legend(x=2*max(x)/3,
+ y=60,
+ legend=legend[1:(2*ncol(y))],
+ col=color[1:(2*ncol(y))],
+ lty = 1:(2*ncol(y)),
+ pch = 1:(2*ncol(y)),
+ lwd=my.lwd,
+ bg='gray');
+ } else {
+ legend(x=2*max(x)/3,
+ y=60,
+ legend=legend[c(1,3)],
+ col=color[c(1,3)],
+ lty = 1:2,
+ pch = 1:2,
+ lwd=my.lwd,
+ bg='gray');
+ }
+ }
>
> plot.err.dist <- function(out.file,
+ err.bases.by,
+ err.bases.across,
+ err.colors.by,
+ err.colors.across,
+ expid,
+ space=0)
+ {
+ stopifnot((space == 0) || (space == 1));
+
+ graphics.off();
+
+ if(space == 1) {
+ # 2 rows, 2 columns
+ pdf(out.file, width=16, height=16);
+ par(mfrow=c(2,2),
+ mar=c(5,4,6,2));
+ } else {
+ # 1 row, 2 columns
+ pdf(out.file, width=16, height=8);
+ par(mfrow=c(1,2),
+ mar=c(5,4,6,2));
+ }
+
+
+ my.cols = 2;
+ paired.end = 0;
+ if(ncol(err.bases.by) == 5) {
+ if(0 < sum(as.numeric(err.bases.by[,2]))) {
+ # paired end TODO: support for >2 ends
+ my.cols = c(2, 4);
+ paired.end=1;
+ } else {
+ stopifnot(0 < sum(as.numeric(err.bases.by[,4])));
+ my.cols = 4;
+ }
+ }
+
+ err.bases.by.x = err.bases.by[,1]+1;
+ if(0==paired.end) {
+ err.bases.by.y = 100*err.bases.by[,my.cols]/err.bases.by[,my.cols+1];
+ } else {
+ err.bases.by.y = err.bases.by[,my.cols];
+ err.bases.by.y[,1] = 100*err.bases.by[,2]/err.bases.by[,3];
+ err.bases.by.y[,2] = rev(100*err.bases.by[,4]/err.bases.by[,5]);
+ }
+
+ err.bases.across.x = err.bases.across[,1];
+ if(0==paired.end) {
+ err.bases.across.y = 100*err.bases.across[,my.cols]/sum(as.numeric(err.bases.across[,my.cols]));
+ } else {
+ err.bases.across.y = err.bases.across[,my.cols];
+ err.bases.across.y[,1] = 100*err.bases.across[,2]/sum(as.numeric(err.bases.across[,2]));
+ err.bases.across.y[,2] = 100*err.bases.across[,4]/sum(as.numeric(err.bases.across[,4]));
+ }
+
+ plot.err.dist.by.position(err.bases.by.x,
+ err.bases.by.y,
+ main=expid,
+ xlab="base position in the read",
+ ylab="nt mismatch %",
+ paired.end,
+ space,
+ expid);
+
+ plot.err.dist.across.reads(err.bases.across.x,
+ err.bases.across.y,
+ main=expid,
+ xlab="number of nt mismatchs in a read",
+ ylab="% of reads",
+ paired.end,
+ expid);
+
+ if(space == 1) {
+ err.colors.by.x = err.colors.by[,1]+1;
+ if(0==paired.end) {
+ err.colors.by.y = 100*err.colors.by[,my.cols]/err.colors.by[,my.cols+1];
+ } else {
+ err.colors.by.y = err.colors.by[,my.cols];
+ err.colors.by.y[,1] = 100*err.colors.by[,2]/err.colors.by[,3];
+ err.colors.by.y[,2] = 100*err.colors.by[,4]/err.colors.by[,5];
+ }
+
+ err.colors.across.x = err.colors.across[,1];
+ if(0==paired.end) {
+ err.colors.across.y = 100*err.colors.across[,my.cols]/sum(as.numeric(err.colors.across[,my.cols]));
+ } else {
+ err.colors.across.y = err.colors.across[,my.cols];
+ err.colors.across.y[,1] = 100*err.colors.across[,2]/sum(as.numeric(err.colors.across[,2]));
+ err.colors.across.y[,2] = 100*err.colors.across[,4]/sum(as.numeric(err.colors.across[,4]));
+ }
+
+ plot.err.dist.by.position(err.colors.by.x,
+ err.colors.by.y,
+ main="Color error rate at a specific position in the base",
+ xlab="base position in the read",
+ ylab="color error %",
+ paired.end,
+ space,
+ expid);
+
+ plot.err.dist.across.reads(err.colors.across.x,
+ err.colors.across.y,
+ main="% of reads with a given number of color errors in a read",
+ xlab="number of color errors in a read",
+ ylab="% of reads",
+ paired.end,
+ expid);
+ }
+
+ # End plotting
+ graphics.off();
+ }
>
>
> plot.insert.dist <- function(out.file, insert.dist, expid,
+ freq=TRUE)
+ {
+
+ x=insert.dist;
+ if(0 == sum(as.numeric(x[,3]))) {
+ # Single end
+ return (0);
+ }
+
+ if("pdf" == out.type) {
+ pdf(file=out.file, width=12, height=6);
+ } else {
+ stopifnot(1==0); # TODO
+ }
+
+ from=min(x[,1]);
+ by=abs(x[1,1]-x[1,2])+1;
+ to=max(x[,1]);
+
+ breaks=seq(from=from, to=to, by=by);
+ main=paste(expid, " Paired end insert distribution", sep="");
+ xlab="Paired end insert distance (bp)";
+ if(freq) {
+ ylab="percentage of paired end reads";
+ } else {
+ ylab="# of paired end reads";
+ }
+
+ # assumption:
+ stopifnot(x[,1] == x[,2]);
+
+ y = c(x[x[,1]==0,3], (x[0<x[,1],3] + rev(x[x[,1]<0,3])));
+ x = seq(from=0, by=1, to=max(x[,1]));
+
+ if(freq) {
+ y = y/sum(as.numeric(y));
+ }
+
+ bins = seq(from=from, to=to, by=by);
+ bins[] = 0;
+ i=from;
+ j=1;
+ while(i<=to) {
+ bins[j] = sum(as.numeric(y[i <= x & x <= min(to, i+by-1)]));
+ i = i+by;
+ j = j+1;
+ }
+ x=seq(from=from, to=to, by=by);
+ y=bins;
+
+ plot(x=x,
+ y=y,
+ main=main,
+ xlab=xlab,
+ ylab=ylab,
+ type="h");
+ graphics.off();
+
+
+ }
>
> plot.start.site.dist <- function(out.file, ss.dist, expid,
+ max.start.sites=10, genome.length=3.1*10^9, my.rand.trials=100000)
+ {
+ # open output file
+ if("pdf" == out.type) {
+ pdf(file=out.file, width=12, height=12);
+ } else {
+ stopifnot(1==0); # TODO
+ }
+
+ # number of plots
+ par(xpd=TRUE,
+ mar=c(5,4,4,8));
+
+ total.reads = sum(as.numeric(ss.dist[,2]));
+ mean = sum(as.numeric(ss.dist[,2]*ss.dist[,1]))/total.reads;
+
+ barplot(height=ss.dist[2:(max.start.sites+1),2]/total.reads,
+ names=1:max.start.sites,
+ main=paste(expid, " Start Site Distribution", sep=""),
+ xlab="Number of reads at a given start site",
+ ylab="Fraction of mapped reads",
+ ylim=c(0,1),
+ axes=TRUE);
+
+ # plot theoretical
+ mu = total.reads/genome.length;
+ my.rand = rpois(my.rand.trials, mu) + 1;
+ h=hist(my.rand, plot=FALSE, breaks=-1:max(my.rand) + 0.5);
+ my.counts = h$counts[2:length(h$counts)];
+ if(length(my.counts) < max.start.sites) {
+ my.counts = append(my.counts, rep(0, max.start.sites-length(my.counts)));
+ }
+ points(1:length(my.counts),
+ my.counts/sum(as.numeric(my.counts)),
+ pch=4,
+ cex=4,
+ col="green");
+ lines(1:length(my.counts),
+ my.counts/sum(as.numeric(my.counts)),
+ lty="dashed",
+ lwd=4,
+ col="green");
+
+ # close output file
+ graphics.off();
+ }
>
> plot.mapq.as.dists <- function(out.file, mapq.dist, as.dist, expid)
+ {
+ # open output file
+ if("pdf" == out.type) {
+ pdf(file=out.file, width=12, height=12);
+ } else {
+ stopifnot(1==0); # TODO
+ }
+
+ # number of plots
+ par(mfrow=c(2,1),
+ xpd=TRUE,
+ mar=c(5,4,4,8));
+
+ # HERE
+ print(mapq.dist);
+
+ # mapping quality distribution
+ plot(x=range(mapq.dist[,1]),
+ y=c(0, max(log10(sum(as.numeric(mapq.dist[,2]))))),
+ xlab="Mapping quality",
+ ylab="Number of reads (log10)",
+ main=paste(expid, " Mapping Quality Distribution", sep=""),
+ type="n");
+ points(x=mapq.dist[,1],
+ y=log10(mapq.dist[,2]),
+ pch=4,
+ col=my.colors[1]);
+ y.cuml=mapq.dist[,2];
+ for(i in (length(y.cuml)-1):1) {
+ y.cuml[i] = y.cuml[i+1] + y.cuml[i];
+ }
+ points(x=mapq.dist[,1],
+ y=log10(y.cuml),
+ pch=4,
+ col=my.colors[2]);
+ legend(x=par("usr")[2] + 1,
+ y=max(log10(mapq.dist[,2])),
+ title="distribution",
+ legend=c("point", "cumulative"),
+ lwd=1,
+ col=my.colors[1:2]);
+
+ # alignment score distribution
+ plot(x=range(as.dist[,1]),
+ y=c(0, max(log10(sum(as.numeric(as.dist[,2]))))),
+ xlab="Alignment score",
+ ylab="Number of reads (log10)",
+ main=paste(expid, " Alignment Score Distribution", sep=""),
+ type="n");
+ points(x=as.dist[,1],
+ y=log10(as.dist[,2]),
+ pch=4,
+ col=my.colors[1]);
+ y.cuml=as.dist[,2];
+ for(i in (length(y.cuml)-1):1) {
+ y.cuml[i] = y.cuml[i+1] + y.cuml[i];
+ }
+ points(x=as.dist[,1],
+ y=log10(y.cuml),
+ pch=4,
+ col=my.colors[2]);
+ legend(x=par("usr")[2] + 1,
+ y=max(log10(as.dist[,2])),
+ title="distribution",
+ legend=c("point", "cumulative"),
+ lwd=1,
+ col=my.colors[1:2]);
+
+ # close output file
+ graphics.off();
+ }
>
> plot.dqc.postalignqc <- function(in.file, out.type="pdf", space=0, expid="expid")
+ {
+ ctr.line=0;
+ # read in the number of reads
+ data = scan(file=in.file, what=integer(0), nmax=1, quiet=TRUE);
+ err.n_reads = data[1];
+ ctr.line=ctr.line+1;
+
+ # read in read length
+ data = scan(file=in.file, what=integer(0), skip=ctr.line, nmax=1, quiet=TRUE);
+ tmp.rl = data[1];
+ ctr.line=ctr.line+1;
+ # read in counts (base errors)
+ err.bases.by = read.table(file=in.file, skip=ctr.line, nrows=tmp.rl);
+ ctr.line=ctr.line+tmp.rl;
+
+ if(1 == space) {
+ # read in read length
+ data = scan(file=in.file, what=integer(0), skip=ctr.line, nmax=1, quiet=TRUE);
+ tmp.rl = data[1];
+ ctr.line=ctr.line+1;
+ # read in counts (color errors)
+ err.colors.by = read.table(file=in.file, skip=ctr.line, nrows=tmp.rl);
+ ctr.line=ctr.line+tmp.rl;
+ }
+
+ # read in read length
+ # note: we want to skip over the next block (gaps as errors)
+ data = scan(file=in.file, what=integer(0), skip=ctr.line, nmax=1, quiet=TRUE);
+ tmp.rl = data[1];
+ ctr.line=ctr.line+1+tmp.rl;
+
+ # read in max base errors across reads
+ data = scan(file=in.file, what=integer(0), skip=ctr.line, nmax=1, quiet=TRUE);
+ tmp.be = data[1];
+ ctr.line=ctr.line+1;
+ # read in counts (base errors across reads)
+ err.bases.across = read.table(file=in.file, skip=ctr.line, nrows=tmp.be);
+ ctr.line=ctr.line+tmp.be;
+
+ if(1 == space) {
+ # read in max color errors across reads
+ data = scan(file=in.file, what=integer(0), skip=ctr.line, nmax=1, quiet=TRUE);
+ tmp.ce = data[1];
+ ctr.line=ctr.line+1;
+ # read in counts (color errors across reads)
+ err.colors.across = read.table(file=in.file, skip=ctr.line, nrows=tmp.ce);
+ ctr.line=ctr.line+tmp.ce;
+ }
+
+ # read in read length
+ # note: we want to skip over the next block (gaps as errors)
+ data = scan(file=in.file, what=integer(0), skip=ctr.line, nmax=1, quiet=TRUE);
+ tmp.ge = data[1];
+ ctr.line=ctr.line+1+tmp.ge;
+
+ # read in the number of bins
+ data = scan(file=in.file, what=integer(0), skip=ctr.line, nmax=1, quiet=TRUE);
+ n.bins = data[1];
+ ctr.line=ctr.line+1;
+ # read in the bins
+ insert.dist = read.table(file=in.file, skip=ctr.line, nrows=n.bins);
+ ctr.line=ctr.line+n.bins;
+
+ # read in the number of raw reads (pe, se)
+ data = read.table(file=in.file, skip=ctr.line, nrows=1);
+ n.raw.pe = data[1,1];
+ n.raw.se = data[1,2];
+ ctr.line=ctr.line+1;
+ # read in the number of mapped (pe, up, se)
+ data = read.table(file=in.file, skip=ctr.line, nrows=1);
+ n.mapped.pe = data[1,1];
+ n.mapped.up = data[1,2];
+ n.mapped.se = data[1,3];
+ ctr.line=ctr.line+1;
+ # read in the number of mapped pcr duplicates (pe, up, se)
+ data = read.table(file=in.file, skip=ctr.line, nrows=1);
+ n.mapped.pcr.pe = data[1,1];
+ n.mapped.pcr.up = data[1,2];
+ n.mapped.pcr.se = data[1,3];
+ ctr.line=ctr.line+1;
+
+ # read in the mapping quality distribution
+ mapq.dist = read.table(file=in.file, skip=ctr.line, nrows=256);
+ ctr.line=ctr.line+256;
+
+ # read in the number of discrete alignment scores
+ data = scan(file=in.file, what=integer(0), skip=ctr.line, nmax=3, quiet=TRUE);
+ n.as = data[1];
+ ctr.line=ctr.line+1;
+ # read in the alignment score distribution
+ as.dist = read.table(file=in.file, skip=ctr.line, nrows=n.as);
+ ctr.line=ctr.line+n.as;
+
+ # read in the number of start sites
+ data = scan(file=in.file, what=integer(0), skip=ctr.line, nmax=1, quiet=TRUE);
+ n.ss = data[1];
+ ctr.line=ctr.line+1;
+ # read in the start site distribution
+ ss.dist = read.table(file=in.file, skip=ctr.line, nrows=n.ss);
+ ctr.line=ctr.line+n.ss;
+
+ plot.mapq.as.dists(paste(in.file, ".mapq.as.dists.pdf", sep=""),
+ mapq.dist, as.dist, expid);
+ plot.start.site.dist(paste(in.file, ".start.site.dist.pdf", sep=""), ss.dist, expid);
+ plot.insert.dist(paste(in.file, ".insert.dist.pdf", sep=""), insert.dist, expid);
+ plot.err.dist(paste(in.file, ".err.dist.pdf", sep=""),
+ err.bases.by, err.bases.across, err.colors.by, err.colors.across, expid, space)
+ }
>
> args=(commandArgs(TRUE));
> if(length(args)==0) {
+ print("No arguments supplied.")
+ } else {
+ out.type="pdf";
+ space=0;
+
+ for(i in 1:length(args)){
+ eval(parse(text=args[[i]]));
+ }
+
+ plot.dqc.postalignqc(in.file, out.type, space, expid);
+ }
Error in read.table(file = in.file, skip = ctr.line, nrows = n.bins) :
invalid 'nlines' argument
Calls: plot.dqc.postalignqc -> read.table
Execution halted
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment