drio · May 4, 2010 20:40
diff --git a/gistfile1 b/gistfile1

 R version 2.9.2 (2009-08-24)
 Copyright (C) 2009 The R Foundation for Statistical Computing
 ISBN 3-900051-07-0

 R is free software and comes with ABSOLUTELY NO WARRANTY.
 You are welcome to redistribute it under certain conditions.
 Type 'license()' or 'licence()' for distribution details.

 R is a collaborative project with many contributors.
 Type 'contributors()' for more information and
 'citation()' on how to cite R or R packages in publications.

 Type 'demo()' for some demos, 'help()' for on-line help, or
 'help.start()' for an HTML browser interface to help.
 Type 'q()' to quit R.

 > ## Arguments
 > ##   in.file=STRING
 > ##		values: input file from executing `dqc postalignqc`
 > ##		
 > ##	 out.type=STRING
 > ##		values: pdf, png
 > ##
 > ##   space=INT
 > ##      values: 0 (NT Space), 1 (Color Space)
 > ##
 > ##   expid=STRING
 > ##      values: experiment name
 > ##
 > ## Run this with:
 > ##   R CMD BATCH '--args in.file="" out.type="" space="" expid=""' plot_dqc_preqc.R
 > ## where you add in the appropriate parameters
 > 
 > ## TODO
 > ##   move legends outside the plot area
 > ##   what about mapping quality vs. alignment score ?
 > ##   include genome length as a parameter
 > 
 > my.colors=c("green", "blue", "red", "orange", "black");
 > my.lwd=2;
 > my.cex.main=1.5;
 > my.cex.axis=1.5;
 > my.cex.lab=1.5
 > 
 > plot.err.dist.by.position <- function(x,
 + 		y,
 + 		main,
 + 		xlab,
 + 		ylab,
 + 		paired.end,
 + 		space,
 + 		expid)
 + {
 + 	color=c("green", "blue");
 + 	legend=c("first end", "second end");
 + 
 + # plot
 + 	plot(x=range(x),
 + 			y=c(0, max(y)+0.5),
 + 			xaxt = "n",
 + 			main=paste(expid, "\n", main, sep=""),
 + 			xlab=xlab,
 + 			ylab=ylab,
 + 			cex.main=my.cex.main,
 + 			cex.axis=my.cex.axis,
 + 			cex.lab=my.cex.lab,
 + 			type="n");
 + 
 + 	axis(side=1, # x-axis,
 + 			at=seq(from=min(x), to=max(x), by=5),
 + 			cex.axis=my.cex.axis,
 + 			cex.lab=my.cex.lab,
 + 			label=TRUE);
 + 
 + 	if(1==paired.end) {
 + 		for(i in 1:ncol(y)) {
 + 			lines(x=x,
 + 					y=y[,i],
 + 					type="l",
 + 					lwd=my.lwd,
 + 					col=color[i]);
 + 		}
 + 	} else {
 + 		lines(x=x,
 + 				y=y,
 + 				type="l",
 + 				lwd=my.lwd,
 + 				col=color[1]);
 + 	}
 + 
 + # draw horizontal lines
 + 	for(i in 0:max(y)) {
 + 		abline(h=i, col="grey");
 + 	}
 + 
 + # draw a legend
 + 	if(1==paired.end) {
 + 		legend(x="bottomright",
 + 				legend=legend[1:ncol(y)],
 + 				col=color[1:ncol(y)],
 + 				lty = 1:ncol(y),
 + 				pch = 1:ncol(y),
 + 				bg='gray');
 + 	} else {
 + 		legend(x="bottomright",
 + 				legend=legend[1],
 + 				col=color[1],
 + 				lty = 1,
 + 				pch = 1,
 + 				lwd=my.lwd,
 + 				bg='gray');
 + 	}
 + }
 > 
 > plot.err.dist.across.reads <- function(x,
 + 		y,
 + 		main,
 + 		xlab,
 + 		ylab,
 + 		paired.end,
 + 		expid)
 + {
 + 	color=c("green", "blue", "red", "purple");
 + 	legend=c("first end",
 + 			"second end",
 + 			"first end (cumulative)",
 + 			"second end (cumulative)");
 + 
 + # plot
 + 	plot(x=range(x),
 + 			y=c(0,100),
 + 			main=paste(expid, "\n", main, sep=""),
 + 			xlab=xlab,
 + 			ylab=ylab,
 + 			cex.main=my.cex.main,
 + 			cex.axis=my.cex.axis,
 + 			cex.lab=my.cex.lab,
 + 			type="n"
 + 		);
 + 
 + 	if(1==paired.end) {
 + 		for(i in 1:ncol(y)) {
 + # individual
 + 			lines(x=x,
 + 					y=y[,i],
 + 					col=color[i],
 + 					lwd=my.lwd,
 + 					type="l");
 + # cumulative
 + 			y.temp = rep(0, nrow(y));
 + 			for(j in 1:nrow(y)) {
 + 				y.temp[j] = sum(as.numeric(y[1:j,i]));
 + 			}
 + 
 + 			lines(x=x,
 + 					y=y.temp,
 + 					col=color[i+ncol(y)],
 + 					lwd=my.lwd,
 + 					type="l");
 + 		}
 + 	} else {
 + # individual
 + 		lines(x=x,
 + 				y=y,
 + 				col=color[1],
 + 				lwd=my.lwd,
 + 				type="l");
 + # cumulative
 + 		y.temp = rep(0, length(y));
 + 		for(j in 1:length(y)) {
 + 			y.temp[j] = sum(as.numeric(y[1:j]));
 + 		}
 + 
 + 		lines(x=x,
 + 				y=y.temp,
 + 				col=color[3],
 + 				lwd=my.lwd,
 + 				type="l");
 + 	}
 + 
 + # draw horizontal lines
 + 	for(i in seq(from=0,to=100,by=5)) {
 + 		abline(h=i, col="grey");
 + 	}
 + 
 + # draw vertical lines
 + 	for(i in 0:max(x)) {
 + 		abline(v=i, col="grey");
 + 	}
 + 
 + # draw a legend
 + 	if(1==paired.end) {
 + 		legend(x=2*max(x)/3,
 + 				y=60,
 + 				legend=legend[1:(2*ncol(y))],
 + 				col=color[1:(2*ncol(y))],
 + 				lty = 1:(2*ncol(y)),
 + 				pch = 1:(2*ncol(y)),
 + 				lwd=my.lwd,
 + 				bg='gray');
 + 	} else {
 + 		legend(x=2*max(x)/3,
 + 				y=60,
 + 				legend=legend[c(1,3)],
 + 				col=color[c(1,3)],
 + 				lty = 1:2,
 + 				pch = 1:2,
 + 				lwd=my.lwd,
 + 				bg='gray');
 + 	}
 + }
 > 
 > plot.err.dist <- function(out.file,
 + 		err.bases.by,
 + 		err.bases.across,
 + 		err.colors.by,
 + 		err.colors.across,
 + 		expid,
 + 		space=0) 
 + {
 + 	stopifnot((space == 0) || (space == 1));
 + 
 + 	graphics.off();
 + 
 + 	if(space == 1) {
 + # 2 rows, 2 columns
 + 		pdf(out.file, width=16, height=16);
 + 		par(mfrow=c(2,2),
 + 				mar=c(5,4,6,2));
 + 	} else {
 + # 1 row, 2 columns
 + 		pdf(out.file, width=16, height=8);
 + 		par(mfrow=c(1,2),
 + 				mar=c(5,4,6,2));
 + 	}
 + 
 + 	
 + 	my.cols = 2;
 + 	paired.end = 0;
 + 	if(ncol(err.bases.by) == 5) {
 + 		if(0 < sum(as.numeric(err.bases.by[,2]))) { 
 + # paired end TODO: support for >2 ends
 + 			my.cols = c(2, 4);
 + 			paired.end=1;
 + 		} else {
 + 			stopifnot(0 < sum(as.numeric(err.bases.by[,4])));
 + 			my.cols = 4;
 + 		}
 + 	}
 + 
 + 	err.bases.by.x = err.bases.by[,1]+1;
 + 	if(0==paired.end) {
 + 		err.bases.by.y = 100*err.bases.by[,my.cols]/err.bases.by[,my.cols+1];
 + 	} else {
 + 		err.bases.by.y = err.bases.by[,my.cols];
 + 		err.bases.by.y[,1] = 100*err.bases.by[,2]/err.bases.by[,3];
 + 		err.bases.by.y[,2] = rev(100*err.bases.by[,4]/err.bases.by[,5]);
 + 	}
 + 
 + 	err.bases.across.x = err.bases.across[,1];
 + 	if(0==paired.end) {
 + 		err.bases.across.y = 100*err.bases.across[,my.cols]/sum(as.numeric(err.bases.across[,my.cols]));
 + 	} else {
 + 		err.bases.across.y = err.bases.across[,my.cols];
 + 		err.bases.across.y[,1] = 100*err.bases.across[,2]/sum(as.numeric(err.bases.across[,2]));
 + 		err.bases.across.y[,2] = 100*err.bases.across[,4]/sum(as.numeric(err.bases.across[,4]));
 + 	}
 + 
 + 	plot.err.dist.by.position(err.bases.by.x,
 + 			err.bases.by.y,
 + 			main=expid,
 + 			xlab="base position in the read",
 + 			ylab="nt mismatch %",
 + 			paired.end,
 + 			space,
 + 			expid);
 + 	
 + 	plot.err.dist.across.reads(err.bases.across.x,
 + 			err.bases.across.y,
 + 			main=expid,
 + 			xlab="number of nt mismatchs in a read",
 + 			ylab="% of reads",
 + 			paired.end,
 + 			expid);
 + 
 + 	if(space == 1) {
 + 		err.colors.by.x = err.colors.by[,1]+1;
 + 		if(0==paired.end) {
 + 			err.colors.by.y = 100*err.colors.by[,my.cols]/err.colors.by[,my.cols+1];
 + 		} else {
 + 			err.colors.by.y = err.colors.by[,my.cols];
 + 			err.colors.by.y[,1] = 100*err.colors.by[,2]/err.colors.by[,3];
 + 			err.colors.by.y[,2] = 100*err.colors.by[,4]/err.colors.by[,5];
 + 		}
 + 
 + 		err.colors.across.x = err.colors.across[,1];
 + 		if(0==paired.end) {
 + 			err.colors.across.y = 100*err.colors.across[,my.cols]/sum(as.numeric(err.colors.across[,my.cols]));
 + 		} else {
 + 			err.colors.across.y = err.colors.across[,my.cols];
 + 			err.colors.across.y[,1] = 100*err.colors.across[,2]/sum(as.numeric(err.colors.across[,2]));
 + 			err.colors.across.y[,2] = 100*err.colors.across[,4]/sum(as.numeric(err.colors.across[,4]));
 + 		}
 + 
 + 		plot.err.dist.by.position(err.colors.by.x,
 + 				err.colors.by.y,
 + 				main="Color error rate at a specific position in the base",
 + 				xlab="base position in the read",
 + 				ylab="color error %",
 + 				paired.end,
 + 				space,
 + 				expid);
 + 
 + 		plot.err.dist.across.reads(err.colors.across.x,
 + 				err.colors.across.y,
 + 				main="% of reads with a given number of color errors in a read",
 + 				xlab="number of color errors in a read",
 + 				ylab="% of reads",
 + 				paired.end,
 + 				expid);
 + 	}
 + 
 + # End plotting
 + 	graphics.off();
 + }
 > 
 > 
 > plot.insert.dist <- function(out.file, insert.dist, expid,
 + 		freq=TRUE)
 + {
 + 
 + 	x=insert.dist;
 + 	if(0 == sum(as.numeric(x[,3]))) {
 + # Single end
 + 		return (0);
 + 	}
 + 	
 + 	if("pdf" == out.type) {
 + 		pdf(file=out.file, width=12, height=6);
 + 	} else {
 + 		stopifnot(1==0); # TODO
 + 	}
 + 
 + 	from=min(x[,1]);
 + 	by=abs(x[1,1]-x[1,2])+1;
 + 	to=max(x[,1]);
 + 
 + 	breaks=seq(from=from, to=to, by=by);
 + 	main=paste(expid, " Paired end insert distribution", sep="");
 + 	xlab="Paired end insert distance (bp)";
 + 	if(freq) {
 + 		ylab="percentage of paired end reads";
 + 	} else {
 + 		ylab="# of paired end reads";
 + 	}
 + 
 + # assumption:
 + 	stopifnot(x[,1] == x[,2]);
 + 
 + 	y = c(x[x[,1]==0,3], (x[0<x[,1],3] + rev(x[x[,1]<0,3])));
 + 	x = seq(from=0, by=1, to=max(x[,1]));
 + 
 + 	if(freq) {
 + 		y = y/sum(as.numeric(y));
 + 	}
 + 
 + 	bins = seq(from=from, to=to, by=by);
 + 	bins[] = 0;
 + 	i=from;
 + 	j=1;
 + 	while(i<=to) {
 + 		bins[j] = sum(as.numeric(y[i <= x & x <= min(to, i+by-1)]));
 + 		i = i+by;
 + 		j = j+1;
 + 	}
 + 	x=seq(from=from, to=to, by=by);
 + 	y=bins;
 + 
 + 	plot(x=x,
 + 			y=y,
 + 			main=main,
 + 			xlab=xlab,
 + 			ylab=ylab,
 + 			type="h");
 + 	graphics.off();
 + 
 + 
 + }
 > 
 > plot.start.site.dist <- function(out.file, ss.dist, expid, 
 + 		max.start.sites=10, genome.length=3.1*10^9, my.rand.trials=100000)
 + {
 + # open output file
 + 	if("pdf" == out.type) {
 + 		pdf(file=out.file, width=12, height=12);
 + 	} else {
 + 		stopifnot(1==0); # TODO
 + 	}
 + 
 + # number of plots
 + 	par(xpd=TRUE,
 + 			mar=c(5,4,4,8));
 + 
 + 	total.reads = sum(as.numeric(ss.dist[,2]));
 + 	mean = sum(as.numeric(ss.dist[,2]*ss.dist[,1]))/total.reads;
 + 
 + 	barplot(height=ss.dist[2:(max.start.sites+1),2]/total.reads,
 + 			names=1:max.start.sites,
 + 			main=paste(expid, " Start Site Distribution", sep=""),
 + 			xlab="Number of reads at a given start site",
 + 			ylab="Fraction of mapped reads",
 + 			ylim=c(0,1),
 + 			axes=TRUE);
 + 
 + # plot theoretical
 + 	mu = total.reads/genome.length;
 + 	my.rand = rpois(my.rand.trials, mu) + 1;
 + 	h=hist(my.rand, plot=FALSE, breaks=-1:max(my.rand) + 0.5);
 + 	my.counts = h$counts[2:length(h$counts)];
 + 	if(length(my.counts) < max.start.sites) {
 + 		my.counts = append(my.counts, rep(0, max.start.sites-length(my.counts)));
 + 	}
 + 	points(1:length(my.counts),
 + 			my.counts/sum(as.numeric(my.counts)),
 + 			pch=4,
 + 			cex=4,
 + 			col="green");
 + 	lines(1:length(my.counts),
 + 			my.counts/sum(as.numeric(my.counts)),
 + 			lty="dashed",
 + 			lwd=4,
 + 			col="green");
 + 
 + # close output file
 + 	graphics.off();
 + }
 > 
 > plot.mapq.as.dists <- function(out.file, mapq.dist, as.dist, expid)
 + {
 + # open output file
 + 	if("pdf" == out.type) {
 + 		pdf(file=out.file, width=12, height=12);
 + 	} else {
 + 		stopifnot(1==0); # TODO
 + 	}
 + 
 + # number of plots
 + 	par(mfrow=c(2,1),
 + 			xpd=TRUE,
 + 			mar=c(5,4,4,8));
 + 
 + # HERE
 + 	print(mapq.dist);
 + 
 + # mapping quality distribution
 + 	plot(x=range(mapq.dist[,1]),
 + 			y=c(0, max(log10(sum(as.numeric(mapq.dist[,2]))))),
 + 			xlab="Mapping quality",
 + 			ylab="Number of reads (log10)",
 + 			main=paste(expid, " Mapping Quality Distribution", sep=""),
 + 			type="n");
 + 	points(x=mapq.dist[,1],
 + 			y=log10(mapq.dist[,2]),
 + 			pch=4,
 + 			col=my.colors[1]);
 + 	y.cuml=mapq.dist[,2];
 + 	for(i in (length(y.cuml)-1):1) {
 + 		y.cuml[i] = y.cuml[i+1] + y.cuml[i];
 + 	}
 + 	points(x=mapq.dist[,1],
 + 			y=log10(y.cuml),
 + 			pch=4,
 + 			col=my.colors[2]);
 + 	legend(x=par("usr")[2] + 1,
 + 			y=max(log10(mapq.dist[,2])),
 + 			title="distribution",
 + 			legend=c("point", "cumulative"),
 + 			lwd=1,
 + 			col=my.colors[1:2]);
 + 
 + # alignment score distribution
 + 	plot(x=range(as.dist[,1]),
 + 			y=c(0, max(log10(sum(as.numeric(as.dist[,2]))))),
 + 			xlab="Alignment score",
 + 			ylab="Number of reads (log10)",
 + 			main=paste(expid, " Alignment Score Distribution", sep=""),
 + 			type="n");
 + 	points(x=as.dist[,1],
 + 			y=log10(as.dist[,2]),
 + 			pch=4,
 + 			col=my.colors[1]);
 + 	y.cuml=as.dist[,2];
 + 	for(i in (length(y.cuml)-1):1) {
 + 		y.cuml[i] = y.cuml[i+1] + y.cuml[i];
 + 	}
 + 	points(x=as.dist[,1],
 + 			y=log10(y.cuml),
 + 			pch=4,
 + 			col=my.colors[2]);
 + 	legend(x=par("usr")[2] + 1,
 + 			y=max(log10(as.dist[,2])),
 + 			title="distribution",
 + 			legend=c("point", "cumulative"),
 + 			lwd=1,
 + 			col=my.colors[1:2]);
 + 
 + # close output file
 + 	graphics.off();
 + }
 > 
 > plot.dqc.postalignqc <- function(in.file, out.type="pdf", space=0, expid="expid")
 + {
 + 	ctr.line=0;
 + # read in the number of reads
 + 	data = scan(file=in.file, what=integer(0), nmax=1, quiet=TRUE);
 + 	err.n_reads = data[1]; 
 + 	ctr.line=ctr.line+1;
 + 
 + # read in read length
 + 	data = scan(file=in.file, what=integer(0), skip=ctr.line, nmax=1, quiet=TRUE);
 + 	tmp.rl = data[1];
 + 	ctr.line=ctr.line+1;
 + # read in counts (base errors)
 + 	err.bases.by = read.table(file=in.file, skip=ctr.line, nrows=tmp.rl);
 + 	ctr.line=ctr.line+tmp.rl;
 + 
 + 	if(1 == space) {
 + # read in read length
 + 		data = scan(file=in.file, what=integer(0), skip=ctr.line, nmax=1, quiet=TRUE);
 + 		tmp.rl = data[1];
 + 		ctr.line=ctr.line+1;
 + # read in counts (color errors)
 + 		err.colors.by = read.table(file=in.file, skip=ctr.line, nrows=tmp.rl);
 + 		ctr.line=ctr.line+tmp.rl;
 + 	}
 + 
 + # read in read length
 + #   note: we want to skip over the next block (gaps as errors)
 + 	data = scan(file=in.file, what=integer(0), skip=ctr.line, nmax=1, quiet=TRUE);
 + 	tmp.rl = data[1];
 + 	ctr.line=ctr.line+1+tmp.rl;
 + 
 + # read in max base errors across reads
 + 	data = scan(file=in.file, what=integer(0), skip=ctr.line, nmax=1, quiet=TRUE);
 + 	tmp.be = data[1];
 + 	ctr.line=ctr.line+1;
 + # read in counts (base errors across reads)
 + 	err.bases.across = read.table(file=in.file, skip=ctr.line, nrows=tmp.be);
 + 	ctr.line=ctr.line+tmp.be;
 + 
 + 	if(1 == space) {
 + # read in max color errors across reads
 + 		data = scan(file=in.file, what=integer(0), skip=ctr.line, nmax=1, quiet=TRUE);
 + 		tmp.ce = data[1];
 + 		ctr.line=ctr.line+1;
 + # read in counts (color errors across reads)
 + 		err.colors.across = read.table(file=in.file, skip=ctr.line, nrows=tmp.ce);
 + 		ctr.line=ctr.line+tmp.ce;
 + 	}
 + 
 + # read in read length
 + #   note: we want to skip over the next block (gaps as errors)
 + 	data = scan(file=in.file, what=integer(0), skip=ctr.line, nmax=1, quiet=TRUE);
 + 	tmp.ge = data[1];
 + 	ctr.line=ctr.line+1+tmp.ge;
 + 
 + # read in the number of bins
 + 	data = scan(file=in.file, what=integer(0), skip=ctr.line, nmax=1, quiet=TRUE);
 + 	n.bins = data[1];
 + 	ctr.line=ctr.line+1;
 + # read in the bins
 + 	insert.dist = read.table(file=in.file, skip=ctr.line, nrows=n.bins);
 + 	ctr.line=ctr.line+n.bins;
 + 
 + # read in the number of raw reads (pe, se)
 + 	data = read.table(file=in.file, skip=ctr.line, nrows=1);
 + 	n.raw.pe = data[1,1];
 + 	n.raw.se = data[1,2];
 + 	ctr.line=ctr.line+1;
 + # read in the number of mapped (pe, up, se)
 + 	data = read.table(file=in.file, skip=ctr.line, nrows=1);
 + 	n.mapped.pe = data[1,1];
 + 	n.mapped.up = data[1,2];
 + 	n.mapped.se = data[1,3];
 + 	ctr.line=ctr.line+1;
 + # read in the number of mapped pcr duplicates (pe, up, se)
 + 	data = read.table(file=in.file, skip=ctr.line, nrows=1);
 + 	n.mapped.pcr.pe = data[1,1];
 + 	n.mapped.pcr.up = data[1,2];
 + 	n.mapped.pcr.se = data[1,3];
 + 	ctr.line=ctr.line+1;
 + 
 + # read in the mapping quality distribution
 + 	mapq.dist = read.table(file=in.file, skip=ctr.line, nrows=256); 
 + 	ctr.line=ctr.line+256;
 + 
 + # read in the number of discrete alignment scores
 + 	data = scan(file=in.file, what=integer(0), skip=ctr.line, nmax=3, quiet=TRUE);
 + 	n.as = data[1];
 + 	ctr.line=ctr.line+1;
 + # read in the alignment score distribution
 + 	as.dist = read.table(file=in.file, skip=ctr.line, nrows=n.as);
 + 	ctr.line=ctr.line+n.as;
 + 
 + # read in the number of start sites
 + 	data = scan(file=in.file, what=integer(0), skip=ctr.line, nmax=1, quiet=TRUE);
 + 	n.ss = data[1];
 + 	ctr.line=ctr.line+1;
 + # read in the start site distribution
 + 	ss.dist = read.table(file=in.file, skip=ctr.line, nrows=n.ss);
 + 	ctr.line=ctr.line+n.ss;
 + 
 + 	plot.mapq.as.dists(paste(in.file, ".mapq.as.dists.pdf", sep=""),
 + 			mapq.dist, as.dist, expid);
 + 	plot.start.site.dist(paste(in.file, ".start.site.dist.pdf", sep=""), ss.dist, expid);
 + 	plot.insert.dist(paste(in.file, ".insert.dist.pdf", sep=""), insert.dist, expid);
 + 	plot.err.dist(paste(in.file, ".err.dist.pdf", sep=""), 
 + 		err.bases.by, err.bases.across, err.colors.by, err.colors.across, expid, space)
 + }
 > 
 > args=(commandArgs(TRUE));
 > if(length(args)==0) {
 + 	print("No arguments supplied.")
 + } else {
 + 	out.type="pdf";
 + 	space=0;
 + 
 + 	for(i in 1:length(args)){
 + 		eval(parse(text=args[[i]]));
 + 	}
 + 
 + 	plot.dqc.postalignqc(in.file, out.type, space, expid);
 + }
 Error in read.table(file = in.file, skip = ctr.line, nrows = n.bins) : 
  invalid 'nlines' argument
 Calls: plot.dqc.postalignqc -> read.table
 Execution halted

	R version 2.9.2 (2009-08-24)
	Copyright (C) 2009 The R Foundation for Statistical Computing
	ISBN 3-900051-07-0

	R is free software and comes with ABSOLUTELY NO WARRANTY.
	You are welcome to redistribute it under certain conditions.
	Type 'license()' or 'licence()' for distribution details.

	R is a collaborative project with many contributors.
	Type 'contributors()' for more information and
	'citation()' on how to cite R or R packages in publications.

	Type 'demo()' for some demos, 'help()' for on-line help, or
	'help.start()' for an HTML browser interface to help.
	Type 'q()' to quit R.

	> ## Arguments
	> ## in.file=STRING
	> ## values: input file from executing `dqc postalignqc`
	> ##
	> ## out.type=STRING
	> ## values: pdf, png
	> ##
	> ## space=INT
	> ## values: 0 (NT Space), 1 (Color Space)
	> ##
	> ## expid=STRING
	> ## values: experiment name
	> ##
	> ## Run this with:
	> ## R CMD BATCH '--args in.file="" out.type="" space="" expid=""' plot_dqc_preqc.R
	> ## where you add in the appropriate parameters
	>
	> ## TODO
	> ## move legends outside the plot area
	> ## what about mapping quality vs. alignment score ?
	> ## include genome length as a parameter
	>
	> my.colors=c("green", "blue", "red", "orange", "black");
	> my.lwd=2;
	> my.cex.main=1.5;
	> my.cex.axis=1.5;
	> my.cex.lab=1.5
	>
	> plot.err.dist.by.position <- function(x,
	+ y,
	+ main,
	+ xlab,
	+ ylab,
	+ paired.end,
	+ space,
	+ expid)
	+ {
	+ color=c("green", "blue");
	+ legend=c("first end", "second end");
	+
	+ # plot
	+ plot(x=range(x),
	+ y=c(0, max(y)+0.5),
	+ xaxt = "n",
	+ main=paste(expid, "\n", main, sep=""),
	+ xlab=xlab,
	+ ylab=ylab,
	+ cex.main=my.cex.main,
	+ cex.axis=my.cex.axis,
	+ cex.lab=my.cex.lab,
	+ type="n");
	+
	+ axis(side=1, # x-axis,
	+ at=seq(from=min(x), to=max(x), by=5),
	+ cex.axis=my.cex.axis,
	+ cex.lab=my.cex.lab,
	+ label=TRUE);
	+
	+ if(1==paired.end) {
	+ for(i in 1:ncol(y)) {
	+ lines(x=x,
	+ y=y[,i],
	+ type="l",
	+ lwd=my.lwd,
	+ col=color[i]);
	+ }
	+ } else {
	+ lines(x=x,
	+ y=y,
	+ type="l",
	+ lwd=my.lwd,
	+ col=color[1]);
	+ }
	+
	+ # draw horizontal lines
	+ for(i in 0:max(y)) {
	+ abline(h=i, col="grey");
	+ }
	+
	+ # draw a legend
	+ if(1==paired.end) {
	+ legend(x="bottomright",
	+ legend=legend[1:ncol(y)],
	+ col=color[1:ncol(y)],
	+ lty = 1:ncol(y),
	+ pch = 1:ncol(y),
	+ bg='gray');
	+ } else {
	+ legend(x="bottomright",
	+ legend=legend[1],
	+ col=color[1],
	+ lty = 1,
	+ pch = 1,
	+ lwd=my.lwd,
	+ bg='gray');
	+ }
	+ }
	>
	> plot.err.dist.across.reads <- function(x,
	+ y,
	+ main,
	+ xlab,
	+ ylab,
	+ paired.end,
	+ expid)
	+ {
	+ color=c("green", "blue", "red", "purple");
	+ legend=c("first end",
	+ "second end",
	+ "first end (cumulative)",
	+ "second end (cumulative)");
	+
	+ # plot
	+ plot(x=range(x),
	+ y=c(0,100),
	+ main=paste(expid, "\n", main, sep=""),
	+ xlab=xlab,
	+ ylab=ylab,
	+ cex.main=my.cex.main,
	+ cex.axis=my.cex.axis,
	+ cex.lab=my.cex.lab,
	+ type="n"
	+ );
	+
	+ if(1==paired.end) {
	+ for(i in 1:ncol(y)) {
	+ # individual
	+ lines(x=x,
	+ y=y[,i],
	+ col=color[i],
	+ lwd=my.lwd,
	+ type="l");
	+ # cumulative
	+ y.temp = rep(0, nrow(y));
	+ for(j in 1:nrow(y)) {
	+ y.temp[j] = sum(as.numeric(y[1:j,i]));
	+ }
	+
	+ lines(x=x,
	+ y=y.temp,
	+ col=color[i+ncol(y)],
	+ lwd=my.lwd,
	+ type="l");
	+ }
	+ } else {
	+ # individual
	+ lines(x=x,
	+ y=y,
	+ col=color[1],
	+ lwd=my.lwd,
	+ type="l");
	+ # cumulative
	+ y.temp = rep(0, length(y));
	+ for(j in 1:length(y)) {
	+ y.temp[j] = sum(as.numeric(y[1:j]));
	+ }
	+
	+ lines(x=x,
	+ y=y.temp,
	+ col=color[3],
	+ lwd=my.lwd,
	+ type="l");
	+ }
	+
	+ # draw horizontal lines
	+ for(i in seq(from=0,to=100,by=5)) {
	+ abline(h=i, col="grey");
	+ }
	+
	+ # draw vertical lines
	+ for(i in 0:max(x)) {
	+ abline(v=i, col="grey");
	+ }
	+
	+ # draw a legend
	+ if(1==paired.end) {
	+ legend(x=2*max(x)/3,
	+ y=60,
	+ legend=legend[1:(2*ncol(y))],
	+ col=color[1:(2*ncol(y))],
	+ lty = 1:(2*ncol(y)),
	+ pch = 1:(2*ncol(y)),
	+ lwd=my.lwd,
	+ bg='gray');
	+ } else {
	+ legend(x=2*max(x)/3,
	+ y=60,
	+ legend=legend[c(1,3)],
	+ col=color[c(1,3)],
	+ lty = 1:2,
	+ pch = 1:2,
	+ lwd=my.lwd,
	+ bg='gray');
	+ }
	+ }
	>
	> plot.err.dist <- function(out.file,
	+ err.bases.by,
	+ err.bases.across,
	+ err.colors.by,
	+ err.colors.across,
	+ expid,
	+ space=0)
	+ {
	+ stopifnot((space == 0) \|\| (space == 1));
	+
	+ graphics.off();
	+
	+ if(space == 1) {
	+ # 2 rows, 2 columns
	+ pdf(out.file, width=16, height=16);
	+ par(mfrow=c(2,2),
	+ mar=c(5,4,6,2));
	+ } else {
	+ # 1 row, 2 columns
	+ pdf(out.file, width=16, height=8);
	+ par(mfrow=c(1,2),
	+ mar=c(5,4,6,2));
	+ }
	+
	+
	+ my.cols = 2;
	+ paired.end = 0;
	+ if(ncol(err.bases.by) == 5) {
	+ if(0 < sum(as.numeric(err.bases.by[,2]))) {
	+ # paired end TODO: support for >2 ends
	+ my.cols = c(2, 4);
	+ paired.end=1;
	+ } else {
	+ stopifnot(0 < sum(as.numeric(err.bases.by[,4])));
	+ my.cols = 4;
	+ }
	+ }
	+
	+ err.bases.by.x = err.bases.by[,1]+1;
	+ if(0==paired.end) {
	+ err.bases.by.y = 100*err.bases.by[,my.cols]/err.bases.by[,my.cols+1];
	+ } else {
	+ err.bases.by.y = err.bases.by[,my.cols];
	+ err.bases.by.y[,1] = 100*err.bases.by[,2]/err.bases.by[,3];
	+ err.bases.by.y[,2] = rev(100*err.bases.by[,4]/err.bases.by[,5]);
	+ }
	+
	+ err.bases.across.x = err.bases.across[,1];
	+ if(0==paired.end) {
	+ err.bases.across.y = 100*err.bases.across[,my.cols]/sum(as.numeric(err.bases.across[,my.cols]));
	+ } else {
	+ err.bases.across.y = err.bases.across[,my.cols];
	+ err.bases.across.y[,1] = 100*err.bases.across[,2]/sum(as.numeric(err.bases.across[,2]));
	+ err.bases.across.y[,2] = 100*err.bases.across[,4]/sum(as.numeric(err.bases.across[,4]));
	+ }
	+
	+ plot.err.dist.by.position(err.bases.by.x,
	+ err.bases.by.y,
	+ main=expid,
	+ xlab="base position in the read",
	+ ylab="nt mismatch %",
	+ paired.end,
	+ space,
	+ expid);
	+
	+ plot.err.dist.across.reads(err.bases.across.x,
	+ err.bases.across.y,
	+ main=expid,
	+ xlab="number of nt mismatchs in a read",
	+ ylab="% of reads",
	+ paired.end,
	+ expid);
	+
	+ if(space == 1) {
	+ err.colors.by.x = err.colors.by[,1]+1;
	+ if(0==paired.end) {
	+ err.colors.by.y = 100*err.colors.by[,my.cols]/err.colors.by[,my.cols+1];
	+ } else {
	+ err.colors.by.y = err.colors.by[,my.cols];
	+ err.colors.by.y[,1] = 100*err.colors.by[,2]/err.colors.by[,3];
	+ err.colors.by.y[,2] = 100*err.colors.by[,4]/err.colors.by[,5];
	+ }
	+
	+ err.colors.across.x = err.colors.across[,1];
	+ if(0==paired.end) {
	+ err.colors.across.y = 100*err.colors.across[,my.cols]/sum(as.numeric(err.colors.across[,my.cols]));
	+ } else {
	+ err.colors.across.y = err.colors.across[,my.cols];
	+ err.colors.across.y[,1] = 100*err.colors.across[,2]/sum(as.numeric(err.colors.across[,2]));
	+ err.colors.across.y[,2] = 100*err.colors.across[,4]/sum(as.numeric(err.colors.across[,4]));
	+ }
	+
	+ plot.err.dist.by.position(err.colors.by.x,
	+ err.colors.by.y,
	+ main="Color error rate at a specific position in the base",
	+ xlab="base position in the read",
	+ ylab="color error %",
	+ paired.end,
	+ space,
	+ expid);
	+
	+ plot.err.dist.across.reads(err.colors.across.x,
	+ err.colors.across.y,
	+ main="% of reads with a given number of color errors in a read",
	+ xlab="number of color errors in a read",
	+ ylab="% of reads",
	+ paired.end,
	+ expid);
	+ }
	+
	+ # End plotting
	+ graphics.off();
	+ }
	>
	>
	> plot.insert.dist <- function(out.file, insert.dist, expid,
	+ freq=TRUE)
	+ {
	+
	+ x=insert.dist;
	+ if(0 == sum(as.numeric(x[,3]))) {
	+ # Single end
	+ return (0);
	+ }
	+
	+ if("pdf" == out.type) {
	+ pdf(file=out.file, width=12, height=6);
	+ } else {
	+ stopifnot(1==0); # TODO
	+ }
	+
	+ from=min(x[,1]);
	+ by=abs(x[1,1]-x[1,2])+1;
	+ to=max(x[,1]);
	+
	+ breaks=seq(from=from, to=to, by=by);
	+ main=paste(expid, " Paired end insert distribution", sep="");
	+ xlab="Paired end insert distance (bp)";
	+ if(freq) {
	+ ylab="percentage of paired end reads";
	+ } else {
	+ ylab="# of paired end reads";
	+ }
	+
	+ # assumption:
	+ stopifnot(x[,1] == x[,2]);
	+
	+ y = c(x[x[,1]==0,3], (x[0<x[,1],3] + rev(x[x[,1]<0,3])));
	+ x = seq(from=0, by=1, to=max(x[,1]));
	+
	+ if(freq) {
	+ y = y/sum(as.numeric(y));
	+ }
	+
	+ bins = seq(from=from, to=to, by=by);
	+ bins[] = 0;
	+ i=from;
	+ j=1;
	+ while(i<=to) {
	+ bins[j] = sum(as.numeric(y[i <= x & x <= min(to, i+by-1)]));
	+ i = i+by;
	+ j = j+1;
	+ }
	+ x=seq(from=from, to=to, by=by);
	+ y=bins;
	+
	+ plot(x=x,
	+ y=y,
	+ main=main,
	+ xlab=xlab,
	+ ylab=ylab,
	+ type="h");
	+ graphics.off();
	+
	+
	+ }
	>
	> plot.start.site.dist <- function(out.file, ss.dist, expid,
	+ max.start.sites=10, genome.length=3.1*10^9, my.rand.trials=100000)
	+ {
	+ # open output file
	+ if("pdf" == out.type) {
	+ pdf(file=out.file, width=12, height=12);
	+ } else {
	+ stopifnot(1==0); # TODO
	+ }
	+
	+ # number of plots
	+ par(xpd=TRUE,
	+ mar=c(5,4,4,8));
	+
	+ total.reads = sum(as.numeric(ss.dist[,2]));
	+ mean = sum(as.numeric(ss.dist[,2]*ss.dist[,1]))/total.reads;
	+
	+ barplot(height=ss.dist[2:(max.start.sites+1),2]/total.reads,
	+ names=1:max.start.sites,
	+ main=paste(expid, " Start Site Distribution", sep=""),
	+ xlab="Number of reads at a given start site",
	+ ylab="Fraction of mapped reads",
	+ ylim=c(0,1),
	+ axes=TRUE);
	+
	+ # plot theoretical
	+ mu = total.reads/genome.length;
	+ my.rand = rpois(my.rand.trials, mu) + 1;
	+ h=hist(my.rand, plot=FALSE, breaks=-1:max(my.rand) + 0.5);
	+ my.counts = h$counts[2:length(h$counts)];
	+ if(length(my.counts) < max.start.sites) {
	+ my.counts = append(my.counts, rep(0, max.start.sites-length(my.counts)));
	+ }
	+ points(1:length(my.counts),
	+ my.counts/sum(as.numeric(my.counts)),
	+ pch=4,
	+ cex=4,
	+ col="green");
	+ lines(1:length(my.counts),
	+ my.counts/sum(as.numeric(my.counts)),
	+ lty="dashed",
	+ lwd=4,
	+ col="green");
	+
	+ # close output file
	+ graphics.off();
	+ }
	>
	> plot.mapq.as.dists <- function(out.file, mapq.dist, as.dist, expid)
	+ {
	+ # open output file
	+ if("pdf" == out.type) {
	+ pdf(file=out.file, width=12, height=12);
	+ } else {
	+ stopifnot(1==0); # TODO
	+ }
	+
	+ # number of plots
	+ par(mfrow=c(2,1),
	+ xpd=TRUE,
	+ mar=c(5,4,4,8));
	+
	+ # HERE
	+ print(mapq.dist);
	+
	+ # mapping quality distribution
	+ plot(x=range(mapq.dist[,1]),
	+ y=c(0, max(log10(sum(as.numeric(mapq.dist[,2]))))),
	+ xlab="Mapping quality",
	+ ylab="Number of reads (log10)",
	+ main=paste(expid, " Mapping Quality Distribution", sep=""),
	+ type="n");
	+ points(x=mapq.dist[,1],
	+ y=log10(mapq.dist[,2]),
	+ pch=4,
	+ col=my.colors[1]);
	+ y.cuml=mapq.dist[,2];
	+ for(i in (length(y.cuml)-1):1) {
	+ y.cuml[i] = y.cuml[i+1] + y.cuml[i];
	+ }
	+ points(x=mapq.dist[,1],
	+ y=log10(y.cuml),
	+ pch=4,
	+ col=my.colors[2]);
	+ legend(x=par("usr")[2] + 1,
	+ y=max(log10(mapq.dist[,2])),
	+ title="distribution",
	+ legend=c("point", "cumulative"),
	+ lwd=1,
	+ col=my.colors[1:2]);
	+
	+ # alignment score distribution
	+ plot(x=range(as.dist[,1]),
	+ y=c(0, max(log10(sum(as.numeric(as.dist[,2]))))),
	+ xlab="Alignment score",
	+ ylab="Number of reads (log10)",
	+ main=paste(expid, " Alignment Score Distribution", sep=""),
	+ type="n");
	+ points(x=as.dist[,1],
	+ y=log10(as.dist[,2]),
	+ pch=4,
	+ col=my.colors[1]);
	+ y.cuml=as.dist[,2];
	+ for(i in (length(y.cuml)-1):1) {
	+ y.cuml[i] = y.cuml[i+1] + y.cuml[i];
	+ }
	+ points(x=as.dist[,1],
	+ y=log10(y.cuml),
	+ pch=4,
	+ col=my.colors[2]);
	+ legend(x=par("usr")[2] + 1,
	+ y=max(log10(as.dist[,2])),
	+ title="distribution",
	+ legend=c("point", "cumulative"),
	+ lwd=1,
	+ col=my.colors[1:2]);
	+
	+ # close output file
	+ graphics.off();
	+ }
	>
	> plot.dqc.postalignqc <- function(in.file, out.type="pdf", space=0, expid="expid")
	+ {
	+ ctr.line=0;
	+ # read in the number of reads
	+ data = scan(file=in.file, what=integer(0), nmax=1, quiet=TRUE);
	+ err.n_reads = data[1];
	+ ctr.line=ctr.line+1;
	+
	+ # read in read length
	+ data = scan(file=in.file, what=integer(0), skip=ctr.line, nmax=1, quiet=TRUE);
	+ tmp.rl = data[1];
	+ ctr.line=ctr.line+1;
	+ # read in counts (base errors)
	+ err.bases.by = read.table(file=in.file, skip=ctr.line, nrows=tmp.rl);
	+ ctr.line=ctr.line+tmp.rl;
	+
	+ if(1 == space) {
	+ # read in read length
	+ data = scan(file=in.file, what=integer(0), skip=ctr.line, nmax=1, quiet=TRUE);
	+ tmp.rl = data[1];
	+ ctr.line=ctr.line+1;
	+ # read in counts (color errors)
	+ err.colors.by = read.table(file=in.file, skip=ctr.line, nrows=tmp.rl);
	+ ctr.line=ctr.line+tmp.rl;
	+ }
	+
	+ # read in read length
	+ # note: we want to skip over the next block (gaps as errors)
	+ data = scan(file=in.file, what=integer(0), skip=ctr.line, nmax=1, quiet=TRUE);
	+ tmp.rl = data[1];
	+ ctr.line=ctr.line+1+tmp.rl;
	+
	+ # read in max base errors across reads
	+ data = scan(file=in.file, what=integer(0), skip=ctr.line, nmax=1, quiet=TRUE);
	+ tmp.be = data[1];
	+ ctr.line=ctr.line+1;
	+ # read in counts (base errors across reads)
	+ err.bases.across = read.table(file=in.file, skip=ctr.line, nrows=tmp.be);
	+ ctr.line=ctr.line+tmp.be;
	+
	+ if(1 == space) {
	+ # read in max color errors across reads
	+ data = scan(file=in.file, what=integer(0), skip=ctr.line, nmax=1, quiet=TRUE);
	+ tmp.ce = data[1];
	+ ctr.line=ctr.line+1;
	+ # read in counts (color errors across reads)
	+ err.colors.across = read.table(file=in.file, skip=ctr.line, nrows=tmp.ce);
	+ ctr.line=ctr.line+tmp.ce;
	+ }
	+
	+ # read in read length
	+ # note: we want to skip over the next block (gaps as errors)
	+ data = scan(file=in.file, what=integer(0), skip=ctr.line, nmax=1, quiet=TRUE);
	+ tmp.ge = data[1];
	+ ctr.line=ctr.line+1+tmp.ge;
	+
	+ # read in the number of bins
	+ data = scan(file=in.file, what=integer(0), skip=ctr.line, nmax=1, quiet=TRUE);
	+ n.bins = data[1];
	+ ctr.line=ctr.line+1;
	+ # read in the bins
	+ insert.dist = read.table(file=in.file, skip=ctr.line, nrows=n.bins);
	+ ctr.line=ctr.line+n.bins;
	+
	+ # read in the number of raw reads (pe, se)
	+ data = read.table(file=in.file, skip=ctr.line, nrows=1);
	+ n.raw.pe = data[1,1];
	+ n.raw.se = data[1,2];
	+ ctr.line=ctr.line+1;
	+ # read in the number of mapped (pe, up, se)
	+ data = read.table(file=in.file, skip=ctr.line, nrows=1);
	+ n.mapped.pe = data[1,1];
	+ n.mapped.up = data[1,2];
	+ n.mapped.se = data[1,3];
	+ ctr.line=ctr.line+1;
	+ # read in the number of mapped pcr duplicates (pe, up, se)
	+ data = read.table(file=in.file, skip=ctr.line, nrows=1);
	+ n.mapped.pcr.pe = data[1,1];
	+ n.mapped.pcr.up = data[1,2];
	+ n.mapped.pcr.se = data[1,3];
	+ ctr.line=ctr.line+1;
	+
	+ # read in the mapping quality distribution
	+ mapq.dist = read.table(file=in.file, skip=ctr.line, nrows=256);
	+ ctr.line=ctr.line+256;
	+
	+ # read in the number of discrete alignment scores
	+ data = scan(file=in.file, what=integer(0), skip=ctr.line, nmax=3, quiet=TRUE);
	+ n.as = data[1];
	+ ctr.line=ctr.line+1;
	+ # read in the alignment score distribution
	+ as.dist = read.table(file=in.file, skip=ctr.line, nrows=n.as);
	+ ctr.line=ctr.line+n.as;
	+
	+ # read in the number of start sites
	+ data = scan(file=in.file, what=integer(0), skip=ctr.line, nmax=1, quiet=TRUE);
	+ n.ss = data[1];
	+ ctr.line=ctr.line+1;
	+ # read in the start site distribution
	+ ss.dist = read.table(file=in.file, skip=ctr.line, nrows=n.ss);
	+ ctr.line=ctr.line+n.ss;
	+
	+ plot.mapq.as.dists(paste(in.file, ".mapq.as.dists.pdf", sep=""),
	+ mapq.dist, as.dist, expid);
	+ plot.start.site.dist(paste(in.file, ".start.site.dist.pdf", sep=""), ss.dist, expid);
	+ plot.insert.dist(paste(in.file, ".insert.dist.pdf", sep=""), insert.dist, expid);
	+ plot.err.dist(paste(in.file, ".err.dist.pdf", sep=""),
	+ err.bases.by, err.bases.across, err.colors.by, err.colors.across, expid, space)
	+ }
	>
	> args=(commandArgs(TRUE));
	> if(length(args)==0) {
	+ print("No arguments supplied.")
	+ } else {
	+ out.type="pdf";
	+ space=0;
	+
	+ for(i in 1:length(args)){
	+ eval(parse(text=args[[i]]));
	+ }
	+
	+ plot.dqc.postalignqc(in.file, out.type, space, expid);
	+ }
	Error in read.table(file = in.file, skip = ctr.line, nrows = n.bins) :
	invalid 'nlines' argument
	Calls: plot.dqc.postalignqc -> read.table
	Execution halted