Last active
December 28, 2015 11:09
-
-
Save ericminikel/7490946 to your computer and use it in GitHub Desktop.
A bash + R script to plot the file size of BAMs vs. that of corresponding FASTQs to identify outliers where a BAM appears to have gotten truncated.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
cd /my/working/dir/ | |
du -sb *.bam > bam.filesize # two-column list of BAMs and their size in bytes | |
cat fastq.12col | awk '{print "du -sb "$1}' | bash > fastq.filesize # two-column list of FASTQs and their size in bytes | |
# switch to R | |
R | |
fastq = read.table('fastq.filesize',header=FALSE) | |
bam = read.table('bam.filesize',header=FALSE) | |
bam$name = substr(bam$V2,1,21) # parse a unique ID for the BAMs from the path | |
fastq$name = substr(fastq$V2,71,91) # same for FASTQs | |
bam$bamsize=bam$V1 | |
fastq$fastqsize=fastq$V1 | |
merged = merge(fastq,bam,"name")[,c("name","bamsize","fastqsize")] | |
dim(merged) # 192 3 | |
# plot bam vs. fastq size without names | |
png('bam.vs.fastq.size.png',width=500,height=500) | |
plot(merged$bamsize,merged$fastqsize,pch=19) | |
dev.off() | |
# and with names to identify any outliers | |
png('bam.vs.fastq.size.with.labels.png',width=500,height=500) | |
plot(merged$bamsize,merged$fastqsize,pch=19) | |
text(merged$bamsize,merged$fastqsize,labels=merged$name,pos=4,cex=.8) | |
dev.off() | |
# write out results for later | |
write.table(merged,"merged.size.txt",sep="\t",col.names=TRUE,row.names=FALSE) | |
# browse a list of potentially bad BAMs | |
badbams = merged$name[merged$fastqsize/merged$bamsize > .41] # .41 is arbitrary cutoff from visual inspection of plots | |
badbams | |
q() | |
n |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment