Created
November 14, 2017 17:16
-
-
Save dmckean/11c97c6f49e19f8dbe23b4bdf3463abd to your computer and use it in GitHub Desktop.
Parses output from `samtools stats` into a list of lists and data.frames (good for samtools/htslib 1.4-1.6 at least)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
parse_samstats <- function(file) { | |
max.col <- max(count.fields(file, sep = "\t")) | |
stats.prefixes <- read.table(file, col.names = seq(max.col), | |
colClasses=c("character", rep("NULL", max.col-1)), | |
fill=TRUE) | |
stats.nrows <- table(stats.prefixes)[unique(stats.prefixes)[,1]] | |
stopifnot(unlist( | |
mapply(function(n,x) rep(x, n), stats.nrows, names(stats.nrows)) | |
) == stats.prefixes[,1]) | |
if(inherits(file, "connection")) { | |
file.conn <- file | |
open(file.conn, open="r") | |
} else { | |
file.conn <- file(file, open="r") | |
} | |
stats.list <- lapply(stats.nrows, function(x) { | |
read.table(file.conn, nrows=x, sep="\t", stringsAsFactors = FALSE)[-1] | |
}) | |
close(file.conn) | |
within(stats.list, { | |
# CHK, Checksum [2]Read Names [3]Sequences [4]Qualities | |
# CHK, CRC32 of reads which passed filtering followed by addition (32bit overflow) | |
CHK <- as.list(setNames(CHK, c("read_names", "sequences", "qualities"))) | |
# Summary Numbers: | |
# raw total sequences | |
# filtered sequences | |
# sequences | |
# is sorted | |
# 1st fragments | |
# last fragments | |
# reads mapped | |
# reads mapped and paired | |
# reads unmapped | |
# reads properly paired | |
# reads paired | |
# reads duplicated | |
# reads MQ0 | |
# reads QC failed | |
# non-primary alignments | |
# total length | |
# bases mapped | |
# bases mapped (cigar) | |
# bases trimmed | |
# bases duplicated | |
# mismatches | |
# error rate | |
# average length | |
# maximum length | |
# average quality | |
# insert size average | |
# insert size standard deviation | |
# inward oriented pairs | |
# outward oriented pairs | |
# pairs with other orientation | |
# pairs on different chromosomes | |
SN <- as.list(setNames(SN[,2], | |
strtrim(SN[,1], nchar(SN[,1]) - 1))) | |
# First Fragment Qualitites. | |
# Columns correspond to qualities and rows to cycles. First column is the cycle number. | |
dimnames(FFQ) <- list(FFQ[,1], c("cycle", seq(ncol(FFQ)-1))) | |
# Last Fragment Qualitites. | |
# Columns correspond to qualities and rows to cycles. First column is the cycle number. | |
dimnames(LFQ) <- list(LFQ[,1], c("cycle", seq(ncol(LFQ)-1))) | |
# GC Content of first fragments. | |
names(GCF) <- c("GC", "count") | |
# GC Content of last fragments. | |
names(GCF) <- c("GC", "count") | |
# ACGT content per cycle. | |
# The columns are: cycle; | |
# A,C,G,T base counts as a percentage of all A/C/G/T bases [%]; | |
# and N and O counts as a percentage of all A/C/G/T bases [%] | |
names(GCC) <- c("cycle", "A", "C", "G", "T", "N", "O") | |
# Insert sizes. | |
# The columns are: insert size, pairs total, inward oriented pairs, | |
# outward oriented pairs, other pairs | |
names(IS) <- c("insert_size", "total_pairs", "inward_pairs", "outward_pairs", "other_pairs") | |
# Read lengths | |
# The columns are: read length, count | |
names(RL) <- c("read_length", "count") | |
# Indel distribution | |
# The columns are: length, number of insertions, number of deletions | |
names(ID) <- c("length", "insertion_count", "deletion_count") | |
# Indels per cycle | |
# The columns are: cycle, | |
# number of insertions (fwd) | |
# number of insertions (rev) | |
# number of deletions (fwd) | |
# number of deletions (rev) | |
names(IC) <- c("cycle", "ins_fwd", "ins_rev", "del_fwd", "del_rev") | |
# Coverage distribution | |
names(COV) <- c("coverage_range", "coverage", "bases") | |
# GC-depth | |
# The columns are: GC%, unique sequence percentiles, | |
# 10th, 25th, 50th, 75th and 90th depth percentile | |
names(GCD) <- c("GC", "GC_percentile_for_mapped_sequences", | |
"gc.10", "gc.26", "gc.50", "gc.75", "gc.90") | |
}) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment