Skip to content

Instantly share code, notes, and snippets.

@padpadpadpad
Last active May 12, 2020 13:25
Show Gist options
  • Save padpadpadpad/2afa99fe3b58c4f6916ed2edf145b48c to your computer and use it in GitHub Desktop.
Save padpadpadpad/2afa99fe3b58c4f6916ed2edf145b48c to your computer and use it in GitHub Desktop.
Collate samtools flagstats into R
#-----------------------------------------#
# analysis of sequencing mapping stats ####
#-----------------------------------------#
# load packages
library(readr)
library(dplyr)
library(purrr)
library(tidyr)
library(stringr)
# write function to process .txt file from samtools flagstats
tidy_samtools_flagstats <- function(file){
step = c('total_reads', 'secondary', 'supplementary', 'duplicates', 'mapped', 'paired_in_sequencing', 'nreads_read1', 'nreads_read2', 'properly_paired', 'with_itself_and_mapped', 'singletons', 'with_mate_mapped_diff_chr', 'with_mate_mapped_to_a_different_chr_mapQ5)')
temp <- readLines(file)
temp <- strsplit(temp, split = ' ')
output <- tibble::tibble(qc_passed_reads = as.numeric(unlist(lapply(temp, '[[', 1))),
qc_failed_reads = as.numeric(unlist(lapply(temp, '[[', 3))),
step = step)
tot_reads <- output$qc_passed_reads[1]
output$prop <- output$qc_passed_reads / tot_reads
return(output)
}
# list all files with flagstats
list_files <- list.files('where/your/flagstats/output/lives', pattern = '.txt', full.names = TRUE)
# create list of files and file size in case of empty files
file_list = tibble(file = list_files,
size = file.size(list_files),
n = 1:length(list_files))
# create dataframe of all flagstats for all files
file_stats <- filter(file_list, size > 0) %>%
nest(., -n) %>%
mutate(., stats = map(data, ~tidy_samtools_flagstats(.x$file))) %>%
unnest(., data) %>%
unnest(., stats) %>%
file = basename(tools::file_path_sans_ext(file))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment