FastQC is a program to perform some basic quality checks on fastq files. It makes nice HTML reports for a given file, but (as far as I can tell) doesn't provide a straightforward way to compare the results across files (which might represent different library preps, sequencing lanes or samples).
Here is the (really pretty hacky) solution to aggregating these stats that I came up with. This all assumes that you have a directory in which the report for each fastq file sits in its own subdirectory, with paths like ./library_name.L001.R1.fastqc/fastqc_data.txt. We will then use regular expressions to match just those parts of the file we care about.
import os
import re

# Percent of sequences remaining after deduplication. FastQC writes values
# like "9.75", "91.52" or "100.0", so accept any number of digits on either
# side of an optional decimal point (the original \d\d\.\d pattern missed
# percentages below 10, exactly 100, and extra decimal places).
re_dup = re.compile(r'Total Deduplicated Percentage\t(\d+(?:\.\d+)?)')
# Total number of sequences in the fastq file.
re_nseq = re.compile(r'Total Sequences\t(\d+)')
# Adapter check and kmer overrepresentation flags
# (pass/warn/fail, in order of severity).
re_adapt = re.compile(r'Adapter Content\t(\w+)')
re_kmer = re.compile(r'Kmer Content\t(\w+)')
# Proportion of G, A, T, C bases at the end of the sequence (positions
# 145-149). Really, what kind of order is that?
re_gc_end = re.compile(r'\n145-149\s([0-9.\s]+)\n')
Usually the filename contains the library/sample etc. information. For the Tt data there was a lot of matching to do:
# Library information encoded in a report directory name such as
# "Anc-12-A_S3_L001_R1_fastqc". Raw string avoids invalid-escape warnings.
re_fcontent = re.compile(
    r'Anc-(?P<sample>\d+)-(?P<rep>[AB])_(?P<GE>S\d+)'
    r'_(?P<lane>L\d{3})_(?P<direction>R\d)_fastqc'
)
Finally, here are some functions to read the information in a given text report and extract the interesting bits (this could be cleaned up a lot, but it works):
def gc_skew(content):
    """Return G/(G+C) at read positions 145-149, rounded to 2 dp, as a string.

    Column order in the FastQC table is G, A, T, C, so fields 0 and 3 are
    the G and C proportions.
    """
    # NOTE(review): this takes the *second* "145-149" row found in the
    # report — confirm which FastQC module that corresponds to.
    fields = re_gc_end.findall(content)[1].split()
    g, c = float(fields[0]), float(fields[3])
    return str(round(g / (g + c), 2))
def _get_first_hit(re_match):
""" """
return(re_match.groups()[0])
def get_lib_info(fname):
    """Split a report directory name into its library fields.

    Returns the (sample, rep, GE, lane, direction) strings captured by
    re_fcontent.
    """
    hit = re_fcontent.match(fname)
    return hit.groups()
def process_file(fname):
    """Gather the QC summary fields for one FastQC report directory.

    Returns a list of strings: sample, rep, GE, lane, direction, total
    sequences, deduplicated percentage, end-of-read GC skew, kmer flag and
    adapter flag.
    """
    sample, rep, ge, lane, direction = get_lib_info(fname)
    with open(fname + "/fastqc_data.txt", "r") as report:
        text = report.read()
    return [
        sample,
        rep,
        ge,
        lane,
        direction,
        _get_first_hit(re_nseq.search(text)),
        _get_first_hit(re_dup.search(text)),
        gc_skew(text),
        _get_first_hit(re_kmer.search(text)),
        _get_first_hit(re_adapt.search(text)),
    ]
And here's the whole thing as a script (note that it looks for files starting with Anc-, so it will need to be edited to work on other data).
import os
import re

# Percent of sequences remaining after deduplication; accept any number of
# digits on either side of an optional decimal point (the original \d\d\.\d
# pattern missed percentages below 10, exactly 100, and extra decimals).
re_dup = re.compile(r'Total Deduplicated Percentage\t(\d+(?:\.\d+)?)')
# Total number of sequences in the fastq file.
re_nseq = re.compile(r'Total Sequences\t(\d+)')
# Module status flags (pass/warn/fail).
re_adapt = re.compile(r'Adapter Content\t(\w+)')
re_kmer = re.compile(r'Kmer Content\t(\w+)')
# G, A, T, C proportions at read positions 145-149.
re_gc_end = re.compile(r'\n145-149\s([0-9.\s]+)\n')
# Library information encoded in a report directory name such as
# "Anc-12-A_S3_L001_R1_fastqc".
re_fcontent = re.compile(
    r'Anc-(?P<sample>\d+)-(?P<rep>[AB])_(?P<GE>S\d+)'
    r'_(?P<lane>L\d{3})_(?P<direction>R\d)_fastqc'
)
def gc_skew(content):
    """String form of G/(G+C) (rounded to 2 dp) at read positions 145-149.

    FastQC's column order is G, A, T, C, so indices 0 and 3 hold the G and
    C proportions.
    """
    # NOTE(review): uses the second "145-149" table found in the report —
    # confirm which module this corresponds to.
    values = [float(v) for v in re_gc_end.findall(content)[1].split()]
    skew = values[0] / (values[0] + values[3])
    return str(round(skew, 2))
def _get_first_hit(re_match):
""" """
return(re_match.groups()[0])
def get_lib_info(fname):
    """Extract (sample, rep, GE, lane, direction) from a report dir name."""
    match = re_fcontent.match(fname)
    return match.groups()
def process_file(fname):
    """Return the QC summary row (list of strings) for one report directory.

    Fields: sample, rep, GE, lane, direction, total sequences, deduplicated
    percentage, end-of-read GC skew, kmer flag, adapter flag.
    """
    lib_fields = list(get_lib_info(fname))
    with open(fname + "/fastqc_data.txt", "r") as handle:
        report_text = handle.read()
    qc_fields = [
        _get_first_hit(re_nseq.search(report_text)),
        _get_first_hit(re_dup.search(report_text)),
        gc_skew(report_text),
        _get_first_hit(re_kmer.search(report_text)),
        _get_first_hit(re_adapt.search(report_text)),
    ]
    return lib_fields + qc_fields
def main():
    """Summarise every FastQC report in the working directory.

    Writes one tab-separated row per report directory to qc_summary.tsv and
    returns the number of rows written. Only directories whose names start
    with "Anc" are processed.
    """
    report_dirs = [d for d in os.listdir() if d.startswith("Anc")]
    rows_written = 0
    with open("qc_summary.tsv", "w") as out:
        for report_dir in report_dirs:
            print(report_dir)  # progress indicator
            out.write("\t".join(process_file(report_dir)) + "\n")
            rows_written += 1
    return rows_written


if __name__ == "__main__":
    main()