FastQC is a program to perform some basic quality checks on fastq files. It makes nice HTML reports for a given file, but (as far as I can tell) doesn't provide a straightforward way to compare the results across files (which might represent different library preps, sequencing lanes or samples).
Here is the (really pretty hacky) solution to aggregating these stats that I came up with. This all assumes that you have a directory in which the report for each fastq file sits in its own subdirectory, with paths like ./library_name.L001.R1.fastqc/fastqc_data.txt. We will then use regular expressions to match just those parts of the file we care about.
import os
import re

# Percent of sequences remaining after deduplication. FastQC writes values
# like "9.75", "91.52" or "100.0", so accept any number of digits on either
# side of an optional decimal point (the original \d\d\.\d pattern missed
# percentages below 10, exactly 100, and extra decimal places).
re_dup = re.compile(r'Total Deduplicated Percentage\t(\d+(?:\.\d+)?)')
# Total number of sequences in the fastq file.
re_nseq = re.compile(r'Total Sequences\t(\d+)')
# Adapter check and kmer overrepresentation flags
# (pass/warn/fail, in order of severity).
re_adapt = re.compile(r'Adapter Content\t(\w+)')
re_kmer = re.compile(r'Kmer Content\t(\w+)')
# Proportion of G, A, T, C bases at the end of the sequence (positions
# 145-149). Really, what kind of order is that?
re_gc_end = re.compile(r'\n145-149\s([0-9.\s]+)\n')
Usually the filename contains the library/sample etc. information. For the Tt data there was a lot of matching to do:
# Library information encoded in a report directory name such as
# "Anc-12-A_S3_L001_R1_fastqc". Raw string avoids invalid-escape warnings.
re_fcontent = re.compile(
    r'Anc-(?P<sample>\d+)-(?P<rep>[AB])_(?P<GE>S\d+)'
    r'_(?P<lane>L\d{3})_(?P<direction>R\d)_fastqc'
)
Finally, here are some functions to read the information in a given text report and extract the interesting bits (this could be cleaned up a lot, but it works):
def gc_skew(content):
    """Return G/(G+C) at read positions 145-149, rounded to 2 dp, as a string.

    Column order in the FastQC table is G, A, T, C, so fields 0 and 3 are
    the G and C proportions.
    """
    # NOTE(review): this takes the *second* "145-149" row found in the
    # report — confirm which FastQC module that corresponds to.
    fields = re_gc_end.findall(content)[1].split()
    g, c = float(fields[0]), float(fields[3])
    return str(round(g / (g + c), 2))
def _get_first_hit(re_match):
""" """
return(re_match.groups()[0])
def get_lib_info(fname):
    """Split a report directory name into its library fields.

    Returns the (sample, rep, GE, lane, direction) strings captured by
    re_fcontent.
    """
    hit = re_fcontent.match(fname)
    return hit.groups()
def process_file(fname):
    """Gather the QC summary fields for one FastQC report directory.

    Returns a list of strings: sample, rep, GE, lane, direction, total
    sequences, deduplicated percentage, end-of-read GC skew, kmer flag and
    adapter flag.
    """
    sample, rep, ge, lane, direction = get_lib_info(fname)
    with open(fname + "/fastqc_data.txt", "r") as report:
        text = report.read()
    return [
        sample,
        rep,
        ge,
        lane,
        direction,
        _get_first_hit(re_nseq.search(text)),
        _get_first_hit(re_dup.search(text)),
        gc_skew(text),
        _get_first_hit(re_kmer.search(text)),
        _get_first_hit(re_adapt.search(text)),
    ]
And here's the whole thing as a script (note that it looks for files starting with Anc-, so it will need to be edited to work on other data).
import os
import re

# Percent of sequences remaining after deduplication; accept any number of
# digits on either side of an optional decimal point (the original \d\d\.\d
# pattern missed percentages below 10, exactly 100, and extra decimals).
re_dup = re.compile(r'Total Deduplicated Percentage\t(\d+(?:\.\d+)?)')
# Total number of sequences in the fastq file.
re_nseq = re.compile(r'Total Sequences\t(\d+)')
# Module status flags (pass/warn/fail).
re_adapt = re.compile(r'Adapter Content\t(\w+)')
re_kmer = re.compile(r'Kmer Content\t(\w+)')
# G, A, T, C proportions at read positions 145-149.
re_gc_end = re.compile(r'\n145-149\s([0-9.\s]+)\n')
# Library information encoded in a report directory name such as
# "Anc-12-A_S3_L001_R1_fastqc".
re_fcontent = re.compile(
    r'Anc-(?P<sample>\d+)-(?P<rep>[AB])_(?P<GE>S\d+)'
    r'_(?P<lane>L\d{3})_(?P<direction>R\d)_fastqc'
)
def gc_skew(content):
    """String form of G/(G+C) (rounded to 2 dp) at read positions 145-149.

    FastQC's column order is G, A, T, C, so indices 0 and 3 hold the G and
    C proportions.
    """
    # NOTE(review): uses the second "145-149" table found in the report —
    # confirm which module this corresponds to.
    values = [float(v) for v in re_gc_end.findall(content)[1].split()]
    skew = values[0] / (values[0] + values[3])
    return str(round(skew, 2))
def _get_first_hit(re_match):
""" """
return(re_match.groups()[0])
def get_lib_info(fname):
    """Extract (sample, rep, GE, lane, direction) from a report dir name."""
    match = re_fcontent.match(fname)
    return match.groups()
def process_file(fname):
    """Return the QC summary row (list of strings) for one report directory.

    Fields: sample, rep, GE, lane, direction, total sequences, deduplicated
    percentage, end-of-read GC skew, kmer flag, adapter flag.
    """
    lib_fields = list(get_lib_info(fname))
    with open(fname + "/fastqc_data.txt", "r") as handle:
        report_text = handle.read()
    qc_fields = [
        _get_first_hit(re_nseq.search(report_text)),
        _get_first_hit(re_dup.search(report_text)),
        gc_skew(report_text),
        _get_first_hit(re_kmer.search(report_text)),
        _get_first_hit(re_adapt.search(report_text)),
    ]
    return lib_fields + qc_fields
def main():
    """Summarise every FastQC report in the working directory.

    Writes one tab-separated row per report directory to qc_summary.tsv and
    returns the number of rows written. Only directories whose names start
    with "Anc" are processed.
    """
    report_dirs = [d for d in os.listdir() if d.startswith("Anc")]
    rows_written = 0
    with open("qc_summary.tsv", "w") as out:
        for report_dir in report_dirs:
            print(report_dir)  # progress indicator
            out.write("\t".join(process_file(report_dir)) + "\n")
            rows_written += 1
    return rows_written


if __name__ == "__main__":
    main()