Created
October 25, 2012 16:50
-
-
Save chapmanb/3953983 to your computer and use it in GitHub Desktop.
FastQC summarizer
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
''' | |
FastQC checker for Galaxy biomedical data analysis platform | |
@author: Ilya Sytchev | |
Input: one or more files in fastq format | |
Output: sequencing quality report in text format | |
Requires FastQC 0.10.0 (http://www.bioinformatics.bbsrc.ac.uk/projects/fastqc/) | |
Partially based on: | |
fastqcwrapper (http://toolshed.g2.bx.psu.edu/repos/jjohnson/fastqc) | |
rgFastQC (https://bitbucket.org/galaxy/galaxy-dist/src/tip/tools/rgenetics/rgFastQC.py) | |
Tested with Python 2.6.1 and 2.7.2 on Mac OS 10.6.8 | |
''' | |
import sys, os, optparse, tempfile, shutil, subprocess | |
def stop_err(msg, returncode=1): | |
sys.stderr.write(msg) | |
sys.exit(returncode) | |
def __main__(): | |
usage = "Usage: %prog -e fastqc_executable -o output_file fastq_file [fastq_file ... ]" | |
version = "%prog 1.0.0" | |
op = optparse.OptionParser(usage=usage, version=version) | |
op.add_option('-e', '--executable', dest="executable", help="location of the FastQC program") | |
op.add_option('-o', '--output', dest="outfile", help="location of the output file") | |
(options, infiles) = op.parse_args() | |
# check if location of the FastQC program was provided | |
if options.executable == None: | |
op.error("Missing location of FastQC") | |
# check if FastQC program exists at the provided location | |
if not os.path.isfile(options.executable): | |
op.error("Cannot find FastQC at %s" % options.executable) | |
# check if any input files were provided | |
if infiles == None: | |
op.error("Missing input files") | |
# check if all input files exist | |
for f in infiles: | |
if not os.path.isfile(f): | |
op.error("Cannot find input file %s" % f) | |
# check if output file was provided | |
if options.outfile == None: | |
op.error("Missing output file name") | |
# assemble FastQC command line | |
cmd = [] # list is more secure than string for subprocess call | |
cmd.append(options.executable) | |
tmpdir = tempfile.mkdtemp() # create temp dir for FastQC output | |
cmd.extend(['-o', tmpdir]) | |
cmd.extend(infiles) | |
# prepare files for FastQC stdout and stderr | |
tmp_stderr_name = tempfile.NamedTemporaryFile(dir=tmpdir, suffix='.err').name | |
tmp_stderr = open(tmp_stderr_name, 'w') | |
tmp_stdout_name = tempfile.NamedTemporaryFile(dir=tmpdir, suffix='.out').name | |
tmp_stdout = open(tmp_stdout_name, 'w') | |
# run FastQC | |
try: | |
subprocess.check_call(cmd, stderr=tmp_stderr.fileno(), stdout=tmp_stdout.fileno()) | |
except subprocess.CalledProcessError as e: | |
stop_err("Error executing FastQC\n", e.returncode) | |
finally: | |
tmp_stderr.close() | |
tmp_stdout.close() | |
outfile = open(options.outfile, 'w') | |
# parse all summary.txt files produced by FastQC and write results into the output file | |
for f in infiles: | |
filename = os.path.basename(f) | |
(datasetname, extension) = os.path.splitext(filename) | |
# Need to account for FastQC removing .fastq extension from input file names before using them to create output file and dir names | |
# Alternative solution is to iterate over report directories instead of input file names | |
if extension == '.fastq': | |
summaryfilename = os.path.join(tmpdir, datasetname + '_fastqc', 'summary.txt') | |
else: | |
summaryfilename = os.path.join(tmpdir, filename + '_fastqc', 'summary.txt') | |
outfile.write("%s results:\n" % datasetname) | |
# if summary file exists, process and add results to the output file | |
if os.path.isfile(summaryfilename): | |
summaryfile = open(summaryfilename, 'r') | |
for line in summaryfile: | |
(result, test) = line.split('\t')[:2] | |
outfile.write(result + '\t' + test + '\n') | |
summaryfile.close() | |
else: | |
outfile.write("FastQC summary report was not found at %s.\n" % summaryfilename) | |
outfile.write("\n") | |
outfile.close() | |
# clean up temp dir, put in a try block so we don't fail on stale nfs handles | |
try: | |
if os.path.exists(tmpdir): | |
shutil.rmtree(tmpdir) | |
except: | |
pass | |
if __name__ == '__main__': __main__() |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<tool id="fastqc_checker_1" name="FastQC checker" version="1.0.0"> | |
<description>for quality control of high throughput sequence data</description> | |
<command interpreter="python"> | |
fastqc_checker.py -e ${GALAXY_DATA_INDEX_DIR}/shared/jars/FastQC/fastqc -o $output $input | |
</command> | |
<inputs> | |
<param format="fastq" name="input" type="data" label="Source files" multiple="true"/> | |
</inputs> | |
<outputs> | |
<data format="txt" name="output"/> | |
</outputs> | |
<help> | |
.. class:: infomark | |
**Purpose** | |
FastQC aims to provide a simple way to do some quality control checks on raw | |
sequence data coming from high throughput sequencing pipelines. | |
It provides a modular set of analyses which you can use to give a quick | |
impression of whether your data has any problems of | |
which you should be aware before doing any further analysis. | |
**FastQC documentation** | |
This is a Galaxy interface to the external package FastQC_. | |
Specific documentation on FastQC can be found on that site. | |
.. _FastQC: http://www.bioinformatics.bbsrc.ac.uk/projects/fastqc/ | |
----- | |
.. class:: infomark | |
**Inputs and outputs** | |
This wrapper will accept one or more fastq files. | |
The tool produces a single output file that contains summary of all the results, including the following: | |
- Basic Statistics | |
- Per base sequence quality | |
- Per sequence quality scores | |
- Per base sequence content | |
- Per base GC content | |
- Per sequence GC content | |
- Per base N content | |
- Sequence Length Distribution | |
- Sequence Duplication Levels | |
- Overrepresented sequences | |
- Kmer Content | |
</help> | |
</tool> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment