chapmanb · October 25, 2012 16:50
diff --git a/fastqc_checker.py b/fastqc_checker.py
 #!/usr/bin/env python

 '''
 FastQC checker for Galaxy biomedical data analysis platform

 @author: Ilya Sytchev

 Input: one or more files in fastq format
 Output: sequencing quality report in text format

 Requires FastQC 0.10.0 (http://www.bioinformatics.bbsrc.ac.uk/projects/fastqc/)

 Partially based on:
 fastqcwrapper (http://toolshed.g2.bx.psu.edu/repos/jjohnson/fastqc)
 rgFastQC (https://bitbucket.org/galaxy/galaxy-dist/src/tip/tools/rgenetics/rgFastQC.py)

 Tested with Python 2.6.1 and 2.7.2 on Mac OS 10.6.8
 '''

 import sys, os, optparse, tempfile, shutil, subprocess

 def stop_err(msg, returncode=1):
    sys.stderr.write(msg)
    sys.exit(returncode)

 def __main__():
    usage = "Usage: %prog -e fastqc_executable -o output_file fastq_file [fastq_file ... ]"
    version = "%prog 1.0.0"
    op = optparse.OptionParser(usage=usage, version=version)
    op.add_option('-e', '--executable', dest="executable", help="location of the FastQC program")
    op.add_option('-o', '--output', dest="outfile", help="location of the output file")
    (options, infiles) = op.parse_args()

    # check if location of the FastQC program was provided
    if options.executable == None:
        op.error("Missing location of FastQC")

    # check if FastQC program exists at the provided location
    if not os.path.isfile(options.executable):
        op.error("Cannot find FastQC at %s" % options.executable)

    # check if any input files were provided
    if infiles == None:
        op.error("Missing input files")

    # check if all input files exist
    for f in infiles:
        if not os.path.isfile(f):
            op.error("Cannot find input file %s" % f)
    
    # check if output file was provided
    if options.outfile == None:
        op.error("Missing output file name")
    
    # assemble FastQC command line
    cmd = []    # list is more secure than string for subprocess call 
    cmd.append(options.executable)
    tmpdir = tempfile.mkdtemp()    # create temp dir for FastQC output
    cmd.extend(['-o', tmpdir])
    cmd.extend(infiles)

    # prepare files for FastQC stdout and stderr
    tmp_stderr_name = tempfile.NamedTemporaryFile(dir=tmpdir, suffix='.err').name
    tmp_stderr = open(tmp_stderr_name, 'w')
    tmp_stdout_name = tempfile.NamedTemporaryFile(dir=tmpdir, suffix='.out').name
    tmp_stdout = open(tmp_stdout_name, 'w')
    # run FastQC
    try:
        subprocess.check_call(cmd, stderr=tmp_stderr.fileno(), stdout=tmp_stdout.fileno())
    except subprocess.CalledProcessError as e:
        stop_err("Error executing FastQC\n", e.returncode)
    finally:
        tmp_stderr.close()
        tmp_stdout.close()

    outfile = open(options.outfile, 'w')
    
    # parse all summary.txt files produced by FastQC and write results into the output file
    for f in infiles:
        filename = os.path.basename(f)
        (datasetname, extension) = os.path.splitext(filename)
        # Need to account for FastQC removing .fastq extension from input file names before using them to create output file and dir names
        # Alternative solution is to iterate over report directories instead of input file names
        if extension == '.fastq':
            summaryfilename = os.path.join(tmpdir, datasetname + '_fastqc', 'summary.txt')
        else:
            summaryfilename = os.path.join(tmpdir, filename + '_fastqc', 'summary.txt')
        outfile.write("%s results:\n" % datasetname)
        # if summary file exists, process and add results to the output file
        if os.path.isfile(summaryfilename):
            summaryfile = open(summaryfilename, 'r')
            for line in summaryfile:
                (result, test) = line.split('\t')[:2]
                outfile.write(result + '\t' + test + '\n') 
            summaryfile.close()
        else:
            outfile.write("FastQC summary report was not found at %s.\n" % summaryfilename)
        outfile.write("\n")
    
    outfile.close()

    # clean up temp dir, put in a try block so we don't fail on stale nfs handles
    try: 
        if os.path.exists(tmpdir):
            shutil.rmtree(tmpdir)
    except:
        pass

 if __name__ == '__main__': __main__()
diff --git a/fastqc_checker.xml b/fastqc_checker.xml
 <tool id="fastqc_checker_1" name="FastQC checker" version="1.0.0">
 	<description>for quality control of high throughput sequence data</description>

 	<command interpreter="python">
 		fastqc_checker.py -e ${GALAXY_DATA_INDEX_DIR}/shared/jars/FastQC/fastqc -o $output $input
 	</command>

 	<inputs>
 		<param format="fastq" name="input" type="data" label="Source files" multiple="true"/>
 	</inputs>
 	<outputs>
 		<data format="txt" name="output"/>
 	</outputs>
 	
 	<help>

 .. class:: infomark

 **Purpose**

 FastQC aims to provide a simple way to do some quality control checks on raw
 sequence data coming from high throughput sequencing pipelines. 
 It provides a modular set of analyses which you can use to give a quick
 impression of whether your data has any problems of 
 which you should be aware before doing any further analysis.

 **FastQC documentation**

 This is a Galaxy interface to the external package FastQC_.
 Specific documentation on FastQC can be found on that site.

 .. _FastQC: http://www.bioinformatics.bbsrc.ac.uk/projects/fastqc/

 -----

 .. class:: infomark

 **Inputs and outputs**

 This wrapper will accept one or more fastq files.

 The tool produces a single output file that contains summary of all the results, including the following:

 - Basic Statistics
 - Per base sequence quality
 - Per sequence quality scores
 - Per base sequence content
 - Per base GC content
 - Per sequence GC content
 - Per base N content
 - Sequence Length Distribution
 - Sequence Duplication Levels
 - Overrepresented sequences
 - Kmer Content

 	</help>
 </tool>
	#!/usr/bin/env python

	'''
	FastQC checker for Galaxy biomedical data analysis platform

	@author: Ilya Sytchev

	Input: one or more files in fastq format
	Output: sequencing quality report in text format

	Requires FastQC 0.10.0 (http://www.bioinformatics.bbsrc.ac.uk/projects/fastqc/)

	Partially based on:
	fastqcwrapper (http://toolshed.g2.bx.psu.edu/repos/jjohnson/fastqc)
	rgFastQC (https://bitbucket.org/galaxy/galaxy-dist/src/tip/tools/rgenetics/rgFastQC.py)

	Tested with Python 2.6.1 and 2.7.2 on Mac OS 10.6.8
	'''

	import sys, os, optparse, tempfile, shutil, subprocess

	def stop_err(msg, returncode=1):
	sys.stderr.write(msg)
	sys.exit(returncode)

	def __main__():
	usage = "Usage: %prog -e fastqc_executable -o output_file fastq_file [fastq_file ... ]"
	version = "%prog 1.0.0"
	op = optparse.OptionParser(usage=usage, version=version)
	op.add_option('-e', '--executable', dest="executable", help="location of the FastQC program")
	op.add_option('-o', '--output', dest="outfile", help="location of the output file")
	(options, infiles) = op.parse_args()

	# check if location of the FastQC program was provided
	if options.executable == None:
	op.error("Missing location of FastQC")

	# check if FastQC program exists at the provided location
	if not os.path.isfile(options.executable):
	op.error("Cannot find FastQC at %s" % options.executable)

	# check if any input files were provided
	if infiles == None:
	op.error("Missing input files")

	# check if all input files exist
	for f in infiles:
	if not os.path.isfile(f):
	op.error("Cannot find input file %s" % f)

	# check if output file was provided
	if options.outfile == None:
	op.error("Missing output file name")

	# assemble FastQC command line
	cmd = [] # list is more secure than string for subprocess call
	cmd.append(options.executable)
	tmpdir = tempfile.mkdtemp() # create temp dir for FastQC output
	cmd.extend(['-o', tmpdir])
	cmd.extend(infiles)

	# prepare files for FastQC stdout and stderr
	tmp_stderr_name = tempfile.NamedTemporaryFile(dir=tmpdir, suffix='.err').name
	tmp_stderr = open(tmp_stderr_name, 'w')
	tmp_stdout_name = tempfile.NamedTemporaryFile(dir=tmpdir, suffix='.out').name
	tmp_stdout = open(tmp_stdout_name, 'w')
	# run FastQC
	try:
	subprocess.check_call(cmd, stderr=tmp_stderr.fileno(), stdout=tmp_stdout.fileno())
	except subprocess.CalledProcessError as e:
	stop_err("Error executing FastQC\n", e.returncode)
	finally:
	tmp_stderr.close()
	tmp_stdout.close()

	outfile = open(options.outfile, 'w')

	# parse all summary.txt files produced by FastQC and write results into the output file
	for f in infiles:
	filename = os.path.basename(f)
	(datasetname, extension) = os.path.splitext(filename)
	# Need to account for FastQC removing .fastq extension from input file names before using them to create output file and dir names
	# Alternative solution is to iterate over report directories instead of input file names
	if extension == '.fastq':
	summaryfilename = os.path.join(tmpdir, datasetname + '_fastqc', 'summary.txt')
	else:
	summaryfilename = os.path.join(tmpdir, filename + '_fastqc', 'summary.txt')
	outfile.write("%s results:\n" % datasetname)
	# if summary file exists, process and add results to the output file
	if os.path.isfile(summaryfilename):
	summaryfile = open(summaryfilename, 'r')
	for line in summaryfile:
	(result, test) = line.split('\t')[:2]
	outfile.write(result + '\t' + test + '\n')
	summaryfile.close()
	else:
	outfile.write("FastQC summary report was not found at %s.\n" % summaryfilename)
	outfile.write("\n")

	outfile.close()

	# clean up temp dir, put in a try block so we don't fail on stale nfs handles
	try:
	if os.path.exists(tmpdir):
	shutil.rmtree(tmpdir)
	except:
	pass

	if __name__ == '__main__': __main__()
	<tool id="fastqc_checker_1" name="FastQC checker" version="1.0.0">
	<description>for quality control of high throughput sequence data</description>

	<command interpreter="python">
	fastqc_checker.py -e ${GALAXY_DATA_INDEX_DIR}/shared/jars/FastQC/fastqc -o $output $input
	</command>

	<inputs>
	<param format="fastq" name="input" type="data" label="Source files" multiple="true"/>
	</inputs>
	<outputs>
	<data format="txt" name="output"/>
	</outputs>

	<help>

	.. class:: infomark

	Purpose

	FastQC aims to provide a simple way to do some quality control checks on raw
	sequence data coming from high throughput sequencing pipelines.
	It provides a modular set of analyses which you can use to give a quick
	impression of whether your data has any problems of
	which you should be aware before doing any further analysis.

	FastQC documentation

	This is a Galaxy interface to the external package FastQC_.
	Specific documentation on FastQC can be found on that site.

	.. _FastQC: http://www.bioinformatics.bbsrc.ac.uk/projects/fastqc/

	-----

	.. class:: infomark

	Inputs and outputs

	This wrapper will accept one or more fastq files.

	The tool produces a single output file that contains summary of all the results, including the following:

	- Basic Statistics
	- Per base sequence quality
	- Per sequence quality scores
	- Per base sequence content
	- Per base GC content
	- Per sequence GC content
	- Per base N content
	- Sequence Length Distribution
	- Sequence Duplication Levels
	- Overrepresented sequences
	- Kmer Content

	</help>
	</tool>