aaronsaunders · October 27, 2013 20:25
diff --git a/biofileformat.py b/biofileformat.py
 """
 Mappings from file extensions to biopython types.

 Copied from Erik Matsens seqmagick https://github.com/fhcrc/seqmagick/

 """
 import argparse
 import contextlib
 import copy
 import functools
 import os
 import os.path
 import signal
 import sys
 import tempfile

 import bz2
 import gzip
 import os.path
 import sys

 # Define mappings in a dictionary with extension : BioPython_file_type.
 EXTENSION_TO_TYPE = {'.aln': 'clustal',
                     '.afa': 'fasta',
                     '.fa': 'fasta',
                     '.faa': 'fasta',
                     '.fas': 'fasta',
                     '.fasta': 'fasta',
                     '.fastq': 'fastq',
                     '.fq': 'fastq',
                     '.ffn': 'fasta',
                     '.fna': 'fasta',
                     '.frn': 'fasta',
                     '.gb': 'genbank',
                     '.gbk': 'genbank',
                     '.needle': 'emboss',
                     '.nex': 'nexus',
                     '.phy': 'phylip',
                     '.phylip': 'phylip',
                     '.phyx': 'phylip-relaxed',
                     '.qual': 'qual',
                     '.sff': 'sff-trim',
                     '.sth': 'stockholm',
                     '.sto': 'stockholm',}

 COMPRESS_EXT = {'.bz2': bz2.BZ2File, '.gz': gzip.open, '.bz': bz2.BZ2File}


 class UnknownExtensionError(ValueError):
    pass


 def from_extension(extension):
    """
    Look up the BioPython file type corresponding with input extension.

    Look up is case insensitive.
    """
    if not extension.startswith('.'):
        raise ValueError("Extensions must begin with a period.")
    try:
        return EXTENSION_TO_TYPE[extension.lower()]
    except KeyError:
        raise UnknownExtensionError("seqmagick does not know how to handle " +
                "files with extensions like this: " + extension)


 def from_filename(file_name):
    """
    Look up the BioPython file type corresponding to an input file name.
    """
    base, extension = os.path.splitext(file_name)
    if extension in COMPRESS_EXT:
        # Compressed file
        extension = os.path.splitext(base)[1]
    return from_extension(extension)

 def from_handle(fh, stream_default='fasta'):
    """
    Look up the BioPython file type corresponding to a file-like object.

    For stdin, stdout, and stderr, ``stream_default`` is used.
    """
    if fh in (sys.stdin, sys.stdout, sys.stderr):
        return stream_default
    return from_filename(fh.name)


 class FileType(object):
    """
    Near clone of argparse.FileType, supporting gzip and bzip
    """
    def __init__(self, mode='r'):
        self.mode = mode
        self.ext_map = COMPRESS_EXT.copy()

    def _get_handle(self, file_path):
        ext = os.path.splitext(file_path)[1].lower()
        return self.ext_map.get(ext, open)(file_path, self.mode)

    def __call__(self, string):
        if string == '-':
            if 'r' in self.mode:
                return sys.stdin
            elif 'w' in self.mode:
                return sys.stdout
            else:
                raise ValueError("Invalid mode: {0}".format(string))
        else:
            return self._get_handle(string)
diff --git a/plot_lengths.py b/plot_lengths.py
 from Bio.SeqIO import parse
 from toolbox import biofileformat
 import matplotlib.pyplot as plt
 import sys
 from collections import Counter
 import fileinput
   
 def get_lengths(fname, sub=False):
    """
    Parses a sequence file and returns a list of sequence lengths
    """
    with biofileformat.FileType('rb')(fname) as fh:
        seqformat = biofileformat.from_handle(fh)
          
        okformats = [ "fasta", "fastq" ]
        if seqformat not in okformats:
            print "takes only fasta/fastq w/wo compression"
            return
        
        if sub:
            lengths = []
            for n, rec in enumerate(parse(fh, seqformat)):
                sizes.append(len(rec))
                if n == (sub - 1):
                    break
        else:
            lengths = [len(rec) for rec in parse(fh, seqformat)]
    
    return lengths

 def draw_histogram(lengths):
    """
    Parses a sequence file and returns a plot of sequence lengths.
    Optional argument to subset the file.
    
    """
    plt.hist(lengths)
    plt.title("%s\n%i sequences, range %i to %i bp" \
                % (fname, len(lengths), min(lengths),max(lengths)))
    plt.xlabel("Sequence length (bp)")
    plt.ylabel("Count")
    plt.show()

    return
    
    
 def plot_seqlength(fname, sub=False):
    draw_histogram(get_lengths(fname, sub=False))
    return
   
 def main(args):
    usage = "Usage: plot_lengths.py infile"
    
    if args[0] == "-h":
        print usage
        sys.exit()
    
    for fname in args:
        lengths = get_lengths(fname)
        counts = Counter()
        for l in lengths:
            counts[l] += 1
        outlines = [ '{}\t{}'.format( key, counts[key]) 
                        for key in sorted(counts.keys()) ]
        
        sys.stdout.write('length\tcount\n')
        sys.stdout.write('\n'.join(outlines))
        sys.stdout.write('\n')

 if __name__ == '__main__':
    main(sys.argv[1:])
	"""
	Mappings from file extensions to biopython types.

	Copied from Erik Matsens seqmagick https://github.com/fhcrc/seqmagick/

	"""
	import argparse
	import contextlib
	import copy
	import functools
	import os
	import os.path
	import signal
	import sys
	import tempfile

	import bz2
	import gzip
	import os.path
	import sys

	# Define mappings in a dictionary with extension : BioPython_file_type.
	EXTENSION_TO_TYPE = {'.aln': 'clustal',
	'.afa': 'fasta',
	'.fa': 'fasta',
	'.faa': 'fasta',
	'.fas': 'fasta',
	'.fasta': 'fasta',
	'.fastq': 'fastq',
	'.fq': 'fastq',
	'.ffn': 'fasta',
	'.fna': 'fasta',
	'.frn': 'fasta',
	'.gb': 'genbank',
	'.gbk': 'genbank',
	'.needle': 'emboss',
	'.nex': 'nexus',
	'.phy': 'phylip',
	'.phylip': 'phylip',
	'.phyx': 'phylip-relaxed',
	'.qual': 'qual',
	'.sff': 'sff-trim',
	'.sth': 'stockholm',
	'.sto': 'stockholm',}

	COMPRESS_EXT = {'.bz2': bz2.BZ2File, '.gz': gzip.open, '.bz': bz2.BZ2File}


	class UnknownExtensionError(ValueError):
	pass


	def from_extension(extension):
	"""
	Look up the BioPython file type corresponding with input extension.

	Look up is case insensitive.
	"""
	if not extension.startswith('.'):
	raise ValueError("Extensions must begin with a period.")
	try:
	return EXTENSION_TO_TYPE[extension.lower()]
	except KeyError:
	raise UnknownExtensionError("seqmagick does not know how to handle " +
	"files with extensions like this: " + extension)


	def from_filename(file_name):
	"""
	Look up the BioPython file type corresponding to an input file name.
	"""
	base, extension = os.path.splitext(file_name)
	if extension in COMPRESS_EXT:
	# Compressed file
	extension = os.path.splitext(base)[1]
	return from_extension(extension)

	def from_handle(fh, stream_default='fasta'):
	"""
	Look up the BioPython file type corresponding to a file-like object.

	For stdin, stdout, and stderr, ``stream_default`` is used.
	"""
	if fh in (sys.stdin, sys.stdout, sys.stderr):
	return stream_default
	return from_filename(fh.name)


	class FileType(object):
	"""
	Near clone of argparse.FileType, supporting gzip and bzip
	"""
	def __init__(self, mode='r'):
	self.mode = mode
	self.ext_map = COMPRESS_EXT.copy()

	def _get_handle(self, file_path):
	ext = os.path.splitext(file_path)[1].lower()
	return self.ext_map.get(ext, open)(file_path, self.mode)

	def __call__(self, string):
	if string == '-':
	if 'r' in self.mode:
	return sys.stdin
	elif 'w' in self.mode:
	return sys.stdout
	else:
	raise ValueError("Invalid mode: {0}".format(string))
	else:
	return self._get_handle(string)
	from Bio.SeqIO import parse
	from toolbox import biofileformat
	import matplotlib.pyplot as plt
	import sys
	from collections import Counter
	import fileinput

	def get_lengths(fname, sub=False):
	"""
	Parses a sequence file and returns a list of sequence lengths
	"""
	with biofileformat.FileType('rb')(fname) as fh:
	seqformat = biofileformat.from_handle(fh)

	okformats = [ "fasta", "fastq" ]
	if seqformat not in okformats:
	print "takes only fasta/fastq w/wo compression"
	return

	if sub:
	lengths = []
	for n, rec in enumerate(parse(fh, seqformat)):
	sizes.append(len(rec))
	if n == (sub - 1):
	break
	else:
	lengths = [len(rec) for rec in parse(fh, seqformat)]

	return lengths

	def draw_histogram(lengths):
	"""
	Parses a sequence file and returns a plot of sequence lengths.
	Optional argument to subset the file.

	"""
	plt.hist(lengths)
	plt.title("%s\n%i sequences, range %i to %i bp" \
	% (fname, len(lengths), min(lengths),max(lengths)))
	plt.xlabel("Sequence length (bp)")
	plt.ylabel("Count")
	plt.show()

	return


	def plot_seqlength(fname, sub=False):
	draw_histogram(get_lengths(fname, sub=False))
	return

	def main(args):
	usage = "Usage: plot_lengths.py infile"

	if args[0] == "-h":
	print usage
	sys.exit()

	for fname in args:
	lengths = get_lengths(fname)
	counts = Counter()
	for l in lengths:
	counts[l] += 1
	outlines = [ '{}\t{}'.format( key, counts[key])
	for key in sorted(counts.keys()) ]

	sys.stdout.write('length\tcount\n')
	sys.stdout.write('\n'.join(outlines))
	sys.stdout.write('\n')

	if __name__ == '__main__':
	main(sys.argv[1:])