Created
October 27, 2013 20:25
-
-
Save aaronsaunders/7187482 to your computer and use it in GitHub Desktop.
file parsing and length plotting
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Mappings from file extensions to biopython types. | |
Copied from Erik Matsens seqmagick https://github.com/fhcrc/seqmagick/ | |
""" | |
import argparse | |
import contextlib | |
import copy | |
import functools | |
import os | |
import os.path | |
import signal | |
import sys | |
import tempfile | |
import bz2 | |
import gzip | |
import os.path | |
import sys | |
# Define mappings in a dictionary with extension : BioPython_file_type. | |
EXTENSION_TO_TYPE = {'.aln': 'clustal', | |
'.afa': 'fasta', | |
'.fa': 'fasta', | |
'.faa': 'fasta', | |
'.fas': 'fasta', | |
'.fasta': 'fasta', | |
'.fastq': 'fastq', | |
'.fq': 'fastq', | |
'.ffn': 'fasta', | |
'.fna': 'fasta', | |
'.frn': 'fasta', | |
'.gb': 'genbank', | |
'.gbk': 'genbank', | |
'.needle': 'emboss', | |
'.nex': 'nexus', | |
'.phy': 'phylip', | |
'.phylip': 'phylip', | |
'.phyx': 'phylip-relaxed', | |
'.qual': 'qual', | |
'.sff': 'sff-trim', | |
'.sth': 'stockholm', | |
'.sto': 'stockholm',} | |
COMPRESS_EXT = {'.bz2': bz2.BZ2File, '.gz': gzip.open, '.bz': bz2.BZ2File} | |
class UnknownExtensionError(ValueError): | |
pass | |
def from_extension(extension): | |
""" | |
Look up the BioPython file type corresponding with input extension. | |
Look up is case insensitive. | |
""" | |
if not extension.startswith('.'): | |
raise ValueError("Extensions must begin with a period.") | |
try: | |
return EXTENSION_TO_TYPE[extension.lower()] | |
except KeyError: | |
raise UnknownExtensionError("seqmagick does not know how to handle " + | |
"files with extensions like this: " + extension) | |
def from_filename(file_name): | |
""" | |
Look up the BioPython file type corresponding to an input file name. | |
""" | |
base, extension = os.path.splitext(file_name) | |
if extension in COMPRESS_EXT: | |
# Compressed file | |
extension = os.path.splitext(base)[1] | |
return from_extension(extension) | |
def from_handle(fh, stream_default='fasta'): | |
""" | |
Look up the BioPython file type corresponding to a file-like object. | |
For stdin, stdout, and stderr, ``stream_default`` is used. | |
""" | |
if fh in (sys.stdin, sys.stdout, sys.stderr): | |
return stream_default | |
return from_filename(fh.name) | |
class FileType(object): | |
""" | |
Near clone of argparse.FileType, supporting gzip and bzip | |
""" | |
def __init__(self, mode='r'): | |
self.mode = mode | |
self.ext_map = COMPRESS_EXT.copy() | |
def _get_handle(self, file_path): | |
ext = os.path.splitext(file_path)[1].lower() | |
return self.ext_map.get(ext, open)(file_path, self.mode) | |
def __call__(self, string): | |
if string == '-': | |
if 'r' in self.mode: | |
return sys.stdin | |
elif 'w' in self.mode: | |
return sys.stdout | |
else: | |
raise ValueError("Invalid mode: {0}".format(string)) | |
else: | |
return self._get_handle(string) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from Bio.SeqIO import parse | |
from toolbox import biofileformat | |
import matplotlib.pyplot as plt | |
import sys | |
from collections import Counter | |
import fileinput | |
def get_lengths(fname, sub=False): | |
""" | |
Parses a sequence file and returns a list of sequence lengths | |
""" | |
with biofileformat.FileType('rb')(fname) as fh: | |
seqformat = biofileformat.from_handle(fh) | |
okformats = [ "fasta", "fastq" ] | |
if seqformat not in okformats: | |
print "takes only fasta/fastq w/wo compression" | |
return | |
if sub: | |
lengths = [] | |
for n, rec in enumerate(parse(fh, seqformat)): | |
sizes.append(len(rec)) | |
if n == (sub - 1): | |
break | |
else: | |
lengths = [len(rec) for rec in parse(fh, seqformat)] | |
return lengths | |
def draw_histogram(lengths): | |
""" | |
Parses a sequence file and returns a plot of sequence lengths. | |
Optional argument to subset the file. | |
""" | |
plt.hist(lengths) | |
plt.title("%s\n%i sequences, range %i to %i bp" \ | |
% (fname, len(lengths), min(lengths),max(lengths))) | |
plt.xlabel("Sequence length (bp)") | |
plt.ylabel("Count") | |
plt.show() | |
return | |
def plot_seqlength(fname, sub=False): | |
draw_histogram(get_lengths(fname, sub=False)) | |
return | |
def main(args): | |
usage = "Usage: plot_lengths.py infile" | |
if args[0] == "-h": | |
print usage | |
sys.exit() | |
for fname in args: | |
lengths = get_lengths(fname) | |
counts = Counter() | |
for l in lengths: | |
counts[l] += 1 | |
outlines = [ '{}\t{}'.format( key, counts[key]) | |
for key in sorted(counts.keys()) ] | |
sys.stdout.write('length\tcount\n') | |
sys.stdout.write('\n'.join(outlines)) | |
sys.stdout.write('\n') | |
if __name__ == '__main__': | |
main(sys.argv[1:]) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment