Created
May 26, 2011 14:36
-
-
Save daler/993273 to your computer and use it in GitHub Desktop.
classify reads using HTSeq
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# GPLv3 license | |
# Copyright 2011 Ryan Dale, all rights reserved | |
# [email protected] | |
import HTSeq | |
import argparse | |
import sys | |
import time | |
from collections import defaultdict | |
def classify(gff, bam, chroms=None, include=None, exclude=None,
             stranded=False, verbose=False):
    """
    Classify reads in *bam* based on featuretype as annotated in *gff*.

    Default is to consider all featuretypes annotated; restrict these by
    passing a list of featuretypes as *include* or *exclude* (mutually
    exclusive -- providing both raises ValueError).

    *chroms* is a dictionary of chromsizes to be passed to
    HTSeq.GenomicArrayOfSets, or, if it's a string and you have `pybedtools`
    installed, it will use the chromsizes for that assembly. If *chroms* is
    None, then it will be set to 'auto', which lets each chromosome go to
    infinity.

    Observed sequence space of each class is reported as well. Using 'auto'
    will cause your unannotated sequence space to be huge.

    Returns a 2-tuple ``(results, seq_space)`` of dicts keyed by class label
    (semicolon-joined sorted featuretypes, per-type ``<type>_all`` totals,
    ``'unannotated'`` and ``'total'``); *results* holds read counts,
    *seq_space* holds bp of annotated sequence.
    """
    # Validate before doing any expensive parsing.
    if include and exclude:
        raise ValueError('Both include and exclude cannot be specified')
    t0 = time.time()
    if verbose:
        sys.stderr.write('Parsing GFF...')
        sys.stderr.flush()
    gff = HTSeq.GFF_Reader(gff)
    bam = HTSeq.BAM_Reader(bam)
    if chroms is None:
        chroms = 'auto'
    # Get chromsizes dict using pybedtools if a string was specified.
    # NOTE: `basestring` keeps this Python 2 compatible, matching the rest
    # of the file.
    if isinstance(chroms, basestring) and chroms != 'auto':
        try:
            import pybedtools
        except ImportError:
            sys.stderr.write("Can't import pybedtools to get chromsizes")
            raise
        # pybedtools chromsizes values are (start, stop) tuples; HTSeq only
        # needs the stop coordinate.
        chroms = dict((key, val[1])
                      for key, val in pybedtools.chromsizes(chroms).items())
    # Init the data structure that is key to all of this:
    gaos = HTSeq.GenomicArrayOfSets(chroms, stranded=stranded)
    # Build a single keep-predicate instead of duplicating the parse loop
    # for the include / exclude / unrestricted cases.
    if include:
        keep = lambda featuretype: featuretype in include
    elif exclude:
        keep = lambda featuretype: featuretype not in exclude
    else:
        keep = lambda featuretype: True
    for feature in gff:
        if not keep(feature.type):
            continue
        try:
            gaos[feature.iv] += feature.type
        except (IndexError, KeyError):
            # Feature out of range, or on a chromosome missing from
            # *chroms*; skip it (best-effort, as before).
            pass
    if verbose:
        sys.stderr.write('(%.1fs)\n' % (time.time() - t0))
        t0 = time.time()
        sys.stderr.write('Classifying reads...')
        sys.stderr.flush()
    # Iterate through BAM and create the set of classes that overlap the read.
    #
    # TODO: paired-end version?
    results = defaultdict(int)
    for read in bam:
        intersection_set = None
        for iv, step_set in gaos[read.iv].steps():
            if intersection_set is None:
                intersection_set = step_set.copy()
            else:
                intersection_set.intersection_update(step_set)
        if intersection_set is None:
            # No steps at all (e.g. zero-length interval): treat as
            # unannotated rather than crashing on list(None).
            intersection_set = set()
        classes = list(intersection_set)
        # Add read to the "all" categories
        for cls in classes:
            results[cls + '_all'] += 1
        # this gets a stable, hashable key for the set of classes
        results[';'.join(sorted(classes))] += 1
        results['total'] += 1
    if verbose:
        sys.stderr.write('(%.1fs)\n' % (time.time() - t0))
        t0 = time.time()
        sys.stderr.write('Calculating sequence space...')
        sys.stderr.flush()
    # Get sequence space by iterating over the G.A.O.S.
    seq_space = defaultdict(int)
    for iv, classes in gaos.steps():
        classes = list(classes)
        # The "all" categories
        for cls in classes:
            seq_space[cls + '_all'] += iv.length
        seq_space[';'.join(sorted(classes))] += iv.length
        seq_space['total'] += iv.length
    if verbose:
        sys.stderr.write('(%.1fs)\n\n' % (time.time() - t0))
        sys.stderr.flush()
    # The empty-string key is "read/region overlapped no annotated
    # featuretype"; relabel it for the report.
    results['unannotated'] = results.pop('', 0)
    seq_space['unannotated'] = seq_space.pop('', 0)
    return results, seq_space
def main():
    """Command-line entry point.

    Parses arguments, runs :func:`classify`, and writes a tab-separated
    report to stdout: one line per class with its read count and observed
    sequence space, most abundant class first.
    """
    ap = argparse.ArgumentParser(
        description='Classify reads in a BAM file by GFF/GTF featuretype')
    # --gff and --bam are required: without them HTSeq would receive None
    # and fail with a confusing error far from the real problem.
    ap.add_argument('--gff', required=True, help='GFF or GTF file')
    ap.add_argument('--bam', required=True,
                    help='BAM file of reads to classify')
    ap.add_argument('--include', nargs='*', help='Featuretypes to include')
    ap.add_argument('--exclude', nargs='*', help='Featuretypes to exclude')
    ap.add_argument('--stranded', action='store_true',
                    help='Use stranded classification')
    ap.add_argument('--assembly', help='Genome assembly (requires pybedtools)')
    ap.add_argument('-v', '--verbose', action='store_true',
                    help='Verbose mode')
    args = ap.parse_args()
    results, seq_space = classify(gff=args.gff,
                                  bam=args.bam,
                                  include=args.include,
                                  exclude=args.exclude,
                                  verbose=args.verbose,
                                  stranded=args.stranded,
                                  chroms=args.assembly)
    # Show most abundant first
    sorted_results = sorted(results.items(), key=lambda x: x[1], reverse=True)
    for key, val in sorted_results:
        sys.stdout.write('%s\t%s\t%s\n' % (key, val, seq_space[key]))


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment