Skip to content

Instantly share code, notes, and snippets.

@MarcSaric
Forked from daler/classify_reads.py
Created June 10, 2018 19:19
Show Gist options
  • Save MarcSaric/2f7453be3f01d1dfe69995fa33b31f55 to your computer and use it in GitHub Desktop.
Save MarcSaric/2f7453be3f01d1dfe69995fa33b31f55 to your computer and use it in GitHub Desktop.
classify reads using HTSeq
#!/usr/bin/python
# GPLv3 license
# Copyright 2011 Ryan Dale, all rights reserved
# [email protected]
import HTSeq
import argparse
import sys
import time
from collections import defaultdict
def classify(gff, bam, chroms=None, include=None, exclude=None,
stranded=False, verbose=False):
"""
Classify reads in *bam* based on featuretype as annotated in *gff*.
Default is to consider all featuretypes annotated; restrict these by
passing a list of featuretypes as *include* or *exclude*.
*chroms* is a dictionary of chromsizes to be passed to
HTSeq.GenomicArrayOfSets, or, if it's a string and you have `pybedtools`
installed, it will use the chromsizes for that assembly. If *chroms* is
None, then it will be set to 'auto', which lets each chromosome go to
infnity.
Observed sequence space of each class is reported as well. Using 'auto'
will cause your unannotated sequence space to be huge.
"""
t0 = time.time()
if verbose:
sys.stderr.write('Parsing GFF...')
sys.stderr.flush()
gff = HTSeq.GFF_Reader(gff)
bam = HTSeq.BAM_Reader(bam)
if chroms is None:
chroms = 'auto'
# Get chromsizes dict using pybedtools if a string was specified
if isinstance(chroms, basestring) and chroms != 'auto':
try:
import pybedtools
except ImportError:
sys.stderr.write("Can't import pybedtools to get chromsizes")
raise
pbt_chroms = pybedtools.chromsizes(chroms)
chroms = {}
for key, val in pbt_chroms.items():
chroms[key] = val[1]
# Init the data structure that is key to all of this:
gaos = HTSeq.GenomicArrayOfSets(chroms, stranded=stranded)
# Parse the GFF file, only adding the featuretypes specified
if include and exclude:
raise ValueError('Both include and exclude cannot be specified')
if include:
for feature in gff:
if feature.type in include:
try:
gaos[feature.iv] += feature.type
except IndexError:
# out of range
pass
except KeyError:
# missing chrom
pass
if exclude:
for feature in gff:
if feature.type not in exclude:
try:
gaos[feature.iv] += feature.type
except IndexError:
# out of range
pass
except KeyError:
# missing chrom
pass
if not include and not exclude:
for feature in gff:
try:
gaos[feature.iv] += feature.type
except IndexError:
# out of range
pass
except KeyError:
# missing chrom
pass
if verbose:
sys.stderr.write('(%.1fs)\n' % (time.time() - t0))
t0 = time.time()
sys.stderr.write('Classifying reads...')
sys.stderr.flush()
# Iterate through BAM and create the set of classes that overlap the read.
#
# TODO: paired-end version?
results = defaultdict(int)
for read in bam:
intersection_set = None
for iv, step_set in gaos[read.iv].steps():
if intersection_set is None:
intersection_set = step_set.copy()
else:
intersection_set.intersection_update(step_set)
classes = list(intersection_set)
# Add read to the "all" categories
for cls in classes:
cls += '_all'
results[cls] += 1
# this gets a stable, hashable key for the set of classes
classes = ';'.join(sorted(classes))
results[classes] += 1
results['total'] += 1
if verbose:
sys.stderr.write('(%.1fs)\n' % (time.time() - t0))
t0 = time.time()
sys.stderr.write('Calculating sequence space...')
sys.stderr.flush()
# Get sequence space by iterating over the G.A.O.S.
seq_space = defaultdict(int)
for iv, classes in gaos.steps():
classes = list(classes)
# The "all" categories
for cls in classes:
cls += '_all'
seq_space[cls] += iv.length
cls = ';'.join(sorted(classes))
seq_space[cls] += iv.length
seq_space['total'] += iv.length
if verbose:
sys.stderr.write('(%.1fs)\n\n' % (time.time() - t0))
sys.stderr.flush()
try:
results['unannotated'] = results.pop('')
except KeyError:
results['unannotated'] = 0
try:
seq_space['unannotated'] = seq_space.pop('')
except KeyError:
seq_space['unannotated'] = 0
return results, seq_space
def main():
ap = argparse.ArgumentParser()
ap.add_argument('--gff', help='GFF or GTF file')
ap.add_argument('--bam', help='BAM file of reads to classify')
ap.add_argument('--include', nargs='*', help='Featuretypes to include')
ap.add_argument('--exclude', nargs='*', help='Featuretypes to exclude')
ap.add_argument('--stranded', action='store_true',
help='Use stranded classification')
ap.add_argument('--assembly', help='Genome assembly (requires pybedtools)')
ap.add_argument('-v', '--verbose', action='store_true',
help='Verbose mode')
args = ap.parse_args()
results, seq_space = classify(gff=args.gff,
bam=args.bam,
include=args.include,
exclude=args.exclude,
verbose=args.verbose,
stranded=args.stranded,
chroms=args.assembly)
# Show most abundant first
sorted_results = sorted(results.items(), key=lambda x: x[1], reverse=True)
for key, val in sorted_results:
sys.stdout.write('%s\t%s\t%s\n' % (key, val, seq_space[key]))
if __name__ == "__main__":
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment