Created
April 1, 2015 10:06
-
-
Save pjbriggs/6229017c8dba994dd213 to your computer and use it in GitHub Desktop.
Given a GTF file, check that for each exon entry the associated features 'CDS', 'stop_codon' and 'start_codon' are consistent and specify the same range of bases as # their "parent" exon
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/env python | |
# | |
# Check exon coverage in a GTF file | |
# Peter Briggs UoM, March 2015 | |
# | |
# Purpose | |
# ------- | |
# Given a GTF file, check that for each exon entry the | |
# associated features 'CDS', 'stop_codon' and 'start_codon' | |
# are consistent and specify the same range of bases as | |
# their "parent" exon. For example: | |
# DS499594 protein_coding exon 7780 9931 . - ... | |
# DS499594 protein_coding CDS 7780 9931 . - ... | |
# DS499594 protein_coding start_codon 9929 9931 . ... | |
# | |
# Assumptions | |
# ----------- | |
# The program assumes: | |
# 1. An 'exon' entry always preceeds the entries for the | |
# associated features | |
# 2. Only 'protein_coding' exons are of interest | |
# | |
# Usage | |
# ----- | |
# check_gtf_exon_coverage.py FILE.gtf | |
# | |
# Outputs | |
# ------- | |
# For each exon encountered, report the following issues (if | |
# discovered): | |
# | |
# - Multiple features e.g. more than one CDS, stop_codon etc | |
# appears to be associated with the exon entry | |
# - Exons where the total range covered by the associated | |
# features is not the same as the extent of the exon | |
# - One or more features appear to be outside the exon | |
# | |
# If the exon and features are consistent then report nothing. | |
# | |
# Setup | |
# ----- | |
# Needs the bcftbx library from | |
# https://github.com/fls-bioinformatics-core/genomics | |
# and the GTFFile and related modules from | |
# https://github.com/fls-bioinformatics-core/GFFUtils | |
# | |
# You can 'git clone' both modules and then add the resulting | |
# directories to your PYTHONPATH. | |
# | |
import sys | |
import optparse | |
import GTFFile | |
class GTFFeature: | |
# Class for storing information on a generic GTF | |
# feature | |
# | |
# Set the 'feature' property after instantiation | |
# e.g. f.feature = 'CDS' | |
def __init__(self,start,end,strand): | |
self.feature = 'feature' | |
self.start = start | |
self.end = end | |
self.strand = strand | |
def contains(self,start,end): | |
# Return True if feature completely contains | |
# the region specified by 'start' and 'end' | |
return self.start <= start and self.end >= end | |
def __repr__(self): | |
return "%s: %d-%d %s" % (self.feature.upper(), | |
self.start, | |
self.end, | |
self.strand) | |
class Exon(GTFFeature): | |
# Class for storing information on an exon | |
# | |
# This is a sublass of the GTFFeature class | |
def __init__(self,*args): | |
GTFFeature.__init__(self,*args) | |
self.feature = 'exon' | |
def check_exon(exon,exon_line,features): | |
# Perform consistency checks on an exon given a list | |
# of associated features | |
# | |
# Arguments: | |
# - exon : 'Exon' class instance | |
# - exon_line: line number in the input GTF where | |
# the exon is declared | |
# - features: list of 'GTFFeature' instances for | |
# each of the associated features | |
# | |
# Performs checks and reports to stdout if there are | |
# any issues. | |
# | |
bad_entry = False | |
# Check there are features | |
if len(features) == 0: | |
print "WARNING no features for exon" | |
bad_entry = True | |
# Check the features are all inside the exon | |
for ft in features: | |
if not exon.contains(ft.start,ft.end): | |
print "WARNING %s is not inside preceeding exon" % ft.feature | |
bad_entry = True | |
# Check for duplicated features | |
for ft_type in ('CDS','start_codon','stop_codon'): | |
count = 0 | |
for ft in features: | |
if ft.feature == ft_type: count += 1 | |
if count > 1: | |
print "WARNING multiple %ss" % ft_type | |
bad_entry = True | |
# Check that the features cover the exon | |
# Get min and max positions | |
min_pos = None | |
max_pos = None | |
for ft in features: | |
if min_pos is None: | |
min_pos = ft.start | |
if max_pos is None: | |
max_pos = ft.end | |
min_pos = min(min_pos,ft.start) | |
max_pos = max(max_pos,ft.end) | |
if min_pos != exon.start or max_pos != exon.end: | |
print "WARNING features don't cover the preceeding exon" | |
bad_entry = True | |
# Report "bad" entry | |
if bad_entry: | |
print "%s (L%d)" % (exon,exon_line) | |
if min_pos is not None and max_pos is not None: | |
print "Features cover %d-%d" % (min_pos,max_pos) | |
if len(features): | |
print "List of associated features:" | |
for ft in features: | |
print "- %s" % ft | |
print "-"*60 | |
if __name__ == '__main__': | |
# Handle command line | |
p = optparse.OptionParser(usage="%prog GTFFILE", | |
description="Check CDS and start/stop_exon entries " | |
"span exon entries") | |
opts,args = p.parse_args() | |
if len(args) != 1: | |
p.error("Need to supply GTF file") | |
# Process GTF file | |
exon = None | |
exon_line = None | |
features = [] | |
for line in GTFFile.GTFIterator(args[0]): | |
# For each line in file: | |
# - when a new exon is encountered | |
# * process any preceeding exon data and report, then discard | |
# * store the new exon data | |
# - for other features, add these to the list of features | |
# associated with that exon | |
#print str(line) | |
if line['source'] != 'protein_coding': | |
continue | |
if line['feature'] in ('CDS','start_codon','stop_codon'): | |
if exon is None: | |
raise Exception("No exon defined!") | |
ft = GTFFeature(line['start'],line['end'],line['strand']) | |
ft.feature = line['feature'] | |
features.append(ft) | |
#print "- %s: %d-%d %s" % (f.feature,f.start,f.end,f.strand) | |
if line['feature'] == 'exon': | |
if exon is not None: | |
check_exon(exon,exon_line,features) | |
# Reset for next exon | |
exon = Exon(line['start'],line['end'],line['strand']) | |
exon_line = line.lineno() | |
features = [] | |
#print exon | |
# Check last exon | |
check_exon(exon,exon_line,features) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment