Created
April 13, 2012 22:11
-
-
Save cbare/2380465 to your computer and use it in GitHub Desktop.
Extract features from GFF
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
############################################################ | |
## A hacky script to extract features from a GFF3 file ## | |
## and output them in a format compatible with the ## | |
## Gaggle genome browser. ## | |
## ## | |
## J. Christopher Bare, March 2012 ## | |
## ## | |
############################################################ | |
# I got the gene annotations in GFF format from here: | |
# http://tritrypdb.org/common/downloads/Current_Release/Lbraziliensis/gff/ | |
# http://tritrypdb.org/common/downloads/Current_Release/Lmajor/gff/ | |
# http://tritrypdb.org/common/downloads/Current_Release/Linfantum/gff/ | |
# Description of the GFF format: | |
# http://gmod.org/wiki/GFF | |
import sys | |
import re | |
filename = sys.argv[1] | |
counter = {} | |
features = {} | |
class Feature: | |
def __init__(self,sequence,strand,start,end,name,common_name,type,parent_id): | |
self.sequence=sequence | |
self.strand=strand | |
self.start=start | |
self.end=end | |
self.name=name | |
self.common_name=common_name | |
self.type=type | |
self.parent_id=parent_id | |
with open(filename, 'r') as f: | |
for line in f: | |
if line.startswith("##FASTA"): | |
break | |
if line.startswith('#'): | |
continue | |
# process features in gff format | |
fields = line.split("\t") | |
type = fields[2] | |
# count up how many of each type of feature we have | |
if counter.has_key(type): | |
counter[type] +=1 | |
else: | |
counter[type] = 1 | |
# don't need these, throw 'em out | |
if type=="supercontig": | |
continue | |
# parse out the name of the chromosomes | |
m = re.match("GeneDB\|(.*)",fields[0]) | |
if m: | |
seq = m.group(1) | |
else: | |
raise Exception('Unrecognized sequence format') | |
start = fields[3] | |
end = fields[4] | |
strand = fields[6] | |
# People always screw up the attributes field which is suppose to look like this: | |
# key1=value1;key2=value2;key3=value3a,value3b,value3c | |
attrs_field = fields[8] | |
# clean up attr_field: | |
# if there's a semi-colon that doesn't look like it belongs there, remove it. | |
if ";," in attrs_field: | |
attrs_field = attrs_field.replace(";,",",") | |
#print attrs_field | |
attributes = { k:v for (k,v) in [kvp.split("=") for kvp in attrs_field.split(";")] } | |
id = attributes['ID'] | |
name = attributes.get('locus_tag',attributes.get('Name',id)) | |
parent_id = attributes.get('Parent') | |
# grab all the features in case we want to do some post-processing | |
# features[id] = Feature(seq,strand,start,end,name,None,type, parent_id) | |
# here we're just interested in the "gene" features - not the redundant stuff | |
# like rRNA, snRNA, tRNA, transcript, mRNA, CDS, which all have a corresponding | |
# parent feature which is of type "gene". | |
# We also ignore exon features, 'cause our gene model is just a block with a start | |
# and end. | |
if type=="gene": | |
if ".tRNA" in name: | |
type = "trna" | |
if ".rRNA" in name: | |
type = "rrna" | |
if ".ncRNA" in name: | |
type = "ncrna" | |
if ".snRNA" in name: | |
type = "snrna" | |
if ".snoRNA" in name: | |
type = "snorna" | |
if | |
print "\t".join( [seq,strand,start,end,name,"",type] ) | |
# for key,value in counter.items(): | |
# print key + " => " + str(value) | |
# | |
# print "features: " + str(len(features)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment