cbare · April 13, 2012 22:11
diff --git a/extract_features.py b/extract_features.py
 ############################################################
 ## A hacky script to extract features from a GFF3 file    ##
 ## and output them in a format compatible with the        ##
 ## Gaggle genome browser.                                 ##
 ##                                                        ##
 ## J. Christopher Bare, March 2012                        ##
 ##                                                        ##
 ############################################################

 # I got the gene annotations in GFF format from here:
 # http://tritrypdb.org/common/downloads/Current_Release/Lbraziliensis/gff/
 # http://tritrypdb.org/common/downloads/Current_Release/Lmajor/gff/
 # http://tritrypdb.org/common/downloads/Current_Release/Linfantum/gff/

 # Description of the GFF format:
 # http://gmod.org/wiki/GFF

 import sys
 import re

 filename = sys.argv[1]

 counter = {}

 features = {}

 class Feature:
    def __init__(self,sequence,strand,start,end,name,common_name,type,parent_id):
        self.sequence=sequence
        self.strand=strand
        self.start=start
        self.end=end
        self.name=name
        self.common_name=common_name
        self.type=type
        self.parent_id=parent_id


 with open(filename, 'r') as f:
    for line in f:
        if line.startswith("##FASTA"):
            break
        if line.startswith('#'):
            continue
        
        # process features in gff format
        fields = line.split("\t")
        
        type = fields[2]
        
        # count up how many of each type of feature we have
        if counter.has_key(type):
            counter[type] +=1
        else:
            counter[type] = 1
        
        # don't need these, throw 'em out
        if type=="supercontig":
            continue
        
        # parse out the name of the chromosomes
        m = re.match("GeneDB\|(.*)",fields[0])
        if m:
            seq = m.group(1)
        else:
            raise Exception('Unrecognized sequence format')
        
        start = fields[3]
        end = fields[4]
        strand = fields[6]
        
        # People always screw up the attributes field which is suppose to look like this:
        # key1=value1;key2=value2;key3=value3a,value3b,value3c
        attrs_field = fields[8]
        
        # clean up attr_field:
        # if there's a semi-colon that doesn't look like it belongs there, remove it.
        if ";," in attrs_field:
            attrs_field = attrs_field.replace(";,",",")
        #print attrs_field
        
        attributes = { k:v for (k,v) in [kvp.split("=") for kvp in attrs_field.split(";")] }
        id = attributes['ID']
        name = attributes.get('locus_tag',attributes.get('Name',id))
        parent_id = attributes.get('Parent')
        
        # grab all the features in case we want to do some post-processing
        # features[id] = Feature(seq,strand,start,end,name,None,type, parent_id)
        
        # here we're just interested in the "gene" features - not the redundant stuff
        # like rRNA, snRNA, tRNA, transcript, mRNA, CDS, which all have a corresponding
        # parent feature which is of type "gene".
        # We also ignore exon features, 'cause our gene model is just a block with a start
        # and end.
        if type=="gene":
            if ".tRNA" in name:
                type = "trna"
            if ".rRNA" in name:
                type = "rrna"
            if ".ncRNA" in name:
                type = "ncrna"
            if ".snRNA" in name:
                type = "snrna"
            if ".snoRNA" in name:
                type = "snorna"
            if 
            print "\t".join( [seq,strand,start,end,name,"",type] )
            

 # for key,value in counter.items():
 #     print key + " => " + str(value)
 # 
 # print "features: " + str(len(features))
	############################################################
	## A hacky script to extract features from a GFF3 file ##
	## and output them in a format compatible with the ##
	## Gaggle genome browser. ##
	## ##
	## J. Christopher Bare, March 2012 ##
	## ##
	############################################################

	# I got the gene annotations in GFF format from here:
	# http://tritrypdb.org/common/downloads/Current_Release/Lbraziliensis/gff/
	# http://tritrypdb.org/common/downloads/Current_Release/Lmajor/gff/
	# http://tritrypdb.org/common/downloads/Current_Release/Linfantum/gff/

	# Description of the GFF format:
	# http://gmod.org/wiki/GFF

	import sys
	import re

	filename = sys.argv[1]

	counter = {}

	features = {}

	class Feature:
	def __init__(self,sequence,strand,start,end,name,common_name,type,parent_id):
	self.sequence=sequence
	self.strand=strand
	self.start=start
	self.end=end
	self.name=name
	self.common_name=common_name
	self.type=type
	self.parent_id=parent_id


	with open(filename, 'r') as f:
	for line in f:
	if line.startswith("##FASTA"):
	break
	if line.startswith('#'):
	continue

	# process features in gff format
	fields = line.split("\t")

	type = fields[2]

	# count up how many of each type of feature we have
	if counter.has_key(type):
	counter[type] +=1
	else:
	counter[type] = 1

	# don't need these, throw 'em out
	if type=="supercontig":
	continue

	# parse out the name of the chromosomes
	m = re.match("GeneDB\\|(.*)",fields[0])
	if m:
	seq = m.group(1)
	else:
	raise Exception('Unrecognized sequence format')

	start = fields[3]
	end = fields[4]
	strand = fields[6]

	# People always screw up the attributes field which is suppose to look like this:
	# key1=value1;key2=value2;key3=value3a,value3b,value3c
	attrs_field = fields[8]

	# clean up attr_field:
	# if there's a semi-colon that doesn't look like it belongs there, remove it.
	if ";," in attrs_field:
	attrs_field = attrs_field.replace(";,",",")
	#print attrs_field

	attributes = { k:v for (k,v) in [kvp.split("=") for kvp in attrs_field.split(";")] }
	id = attributes['ID']
	name = attributes.get('locus_tag',attributes.get('Name',id))
	parent_id = attributes.get('Parent')

	# grab all the features in case we want to do some post-processing
	# features[id] = Feature(seq,strand,start,end,name,None,type, parent_id)

	# here we're just interested in the "gene" features - not the redundant stuff
	# like rRNA, snRNA, tRNA, transcript, mRNA, CDS, which all have a corresponding
	# parent feature which is of type "gene".
	# We also ignore exon features, 'cause our gene model is just a block with a start
	# and end.
	if type=="gene":
	if ".tRNA" in name:
	type = "trna"
	if ".rRNA" in name:
	type = "rrna"
	if ".ncRNA" in name:
	type = "ncrna"
	if ".snRNA" in name:
	type = "snrna"
	if ".snoRNA" in name:
	type = "snorna"
	if
	print "\t".join( [seq,strand,start,end,name,"",type] )


	# for key,value in counter.items():
	# print key + " => " + str(value)
	#
	# print "features: " + str(len(features))