coleoguy · January 3, 2016 20:29
diff --git a/retro-gff3parser.py b/retro-gff3parser.py
 # the gff3 file that was used as input for this script was downloaded from:
 # ftp://ftp.ensemblgenomes.org/pub/metazoa/release-21/gff3/tribolium_castaneum
 # on January 10, 2014

 import re
 ## This script is just to parse out the part of the gff file that we
 ## are interested in, (i.e. lines describing exons or protein coding genes


 ## Here we specify the name of the gff3 
 ## file that we would like to parse
        #I would like to have the command line allow you to enter a file name
        #DataIn = input('Enter your file name: ')     
        #right now you have to quote your file name?
 DataIn = "./chromosomes/t.cast.gff3"
 gff = open(DataIn)
 lines = gff.readlines()


 ## now we create a list of multiple exon genes
 ## the variable exonLines will hold all lines 
 ## with the feature name exon
 exonLines = []
 for i in range(0, len(lines)):
 	foo = lines[i].split()    
 	tempLines = []      
 	if len(foo) > 2 and foo[2] == 'exon':
 		tempLines.append(foo[0])
 		tempLines.append(foo[3])
 		tempLines.append(foo[4])
 		bar = foo[8].split(':')
 		tempLines.append(bar[1][0:8])
 		exonLines.append(tempLines) 
 		
 		
 ## now we just want to create a list of TC numbers
 ## we will use this to get a list of all TCs that 
 ## have more than a single exon		
 tcList = []
 for i in range(0, len(exonLines)):
 	tcList.append(exonLines[i][3])
 from collections import Counter
 foo = Counter(tcList)
 dictTC = {k:v for (k,v) in foo.items() if v > 1}  # this is the line that pulls the > 1 exons
 multiexon = list(dictTC)

 ## now lets create a final list that has just the info
 ## we care about for the multi exon genes
 multiExonLines = []
 for i in range(0, len(exonLines)):
 	if exonLines[i][3] in multiexon:
 		multiExonLines.append(exonLines[i])

 print multiExonLines[1]
 print multiExonLines[2]


 import csv
 with open('exons.csv', 'wb') as f:
 	wr = csv.writer(f)
 	wr.writerow(multiExonLines)
	# the gff3 file that was used as input for this script was downloaded from:
	# ftp://ftp.ensemblgenomes.org/pub/metazoa/release-21/gff3/tribolium_castaneum
	# on January 10, 2014

	import re
	## This script is just to parse out the part of the gff file that we
	## are interested in, (i.e. lines describing exons or protein coding genes


	## Here we specify the name of the gff3
	## file that we would like to parse
	#I would like to have the command line allow you to enter a file name
	#DataIn = input('Enter your file name: ')
	#right now you have to quote your file name?
	DataIn = "./chromosomes/t.cast.gff3"
	gff = open(DataIn)
	lines = gff.readlines()


	## now we create a list of multiple exon genes
	## the variable exonLines will hold all lines
	## with the feature name exon
	exonLines = []
	for i in range(0, len(lines)):
	foo = lines[i].split()
	tempLines = []
	if len(foo) > 2 and foo[2] == 'exon':
	tempLines.append(foo[0])
	tempLines.append(foo[3])
	tempLines.append(foo[4])
	bar = foo[8].split(':')
	tempLines.append(bar[1][0:8])
	exonLines.append(tempLines)


	## now we just want to create a list of TC numbers
	## we will use this to get a list of all TCs that
	## have more than a single exon
	tcList = []
	for i in range(0, len(exonLines)):
	tcList.append(exonLines[i][3])
	from collections import Counter
	foo = Counter(tcList)
	dictTC = {k:v for (k,v) in foo.items() if v > 1} # this is the line that pulls the > 1 exons
	multiexon = list(dictTC)

	## now lets create a final list that has just the info
	## we care about for the multi exon genes
	multiExonLines = []
	for i in range(0, len(exonLines)):
	if exonLines[i][3] in multiexon:
	multiExonLines.append(exonLines[i])

	print multiExonLines[1]
	print multiExonLines[2]


	import csv
	with open('exons.csv', 'wb') as f:
	wr = csv.writer(f)
	wr.writerow(multiExonLines)