coleoguy · January 3, 2016 22:09
diff --git a/retro-EEfinder.py b/retro-EEfinder.py
 from Bio import SeqIO  # to deal with the fast files

 # lets pull in our exon table
 datafile = open('exons.csv', 'r')
 data = []
 for row in datafile:
    data.append(row.strip().split(','))

 # now lets start the process of creating all of our exon-exon sequences
 # by first creating a list that has the name we want to assign and the LG and 
 # coordinates for it we will store this data in variable called targets
 targets = []
 for i in range(0, len(data)-1):
 	currentTC = data[i][3]
 	nextTC = data[i+1][3]
 	if currentTC == nextTC:
 		fooA = int(data[i][1])
 		fooB = int(data[i][2])
 		fooC = int(data[i+1][1])
 		fooD = int(data[i+1][2])
 		if fooB-fooA>29 and fooD-fooC>29:
 			line = []
 			B=int(data[i][2])
 			A=B-30
 			C=int(data[i+1][1])-1
 			D=C+30
 			names = data[i][0] + "|" + data[i+1][3] + "|" + str(A) + ":" + str(B) + "|" + str(C) + ":" + str(D)			
 			line.append(names)
 			line.append(A)
 			line.append(B)
 			line.append(C)
 			line.append(D)
 			targets.append(line)
 # at this point we have a list of lists in the variable "targets"
 # each element is an exon-exon junction 

 # lets check it
 #print targets[0]

 # no need to create this file each time as i work so I will comment out for now
 import csv
 with open('targets.csv', 'wb') as f:
 	wr = csv.writer(f)
 	wr.writerow(targets)

 # testing the interaction of this file with the chromosome files that I have is a bit odd
 # I am seeing lots of rows that are solid Ns not something that I believe is correct
 # I am going to try and download everything from ensembl and see if this fixes the problem
 # I believe that I currently may be using ensembl's gff3 with beetlebases chromosome files
 # I would like to make sure that I am using all ensembl data as well because beetle base 
 # keeps being unaccessible for long periods of time (about a week currently)

 # now we can start building a final list: we want to have the first element
 # be the name ie "targets[i][0]" and the second element the DNA sequence it specifies

 def format_fasta(name, sequence):
    fasta_string = '>' + name + "\n" + sequence + "\n"
    return fasta_string
    
 fastaLines = []
 LGHandle = 0

 for i in range(0, len(targets)):     #len(targets)
 	currentLG = targets[i][0][0:5]
 	if LGHandle != currentLG:   # the next 8 lines just deal with the fact that file
 		fullname = currentLG    # names are a bit wonky the important line is 81 where
 		if currentLG == 'ChLG1': # where we open the appropriate fasta file
 			fullname = 'ChLG10'
 		if currentLG == 'Unkno':
 			fullname = 'Unknown'
 		LGHandle = currentLG
 		fileName = './chromosomes/' + fullname + '-2.fa'
 		record = SeqIO.read(open(fileName), "fasta")
 	A=targets[i][1]
 	B=targets[i][2]
 	C=targets[i][3]
 	D=targets[i][4]
 	fastaLines.append(format_fasta(targets[i][0], str(record.seq[A:B]) + str(record.seq[C:D])))
 	FASTA = ''.join(fastaLines)

 text_file = open("EEJunctions.fa", "w")
 text_file.write(FASTA)
 text_file.close()
	from Bio import SeqIO # to deal with the fast files

	# lets pull in our exon table
	datafile = open('exons.csv', 'r')
	data = []
	for row in datafile:
	data.append(row.strip().split(','))

	# now lets start the process of creating all of our exon-exon sequences
	# by first creating a list that has the name we want to assign and the LG and
	# coordinates for it we will store this data in variable called targets
	targets = []
	for i in range(0, len(data)-1):
	currentTC = data[i][3]
	nextTC = data[i+1][3]
	if currentTC == nextTC:
	fooA = int(data[i][1])
	fooB = int(data[i][2])
	fooC = int(data[i+1][1])
	fooD = int(data[i+1][2])
	if fooB-fooA>29 and fooD-fooC>29:
	line = []
	B=int(data[i][2])
	A=B-30
	C=int(data[i+1][1])-1
	D=C+30
	names = data[i][0] + "\|" + data[i+1][3] + "\|" + str(A) + ":" + str(B) + "\|" + str(C) + ":" + str(D)
	line.append(names)
	line.append(A)
	line.append(B)
	line.append(C)
	line.append(D)
	targets.append(line)
	# at this point we have a list of lists in the variable "targets"
	# each element is an exon-exon junction

	# lets check it
	#print targets[0]

	# no need to create this file each time as i work so I will comment out for now
	import csv
	with open('targets.csv', 'wb') as f:
	wr = csv.writer(f)
	wr.writerow(targets)

	# testing the interaction of this file with the chromosome files that I have is a bit odd
	# I am seeing lots of rows that are solid Ns not something that I believe is correct
	# I am going to try and download everything from ensembl and see if this fixes the problem
	# I believe that I currently may be using ensembl's gff3 with beetlebases chromosome files
	# I would like to make sure that I am using all ensembl data as well because beetle base
	# keeps being unaccessible for long periods of time (about a week currently)

	# now we can start building a final list: we want to have the first element
	# be the name ie "targets[i][0]" and the second element the DNA sequence it specifies

	def format_fasta(name, sequence):
	fasta_string = '>' + name + "\n" + sequence + "\n"
	return fasta_string

	fastaLines = []
	LGHandle = 0

	for i in range(0, len(targets)): #len(targets)
	currentLG = targets[i][0][0:5]
	if LGHandle != currentLG: # the next 8 lines just deal with the fact that file
	fullname = currentLG # names are a bit wonky the important line is 81 where
	if currentLG == 'ChLG1': # where we open the appropriate fasta file
	fullname = 'ChLG10'
	if currentLG == 'Unkno':
	fullname = 'Unknown'
	LGHandle = currentLG
	fileName = './chromosomes/' + fullname + '-2.fa'
	record = SeqIO.read(open(fileName), "fasta")
	A=targets[i][1]
	B=targets[i][2]
	C=targets[i][3]
	D=targets[i][4]
	fastaLines.append(format_fasta(targets[i][0], str(record.seq[A:B]) + str(record.seq[C:D])))
	FASTA = ''.join(fastaLines)

	text_file = open("EEJunctions.fa", "w")
	text_file.write(FASTA)
	text_file.close()