Last active
January 3, 2016 22:09
-
-
Save coleoguy/8526218 to your computer and use it in GitHub Desktop.
This python script takes a list of exons from multiple exon genes as well as fast files for each chromosome in a genome and it constructs a fasta file where each sequence is 60bp in length (last 30bp of one exon and the first 30bp of the next). These can then be used to search the genome for retroduplication events of genes. By limiting our selv…
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from Bio import SeqIO # to deal with the fast files | |
# lets pull in our exon table | |
datafile = open('exons.csv', 'r') | |
data = [] | |
for row in datafile: | |
data.append(row.strip().split(',')) | |
# now lets start the process of creating all of our exon-exon sequences | |
# by first creating a list that has the name we want to assign and the LG and | |
# coordinates for it we will store this data in variable called targets | |
targets = [] | |
for i in range(0, len(data)-1): | |
currentTC = data[i][3] | |
nextTC = data[i+1][3] | |
if currentTC == nextTC: | |
fooA = int(data[i][1]) | |
fooB = int(data[i][2]) | |
fooC = int(data[i+1][1]) | |
fooD = int(data[i+1][2]) | |
if fooB-fooA>29 and fooD-fooC>29: | |
line = [] | |
B=int(data[i][2]) | |
A=B-30 | |
C=int(data[i+1][1])-1 | |
D=C+30 | |
names = data[i][0] + "|" + data[i+1][3] + "|" + str(A) + ":" + str(B) + "|" + str(C) + ":" + str(D) | |
line.append(names) | |
line.append(A) | |
line.append(B) | |
line.append(C) | |
line.append(D) | |
targets.append(line) | |
# at this point we have a list of lists in the variable "targets" | |
# each element is an exon-exon junction | |
# lets check it | |
#print targets[0] | |
# no need to create this file each time as i work so I will comment out for now | |
import csv | |
with open('targets.csv', 'wb') as f: | |
wr = csv.writer(f) | |
wr.writerow(targets) | |
# testing the interaction of this file with the chromosome files that I have is a bit odd | |
# I am seeing lots of rows that are solid Ns not something that I believe is correct | |
# I am going to try and download everything from ensembl and see if this fixes the problem | |
# I believe that I currently may be using ensembl's gff3 with beetlebases chromosome files | |
# I would like to make sure that I am using all ensembl data as well because beetle base | |
# keeps being unaccessible for long periods of time (about a week currently) | |
# now we can start building a final list: we want to have the first element | |
# be the name ie "targets[i][0]" and the second element the DNA sequence it specifies | |
def format_fasta(name, sequence): | |
fasta_string = '>' + name + "\n" + sequence + "\n" | |
return fasta_string | |
fastaLines = [] | |
LGHandle = 0 | |
for i in range(0, len(targets)): #len(targets) | |
currentLG = targets[i][0][0:5] | |
if LGHandle != currentLG: # the next 8 lines just deal with the fact that file | |
fullname = currentLG # names are a bit wonky the important line is 81 where | |
if currentLG == 'ChLG1': # where we open the appropriate fasta file | |
fullname = 'ChLG10' | |
if currentLG == 'Unkno': | |
fullname = 'Unknown' | |
LGHandle = currentLG | |
fileName = './chromosomes/' + fullname + '-2.fa' | |
record = SeqIO.read(open(fileName), "fasta") | |
A=targets[i][1] | |
B=targets[i][2] | |
C=targets[i][3] | |
D=targets[i][4] | |
fastaLines.append(format_fasta(targets[i][0], str(record.seq[A:B]) + str(record.seq[C:D]))) | |
FASTA = ''.join(fastaLines) | |
text_file = open("EEJunctions.fa", "w") | |
text_file.write(FASTA) | |
text_file.close() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment