Run this notebook from the directory that contains the EMBL files that will be submitted to the Otting Lab for IPD submission. The output file is named 'IPD_submission3_TIMESTAMP.csv', where TIMESTAMP is the date and time of the run (YYYYMMDD_HHMMSS), which makes the file name unique. To use a different file name prefix, set the outputName variable in the next cell.
The only required input is a tab-delimited file with the representative animal Identifiers and Comments, formatted as:
Working genomic allele name IPD Accession No. Representative Animal BLAST comments
>Mamu-B11L*01:04:01:01 NHP02117 MD103 7 identical fosmids (Rh22777)
CTCCCCGGACGCCTAGGATGGGGTCATGGCGCCTCGAGCCCTCCTCCTGCTGCTCTCGGGGGCCCTGGCCCTGACCGAGACCTGGG
Enter the name of this file in the next cell as the value of the animalID_file variable.
To run the notebook, run the next two cells, or select 'Run All Cells' from the menu above.
If the notebook finishes with no errors, 'job done!' is printed below the last cell.
Code
# Path to the animal ID file (required): one header line, then one
# '>'-prefixed record per allele. An empty value stops the run with a message.
animalID_file = '22943_MESmerizer_MHC-I_IPD_cDNA-Identical_renamed_Mamu_gDNA_alleles_9Sep19.txt'
# Directory searched for *.embl files. Leave empty to use the current working
# directory (recommended).
dirname = ''
# Prefix of the output CSV file name. Leave empty to use 'IPD_submission3_'.
outputName = ''
import csv
import datetime
import glob
import os
import sys
import time

from Bio import SeqIO
def parseAnimalIDFile(f):
    """Parse the animal ID file into a list of per-allele records.

    The first line of the file is treated as a column header and skipped.
    Every following line that starts with '>' is a record; all other lines
    (for example sequence lines or blank lines) are ignored.

    A record is split on tabs when it contains a tab, otherwise on commas,
    so both tab-delimited and comma-delimited files are accepted.

    Args:
        f: Path of the animal ID file.

    Returns:
        A list of ``(allele_name, remaining_fields)`` tuples, where
        ``remaining_fields`` is the list of columns after the allele name,
        or ``False`` if a record has no column besides the allele name.
    """
    entries = []
    with open(f, 'r') as fOpen:
        # Skip the header line; the default keeps an empty file from raising.
        next(fOpen, None)
        for line in fOpen:
            line = line.rstrip('\r\n')
            # startswith() is safe on blank lines, unlike indexing line[0].
            if not line.startswith('>'):
                continue
            record = line[1:]
            # Prefer tabs so that commas inside the comments column are
            # kept as part of the comment rather than splitting the record.
            delimiter = '\t' if '\t' in record else ','
            fields = record.split(delimiter)
            if len(fields) < 2:
                print('Error with file formatting for animalID_file!')
                return False
            entries.append((fields[0], fields[1:]))
    return entries
def createCSVfile(animalID_file, dirname, outputName):
    """Write the IPD submission CSV for every EMBL file in a directory.

    One row is written per ``*.embl`` file. The allele name is the part of
    the file name before the first '.', and it is used to look up the
    accession number, animal ID and comments parsed from ``animalID_file``.
    The sequence is taken from the last record of the EMBL file.

    Args:
        animalID_file: Path of the animal ID file read by parseAnimalIDFile.
        dirname: Directory searched for ``*.embl`` files; when empty, the
            current working directory is used.
        outputName: Prefix of the output file name; when empty,
            'IPD_submission3_' is used. A timestamp and '.csv' are appended.
            The output file is created relative to the current working
            directory unless the prefix itself contains a path.

    Returns:
        True after the CSV file has been written, False if the animal ID
        file could not be parsed or contained no records.
    """
    if not outputName:
        outputName = 'IPD_submission3_'
    # Timestamp the file name so that every run writes to a new, unique file.
    t_stamp_string = datetime.datetime.fromtimestamp(time.time()).strftime('%Y%m%d_%H%M%S')
    outputFileName = outputName + t_stamp_string + '.csv'

    # import animalID, IPD accession, and comments
    animalID_list = parseAnimalIDFile(animalID_file)
    if not animalID_list:
        return False

    # Index the records by allele name. setdefault keeps the first record
    # for a duplicated name, as a first-match search of the list would.
    fieldsByAllele = {}
    for alleleName, alleleFields in animalID_list:
        fieldsByAllele.setdefault(alleleName, alleleFields)

    headerString = 'Section,Submittor ID,submission number,local name,Sequence type:,Accession Number(s),Release date,status,Current Non-human Primate species,sequence ,Cell/Animal ID/Code:,Material Available:,Primary Sequencing,Secondary Sequencing,Types of PCR primers:,Sequenced in isolation,Comments'

    # get a list of the EMBL files in the directory
    # To use a different directory (NOT RECOMMENDED), pass a value for dirname.
    if not dirname:
        dirname = os.getcwd()
    # glob returns files in arbitrary order; sort so that the submission
    # numbers are reproducible between runs.
    files = sorted(glob.glob(os.path.join(dirname, '*.embl')))
    if not files:
        print('Warning! no EMBL files found in ' + str(dirname) + '!')

    # The file is opened in the default text mode with '\n' as the csv line
    # terminator so that line endings match what plain write() calls produce.
    with open(outputFileName, 'a') as fWrite:
        fWrite.write(headerString + '\n')
        # csv.writer quotes a field only when it contains a comma, quote or
        # newline, so such values no longer shift the columns of a row.
        writer = csv.writer(fWrite, lineterminator='\n')
        for ct, emblPath in enumerate(files, start=1):
            # extract the allele name for use in the output file and matching to the animalID_list
            alleleName = os.path.basename(emblPath).split('.')[0]
            matchedFields = fieldsByAllele.get(alleleName)
            if matchedFields is None:
                print('Warning! no matching animalID found from animalID_list for ' + str(alleleName) + '!')
                matchedFields = []
            # Pad to three fields so a short record cannot raise IndexError.
            accession, animalID, comments = (list(matchedFields) + ['', '', ''])[:3]

            # import the EMBL flatfile, then parse out the sequence from it
            # (the last record is used when the file holds more than one)
            seq_record = list(SeqIO.parse(emblPath, "embl"))
            seq_string = str(seq_record[-1].seq)

            # create the csv line for the allele, then write it to the output file
            writer.writerow([
                'Non-human Primates(NHP)',
                '10560G23',
                ct,
                alleleName,
                'Full Length genomic CDS',
                accession,
                'As soon as possible',
                'unpublished',
                'Rhesus macaque (Mamu)',
                seq_string,
                animalID,
                'No Material Available',
                'Illumina NovaSeq 6000',
                '',
                'Locus specific',
                'yes',
                comments,
                '',  # keeps the trailing comma that ends every data row
            ])
    return True
# Entry point: build the submission CSV only when an animal ID file was given.
if animalID_file:
    r = createCSVfile(animalID_file, dirname, outputName)
    # createCSVfile returns True once the CSV has been written.
    if r:
        print('job done!')
else:
    print('Please enter an animal ID before proceeding.')