Created
November 29, 2012 17:06
-
-
Save radaniba/4170428 to your computer and use it in GitHub Desktop.
Obtaining the actual sequence making up a gene from NCBI is simple using a browser, but not so much when wanting to do it in batch. This script obtains the nucleotide sequences, mRNA's, CDS's and protein sequences associated with a list of gene IDs.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from Bio.SeqRecord import SeqRecord | |
| from Bio.Seq import Seq | |
| from Bio import Entrez | |
| from Bio import SeqIO | |
| import time | |
| # Obtain your gene IDs from somewhere (file, past text directly, etc.) | |
| ids = [] | |
| ids = ",".join( lines ) | |
| # From Biopython: Always tell Entrez qho you are! | |
| Entrez.email = "my_mail@world.com" | |
| handle = Entrez.efetch(db="gene", id=ids, retmode="xml") | |
| #Save this query to file, so we don't have to download it again. | |
| out = open( "output.xml", "w") | |
| out.write(handle.read()) | |
| out.close() | |
| handle.close() | |
| #Parse the XML file into gene records | |
| records = Entrez.read( open("output.xml","r") ) | |
| #This dictionary maps verbose strand identification to the argument that efetch needs | |
| strand = { 'plus' : 1, 'minus' : 2 } | |
| #This will contain the names of genes and organism, indexed by accession number. | |
| #Used for the ones that don't produce named proteins. | |
| names = {} | |
| #List of accession numbers | |
| accs = [] | |
| #List of starting nucleotide to query | |
| frmL = [] | |
| #List of ending nucleotide to query | |
| toL = [] | |
| #List of strand (plus or minus) to query | |
| sndL = [] | |
| #Iterate through the records (the genes) | |
| for r in records : | |
| for loc in r['Entrezgene_locus'] : | |
| #Get the accession number, version, organism name, gene name, | |
| if( loc.has_key( 'Gene-commentary_accession' ) ) : | |
| acc = loc['Gene-commentary_accession'] | |
| ver = loc['Gene-commentary_version'] | |
| org = r['Entrezgene_source']['BioSource']['BioSource_org']['Org-ref']['Org-ref_taxname'] | |
| org = "_".join( org.split(" ") ) | |
| gen = r['Entrezgene_gene']['Gene-ref']['Gene-ref_locus'] | |
| #The sequence's interval to obtain | |
| sin = loc['Gene-commentary_seqs'][0]['Seq-loc_int']['Seq-interval'] | |
| names[ acc + "." + ver ] = { 'gene' : gen, 'species' : org } | |
| #Store the information for later querying | |
| frmL.append( sin['Seq-interval_from'] ) | |
| toL.append( sin['Seq-interval_to'] ) | |
| sndL.append( strand[ sin['Seq-interval_strand']['Na-strand'].attributes['value'] ] ) | |
| accs.append( acc + "." + ver ) | |
| else : | |
| #If it was not found, print an information message so we can check by hand what went wrong. | |
| print "****", gen + "_" + org, ":", loc | |
| #Save the sequences to GenBank format | |
| out = open( "seqs.gb", "w" ) | |
| #Make sure we don't have too many sequences! Entrez dislikes people downloading more than 100 seqs in a day. | |
| if( len(accs) > 100 ): | |
| print "WARNING: Downloading more than 100 sequences. Please make sure you don't violate Entrez limits! | |
| time.sleep(10) | |
| for i in range( len(accs) ) : | |
| print "Retrieving", accs[i] | |
| handle = Entrez.efetch(db="nucleotide", id=accs[i], seq_start=frmL[i], seq_stop=toL[i], strand=sndL[i], retmode="gbwithparts") | |
| #These queries can be very large, so dump them line by line. | |
| for l in handle : | |
| out.write(l) | |
| handle.close() | |
| #Respect Entrez limit of 3 queries a second (actually we are being generous with one a second) | |
| time.sleep(1) | |
| out.close() | |
| #These lists will contain the mRNA, CDS and protein data extracted from the genes. | |
| lstRNA = [] | |
| lstCDS = [] | |
| lstPRT = [] | |
| #Parse or newly downloaded sequences. | |
| rs = SeqIO.parse( "seqs.gb", "gb" ) | |
| for r in rs : | |
| #In case of duplicity, isoforms and the like, we will append these indices to the IDs | |
| i = 1 | |
| j = 1 | |
| #This should never happen, but if the data downloaded has references to other data, we will simply ignore the error! | |
| if( r.annotations.has_key( 'contig' ) ): | |
| print "****", r.annotations['contig'] | |
| #mRNA and CDS info is stored in the features. | |
| for ft in r.features : | |
| #Sometimes the data is not clean and attempting to translate or transcribe it yields non-sense or mis-sense errors. | |
| #Simply refuse to process this data, but print an error so we can check it manually | |
| if ft.qualifiers.has_key('exception') : | |
| print "****", ft.qualifiers['gene'], r.annotations['organism'], ft.qualifiers['exception'] | |
| continue | |
| if ft.type == "mRNA" : | |
| s = ft.extract( r ) | |
| #For annotation purposes, add as much info as is available to the record. | |
| if( ft.qualifiers.has_key('product') ) : | |
| s.description = str( ft.qualifiers['product'] ) | |
| if( ft.qualifiers.has_key('note') ) : | |
| s.description += " " + str( ft.qualifiers['note'] ) | |
| #Build an ID using <gene_name>_<species>_<i> whenever this info is available. | |
| #If not, use the info obtained before in 'names'. | |
| if( ft.qualifiers.has_key('gene') ) : | |
| s.id = ft.qualifiers['gene'][0] + "_" + names[r.id]['species'] + "_" + str(i) | |
| else : | |
| s.id = "_".join(names[r.id].values()) + "_mRNA" + str(i) | |
| i += 1 | |
| #Append the record to the mRNA list. | |
| lstRNA.append( s ) | |
| if ft.type == "CDS" : | |
| s = ft.extract( r ) | |
| #For annotation purposes, add as much info as is available to the record. | |
| if( ft.qualifiers.has_key('product') ) : | |
| s.description = str( ft.qualifiers['product'] ) | |
| if( ft.qualifiers.has_key('note') ) : | |
| s.description += " " + str( ft.qualifiers['note'] ) | |
| #Build an ID using <gene_name>_<species>_<i> whenever this info is available. | |
| #If not, use the info obtained before in 'names'. | |
| if( ft.qualifiers.has_key('gene') ) : | |
| sid = ft.qualifiers['gene'][0] + "_" + names[r.id]['species'] + "_" + str(j) | |
| else : | |
| sid = "_".join(names[r.id].values()) | |
| s.id = sid + "_CDS" + str(j) | |
| #Append the record to the CDS list. | |
| lstCDS.append( s ) | |
| #Append the translated record to the protein list. | |
| lstPRT.append( SeqRecord( s.seq.translate(to_stop=True), id=sid + "_" + str(j), name=s.name, description=s.description ) ) | |
| j += 1 | |
| #Finally, write out the sequences to file | |
| SeqIO.write( lstRNA, "mRNA.fasta", "fasta" ) | |
| SeqIO.write( lstCDS, "CDS.fasta", "fasta" ) | |
| SeqIO.write( lstPRT, "protein.fasta", "fasta" ) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment