Skip to content

Instantly share code, notes, and snippets.

@radaniba
Created November 29, 2012 17:06
Show Gist options
  • Select an option

  • Save radaniba/4170428 to your computer and use it in GitHub Desktop.

Select an option

Save radaniba/4170428 to your computer and use it in GitHub Desktop.
Obtaining the actual sequence making up a gene from NCBI is simple using a browser, but not so much when wanting to do it in batch. This script obtains the nucleotide sequences, mRNA's, CDS's and protein sequences associated with a list of gene IDs.
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from Bio import Entrez
from Bio import SeqIO
import time
# Obtain your gene IDs from somewhere (file, past text directly, etc.)
ids = []
ids = ",".join( lines )
# From Biopython: Always tell Entrez qho you are!
Entrez.email = "my_mail@world.com"
handle = Entrez.efetch(db="gene", id=ids, retmode="xml")
#Save this query to file, so we don't have to download it again.
out = open( "output.xml", "w")
out.write(handle.read())
out.close()
handle.close()
#Parse the XML file into gene records
records = Entrez.read( open("output.xml","r") )
#This dictionary maps verbose strand identification to the argument that efetch needs
strand = { 'plus' : 1, 'minus' : 2 }
#This will contain the names of genes and organism, indexed by accession number.
#Used for the ones that don't produce named proteins.
names = {}
#List of accession numbers
accs = []
#List of starting nucleotide to query
frmL = []
#List of ending nucleotide to query
toL = []
#List of strand (plus or minus) to query
sndL = []
#Iterate through the records (the genes)
for r in records :
for loc in r['Entrezgene_locus'] :
#Get the accession number, version, organism name, gene name,
if( loc.has_key( 'Gene-commentary_accession' ) ) :
acc = loc['Gene-commentary_accession']
ver = loc['Gene-commentary_version']
org = r['Entrezgene_source']['BioSource']['BioSource_org']['Org-ref']['Org-ref_taxname']
org = "_".join( org.split(" ") )
gen = r['Entrezgene_gene']['Gene-ref']['Gene-ref_locus']
#The sequence's interval to obtain
sin = loc['Gene-commentary_seqs'][0]['Seq-loc_int']['Seq-interval']
names[ acc + "." + ver ] = { 'gene' : gen, 'species' : org }
#Store the information for later querying
frmL.append( sin['Seq-interval_from'] )
toL.append( sin['Seq-interval_to'] )
sndL.append( strand[ sin['Seq-interval_strand']['Na-strand'].attributes['value'] ] )
accs.append( acc + "." + ver )
else :
#If it was not found, print an information message so we can check by hand what went wrong.
print "****", gen + "_" + org, ":", loc
#Save the sequences to GenBank format
out = open( "seqs.gb", "w" )
#Make sure we don't have too many sequences! Entrez dislikes people downloading more than 100 seqs in a day.
if( len(accs) > 100 ):
print "WARNING: Downloading more than 100 sequences. Please make sure you don't violate Entrez limits!
time.sleep(10)
for i in range( len(accs) ) :
print "Retrieving", accs[i]
handle = Entrez.efetch(db="nucleotide", id=accs[i], seq_start=frmL[i], seq_stop=toL[i], strand=sndL[i], retmode="gbwithparts")
#These queries can be very large, so dump them line by line.
for l in handle :
out.write(l)
handle.close()
#Respect Entrez limit of 3 queries a second (actually we are being generous with one a second)
time.sleep(1)
out.close()
#These lists will contain the mRNA, CDS and protein data extracted from the genes.
lstRNA = []
lstCDS = []
lstPRT = []
#Parse or newly downloaded sequences.
rs = SeqIO.parse( "seqs.gb", "gb" )
for r in rs :
#In case of duplicity, isoforms and the like, we will append these indices to the IDs
i = 1
j = 1
#This should never happen, but if the data downloaded has references to other data, we will simply ignore the error!
if( r.annotations.has_key( 'contig' ) ):
print "****", r.annotations['contig']
#mRNA and CDS info is stored in the features.
for ft in r.features :
#Sometimes the data is not clean and attempting to translate or transcribe it yields non-sense or mis-sense errors.
#Simply refuse to process this data, but print an error so we can check it manually
if ft.qualifiers.has_key('exception') :
print "****", ft.qualifiers['gene'], r.annotations['organism'], ft.qualifiers['exception']
continue
if ft.type == "mRNA" :
s = ft.extract( r )
#For annotation purposes, add as much info as is available to the record.
if( ft.qualifiers.has_key('product') ) :
s.description = str( ft.qualifiers['product'] )
if( ft.qualifiers.has_key('note') ) :
s.description += " " + str( ft.qualifiers['note'] )
#Build an ID using <gene_name>_<species>_<i> whenever this info is available.
#If not, use the info obtained before in 'names'.
if( ft.qualifiers.has_key('gene') ) :
s.id = ft.qualifiers['gene'][0] + "_" + names[r.id]['species'] + "_" + str(i)
else :
s.id = "_".join(names[r.id].values()) + "_mRNA" + str(i)
i += 1
#Append the record to the mRNA list.
lstRNA.append( s )
if ft.type == "CDS" :
s = ft.extract( r )
#For annotation purposes, add as much info as is available to the record.
if( ft.qualifiers.has_key('product') ) :
s.description = str( ft.qualifiers['product'] )
if( ft.qualifiers.has_key('note') ) :
s.description += " " + str( ft.qualifiers['note'] )
#Build an ID using <gene_name>_<species>_<i> whenever this info is available.
#If not, use the info obtained before in 'names'.
if( ft.qualifiers.has_key('gene') ) :
sid = ft.qualifiers['gene'][0] + "_" + names[r.id]['species'] + "_" + str(j)
else :
sid = "_".join(names[r.id].values())
s.id = sid + "_CDS" + str(j)
#Append the record to the CDS list.
lstCDS.append( s )
#Append the translated record to the protein list.
lstPRT.append( SeqRecord( s.seq.translate(to_stop=True), id=sid + "_" + str(j), name=s.name, description=s.description ) )
j += 1
#Finally, write out the sequences to file
SeqIO.write( lstRNA, "mRNA.fasta", "fasta" )
SeqIO.write( lstCDS, "CDS.fasta", "fasta" )
SeqIO.write( lstPRT, "protein.fasta", "fasta" )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment