radaniba · November 29, 2012 17:06
diff --git a/getnucseq.py b/getnucseq.py
 from Bio.SeqRecord import SeqRecord
 from Bio.Seq import Seq
 from Bio import Entrez
 from Bio import SeqIO
 import time

 # Obtain your gene IDs from somewhere (file, past text directly, etc.)
 ids = []
 ids = ",".join( lines )

 # From Biopython: Always tell Entrez qho you are!
 Entrez.email = "my_mail@world.com"
 handle = Entrez.efetch(db="gene", id=ids, retmode="xml")

 #Save this query to file, so we don't have to download it again.
 out = open( "output.xml", "w")
 out.write(handle.read())
 out.close()
 handle.close()

 #Parse the XML file into gene records
 records = Entrez.read( open("output.xml","r") )

 #This dictionary maps verbose strand identification to the argument that efetch needs
 strand = { 'plus' : 1, 'minus' : 2 }
 #This will contain the names of genes and organism, indexed by accession number.
 #Used for the ones that don't produce named proteins.
 names = {}
 #List of accession numbers
 accs = []
 #List of starting nucleotide to query
 frmL = []
 #List of ending nucleotide to query
 toL  = []
 #List of strand (plus or minus) to query
 sndL = []

 #Iterate through the records (the genes)
 for r in records :
  for loc in r['Entrezgene_locus'] :
    #Get the accession number, version, organism name, gene name, 
    if( loc.has_key( 'Gene-commentary_accession' ) ) :
      acc = loc['Gene-commentary_accession']
      ver = loc['Gene-commentary_version']
      org = r['Entrezgene_source']['BioSource']['BioSource_org']['Org-ref']['Org-ref_taxname']
      org = "_".join( org.split(" ") )
      gen = r['Entrezgene_gene']['Gene-ref']['Gene-ref_locus']
      #The sequence's interval to obtain
      sin = loc['Gene-commentary_seqs'][0]['Seq-loc_int']['Seq-interval']
      names[ acc + "." + ver ] = { 'gene' : gen, 'species' : org }
      #Store the information for later querying
      frmL.append( sin['Seq-interval_from'] )
      toL.append( sin['Seq-interval_to'] )
      sndL.append( strand[ sin['Seq-interval_strand']['Na-strand'].attributes['value'] ] )
      accs.append( acc + "." + ver )
    else :
      #If it was not found, print an information message so we can check by hand what went wrong.
      print "****", gen + "_" + org, ":", loc

 #Save the sequences to GenBank format
 out = open( "seqs.gb", "w" )

 #Make sure we don't have too many sequences! Entrez dislikes people downloading more than 100 seqs in a day.
 if( len(accs) > 100 ):
  print "WARNING: Downloading more than 100 sequences. Please make sure you don't violate Entrez limits!
  time.sleep(10)

 for i in range( len(accs) ) :
  print "Retrieving", accs[i]
  handle = Entrez.efetch(db="nucleotide", id=accs[i], seq_start=frmL[i], seq_stop=toL[i], strand=sndL[i], retmode="gbwithparts")
  #These queries can be very large, so dump them line by line.
  for l in handle :
    out.write(l)
  handle.close()
  #Respect Entrez limit of 3 queries a second (actually we are being generous with one a second)
  time.sleep(1)

 out.close()

 #These lists will contain the mRNA, CDS and protein data extracted from the genes.
 lstRNA = []
 lstCDS = []
 lstPRT = []

 #Parse or newly downloaded sequences.
 rs = SeqIO.parse( "seqs.gb", "gb" )
 for r in rs :
  #In case of duplicity, isoforms and the like, we will append these indices to the IDs
  i = 1
  j = 1
  #This should never happen, but if the data downloaded has references to other data, we will simply ignore the error!
  if( r.annotations.has_key( 'contig' ) ):
    print "****", r.annotations['contig']
  #mRNA and CDS info is stored in the features.
  for ft in r.features :
    #Sometimes the data is not clean and attempting to translate or transcribe it yields non-sense or mis-sense errors.
    #Simply refuse to process this data, but print an error so we can check it manually
    if ft.qualifiers.has_key('exception') :
      print "****", ft.qualifiers['gene'], r.annotations['organism'], ft.qualifiers['exception']
      continue
    if ft.type == "mRNA" :
      s = ft.extract( r )
      #For annotation purposes, add as much info as is available to the record.
      if( ft.qualifiers.has_key('product') ) :
        s.description = str( ft.qualifiers['product'] )
        if( ft.qualifiers.has_key('note') ) :
           s.description += " " + str( ft.qualifiers['note'] )
      #Build an ID using <gene_name>_<species>_<i> whenever this info is available.
      #If not, use the info obtained before in 'names'.
      if( ft.qualifiers.has_key('gene') ) :
        s.id = ft.qualifiers['gene'][0] + "_" + names[r.id]['species'] + "_" + str(i)
      else :
        s.id = "_".join(names[r.id].values()) + "_mRNA" + str(i)
      i += 1
      #Append the record to the mRNA list.
      lstRNA.append( s )
    if ft.type == "CDS" :
      s = ft.extract( r )
      #For annotation purposes, add as much info as is available to the record.
      if( ft.qualifiers.has_key('product') ) :
        s.description = str( ft.qualifiers['product'] )
        if( ft.qualifiers.has_key('note') ) :
           s.description += " " + str( ft.qualifiers['note'] )
      #Build an ID using <gene_name>_<species>_<i> whenever this info is available.
      #If not, use the info obtained before in 'names'.
      if( ft.qualifiers.has_key('gene') ) :
        sid = ft.qualifiers['gene'][0] + "_" + names[r.id]['species'] + "_" + str(j)
      else :
        sid = "_".join(names[r.id].values())
      s.id = sid + "_CDS" + str(j)
      #Append the record to the CDS list.
      lstCDS.append( s )
      #Append the translated record to the protein list.
      lstPRT.append( SeqRecord( s.seq.translate(to_stop=True), id=sid + "_" + str(j), name=s.name, description=s.description ) )
      j += 1

 #Finally, write out the sequences to file
 SeqIO.write( lstRNA, "mRNA.fasta", "fasta" )

 SeqIO.write( lstCDS, "CDS.fasta", "fasta" )

 SeqIO.write( lstPRT, "protein.fasta", "fasta" )
	from Bio.SeqRecord import SeqRecord
	from Bio.Seq import Seq
	from Bio import Entrez
	from Bio import SeqIO
	import time

	# Obtain your gene IDs from somewhere (file, past text directly, etc.)
	ids = []
	ids = ",".join( lines )

	# From Biopython: Always tell Entrez qho you are!
	Entrez.email = "my_mail@world.com"
	handle = Entrez.efetch(db="gene", id=ids, retmode="xml")

	#Save this query to file, so we don't have to download it again.
	out = open( "output.xml", "w")
	out.write(handle.read())
	out.close()
	handle.close()

	#Parse the XML file into gene records
	records = Entrez.read( open("output.xml","r") )

	#This dictionary maps verbose strand identification to the argument that efetch needs
	strand = { 'plus' : 1, 'minus' : 2 }
	#This will contain the names of genes and organism, indexed by accession number.
	#Used for the ones that don't produce named proteins.
	names = {}
	#List of accession numbers
	accs = []
	#List of starting nucleotide to query
	frmL = []
	#List of ending nucleotide to query
	toL = []
	#List of strand (plus or minus) to query
	sndL = []

	#Iterate through the records (the genes)
	for r in records :
	for loc in r['Entrezgene_locus'] :
	#Get the accession number, version, organism name, gene name,
	if( loc.has_key( 'Gene-commentary_accession' ) ) :
	acc = loc['Gene-commentary_accession']
	ver = loc['Gene-commentary_version']
	org = r['Entrezgene_source']['BioSource']['BioSource_org']['Org-ref']['Org-ref_taxname']
	org = "_".join( org.split(" ") )
	gen = r['Entrezgene_gene']['Gene-ref']['Gene-ref_locus']
	#The sequence's interval to obtain
	sin = loc['Gene-commentary_seqs'][0]['Seq-loc_int']['Seq-interval']
	names[ acc + "." + ver ] = { 'gene' : gen, 'species' : org }
	#Store the information for later querying
	frmL.append( sin['Seq-interval_from'] )
	toL.append( sin['Seq-interval_to'] )
	sndL.append( strand[ sin['Seq-interval_strand']['Na-strand'].attributes['value'] ] )
	accs.append( acc + "." + ver )
	else :
	#If it was not found, print an information message so we can check by hand what went wrong.
	print "****", gen + "_" + org, ":", loc

	#Save the sequences to GenBank format
	out = open( "seqs.gb", "w" )

	#Make sure we don't have too many sequences! Entrez dislikes people downloading more than 100 seqs in a day.
	if( len(accs) > 100 ):
	print "WARNING: Downloading more than 100 sequences. Please make sure you don't violate Entrez limits!
	time.sleep(10)

	for i in range( len(accs) ) :
	print "Retrieving", accs[i]
	handle = Entrez.efetch(db="nucleotide", id=accs[i], seq_start=frmL[i], seq_stop=toL[i], strand=sndL[i], retmode="gbwithparts")
	#These queries can be very large, so dump them line by line.
	for l in handle :
	out.write(l)
	handle.close()
	#Respect Entrez limit of 3 queries a second (actually we are being generous with one a second)
	time.sleep(1)

	out.close()

	#These lists will contain the mRNA, CDS and protein data extracted from the genes.
	lstRNA = []
	lstCDS = []
	lstPRT = []

	#Parse or newly downloaded sequences.
	rs = SeqIO.parse( "seqs.gb", "gb" )
	for r in rs :
	#In case of duplicity, isoforms and the like, we will append these indices to the IDs
	i = 1
	j = 1
	#This should never happen, but if the data downloaded has references to other data, we will simply ignore the error!
	if( r.annotations.has_key( 'contig' ) ):
	print "****", r.annotations['contig']
	#mRNA and CDS info is stored in the features.
	for ft in r.features :
	#Sometimes the data is not clean and attempting to translate or transcribe it yields non-sense or mis-sense errors.
	#Simply refuse to process this data, but print an error so we can check it manually
	if ft.qualifiers.has_key('exception') :
	print "****", ft.qualifiers['gene'], r.annotations['organism'], ft.qualifiers['exception']
	continue
	if ft.type == "mRNA" :
	s = ft.extract( r )
	#For annotation purposes, add as much info as is available to the record.
	if( ft.qualifiers.has_key('product') ) :
	s.description = str( ft.qualifiers['product'] )
	if( ft.qualifiers.has_key('note') ) :
	s.description += " " + str( ft.qualifiers['note'] )
	#Build an ID using <gene_name>_<species>_<i> whenever this info is available.
	#If not, use the info obtained before in 'names'.
	if( ft.qualifiers.has_key('gene') ) :
	s.id = ft.qualifiers['gene'][0] + "_" + names[r.id]['species'] + "_" + str(i)
	else :
	s.id = "_".join(names[r.id].values()) + "_mRNA" + str(i)
	i += 1
	#Append the record to the mRNA list.
	lstRNA.append( s )
	if ft.type == "CDS" :
	s = ft.extract( r )
	#For annotation purposes, add as much info as is available to the record.
	if( ft.qualifiers.has_key('product') ) :
	s.description = str( ft.qualifiers['product'] )
	if( ft.qualifiers.has_key('note') ) :
	s.description += " " + str( ft.qualifiers['note'] )
	#Build an ID using <gene_name>_<species>_<i> whenever this info is available.
	#If not, use the info obtained before in 'names'.
	if( ft.qualifiers.has_key('gene') ) :
	sid = ft.qualifiers['gene'][0] + "_" + names[r.id]['species'] + "_" + str(j)
	else :
	sid = "_".join(names[r.id].values())
	s.id = sid + "_CDS" + str(j)
	#Append the record to the CDS list.
	lstCDS.append( s )
	#Append the translated record to the protein list.
	lstPRT.append( SeqRecord( s.seq.translate(to_stop=True), id=sid + "_" + str(j), name=s.name, description=s.description ) )
	j += 1

	#Finally, write out the sequences to file
	SeqIO.write( lstRNA, "mRNA.fasta", "fasta" )

	SeqIO.write( lstCDS, "CDS.fasta", "fasta" )

	SeqIO.write( lstPRT, "protein.fasta", "fasta" )
No results found