Skip to content

Instantly share code, notes, and snippets.

@brantfaircloth
Created April 3, 2011 23:50
Show Gist options
  • Save brantfaircloth/900938 to your computer and use it in GitHub Desktop.
Save brantfaircloth/900938 to your computer and use it in GitHub Desktop.
Get protein sequences from Genbank given a genomic accession number and a gene name
import sys
import time
from Bio import Entrez
Entrez.email = "[email protected]"
if not Entrez.email:
print "you must add your email address"
sys.exit(2)
# create an empty list we will fill with the gene names
genes = []
# open the file and begin to iterate over the lines
for line in open('genes.txt', 'r'):
# at each line, get the value, strip the line ending
# (\n) from the string, and add the gene name
# to our list
genes.append(line.strip('\n'))
print "genes: ", genes
# create an empty list we will fill with the gene names
species = []
# open the file and begin to iterate over the lines
for line in open('species.txt', 'r'):
# at each line, get the value, strip the line ending
# (\n), split the string wherever there are '|' characters
# and add the fourth element (the GI) of the split string
# to our species list
name = line.strip('\n').split('|')[3]
species.append(name)
print "species: ", species
# for each gene in the gene list
for gene in genes:
# Print out the gene name
print "gene: ", gene
# Open a gene-specific output file, to which we will write our
# results. The string substitution makes the output file name
# start with the name of the each gene, as we iterate
# `for gene in genes`
output = open('{0}.fasta'.format(gene), 'w')
# for each taxon in the species list
for taxon in species:
try:
# get rid of the taxon version number
taxon = taxon.split('.')[0]
# print the taxon name. The "\t" means print a tab
# preceding the taxon name
print "\ttaxon: ", taxon
# We know the *name* of the gene that we want, and the
# accession number of the species in which it is
# found, but we do not know the GI number of the
# gene (and protein product) we want. So we need to
# get that first, since it is the best reference to
# other items in NCBI, like the protein sequence.
# So, we first build a search string for an Entrez
# search query. The result will contain the GI of
# the given gene in the given species.
#
# Substitute the taxon and gene name into the string
terms = "{0}[accn] AND {1}[Gene]".format(taxon, gene)
# Use the biopython Entrez class and esearch method to
# search the Gene db using the terms we've defined
# above. Entrez Esearch's function is to return
# primary identifier (GIs) of records. The results
# are returned as XML.
result = Entrez.esearch(db = 'Gene', term=terms)
# Parse the XML using `read()` method of the Entrez
# class
record = Entrez.read(result)
# Get the GI from the first (and only, hopefully)
# record that was returned.
gi = record["IdList"][0]
# Now that we have the GI number, let's get the actual
# record for the gene which is similar to a page like
# this (but in XML versus HTML):
#
# http://www.ncbi.nlm.nih.gov/gene/4025118
gene_record = Entrez.efetch(db="gene", id=gi, retmode="xml")
# Parse the XML using `read()` method of the Entrez
# class
xml = Entrez.read(gene_record)
# The `read()` method parses the returned XML to give
# us many, many nested dictionaries. This is slightly
# confusing because the structure returned actually
# contains lists nested within dictionaries (for a
# number of reasons). Something like this:
#
# xml = {
# [
# {'record':
# [
# {'name1':value}
# ]
# }
# ]
# }
#
# So, what we are doing below is getting the first
# element of every list then using the "key" that we
# know (e.g. "Entrezgene_locus") to return the value
# it is associated with, which is usually another
# list. Then, we get the first (0th) element of that
# list, which is a dictionary. Then, we get use the
# "key" ('Gene-commentary_products') to get the next
# "value", and so forth.
products = xml[0]['Entrezgene_locus'][0]['Gene-commentary_products'][0]
# Ensure we've grabbed the correct thing by making
# sure that the "type" entry of the item we are on is
# "peptide".
assert products['Gene-commentary_type'].attributes['value']== 'peptide'
# Get the accession number for the protein
prot_accession = products['Gene-commentary_accession']
# Print the accession number, preceded by two tab
# characters
print "\t\taccession: ", prot_accession
# Get the GI of the protein - this is in a different
# place that the Accession number - so we traverse
# the hierarchy again
prot_gi = xml[0]['Entrezgene_locus'][0]['Gene-commentary_products'][0]['Gene-commentary_seqs'][0]['Seq-loc_whole']['Seq-id']['Seq-id_gi']
print "\t\tprotein GI: ", prot_gi
# Finally, use the `efetch()` method of Entrez to grab
# the fasta sequence of the protein we're after.
seq = Entrez.efetch(db="protein", id = prot_gi, retmod='text', rettype='fasta')
# In one fell swoop, read in the result, and write it
# out to our gene-specific file
output.write(seq.read().replace('\n\n', '\n'))
# Sleep for 0.3 seconds, to keep NCBI servers happy
time.sleep(0.3)
except IndexError:
error = "Unable to locate record.\n"
print error
output.write(error)
# Sleep for 0.3 seconds, to keep NCBI servers happy
time.sleep(0.3)
# Flush the buffer (push the results into the file and close
# the file. This essentially "writes" the results to the file.
output.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment