Last active
September 18, 2019 15:05
-
-
Save pgsin/e360ca75880ad5f8ad1afe319e734989 to your computer and use it in GitHub Desktop.
Get GenBank records of all strains of a specific bacterial species
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from Bio import Entrez | |
import os | |
import gzip | |
""" | |
To avoid access-rate problems (e.g. "HTTP Error 429: Too Many Requests"),
read how to get an API key:
https://ncbiinsights.ncbi.nlm.nih.gov/2017/11/02/new-api-keys-for-the-e-utilities/ | |
""" | |
# Identification sent to NCBI with every E-utilities request.
# Each value can be supplied through an environment variable so that real
# credentials do not have to be written into the script; when the variable
# is not set, the original placeholder literal is used unchanged.
Entrez.api_key = os.environ.get('NCBI_API_KEY', '<your api key here>')
Entrez.email = os.environ.get('NCBI_EMAIL', '<your email here>')
Entrez.tool = os.environ.get('NCBI_TOOL', 'Demoscript')
def _read_entrez(handle):
    """Parse an Entrez XML response and close its handle even on failure."""
    try:
        return Entrez.read(handle)
    finally:
        handle.close()


def _search_all_ids(db, term):
    """Return the complete list of UIDs matching *term* in database *db*."""
    found = _read_entrez(Entrez.esearch(db=db, term=term, rettype='uilist'))
    total = int(found['Count'])
    if total > len(found['IdList']):
        # ESearch only returns its default page of UIDs unless retmax is
        # raised, so repeat the query asking for every hit it reported.
        found = _read_entrez(
            Entrez.esearch(db=db, term=term, retmax=total, rettype='uilist'))
    return list(found['IdList'])


def _download_genbank(identity, gb_file):
    """Fetch one nucleotide record and store it gzip-compressed at *gb_file*."""
    handle = Entrez.efetch(
        db="nucleotide",
        id=identity,
        rettype="gbwithparts",
        retmode="text")
    try:
        record = handle.read()
    finally:
        handle.close()
    if not isinstance(record, bytes):
        record = record.encode()
    # Write to a temporary name and rename afterwards: an interrupted run must
    # not leave a truncated file that the caller's "already exists" check
    # would later mistake for a finished download.
    partial = gb_file + ".part"
    with gzip.open(partial, 'wb') as fs:
        fs.write(record)
    os.replace(partial, gb_file)


def getStrainSequences(generic_name, specific_name, genome_dir):
    """
    Download the GenBank records of every strain (assembly) of one species.

    The assembly database is searched for latest-RefSeq, complete-genome,
    non-anomalous assemblies of the species. For each assembly found, all
    linked nucleotide records are fetched in "gbwithparts" format and saved
    as ``<genome_dir>/<nucleotide id>.gb.gz``. Files that already exist are
    not downloaded again.

    :param generic_name: the first part of species name
    :param specific_name: the second part of species name
    :param genome_dir: existing directory that receives the ``.gb.gz`` files
    :return: a dictionary of strains (assembly entries) for this species,
             AssemblyAccession -> [id0, id1, ...]
    """
    species_name = generic_name + " " + specific_name
    assembly_term = (
        '{}[Organism] AND ( "latest refseq"[filter] AND "complete genome"[filter] AND '
        '( all[filter] NOT anomalous[filter] ) )'.format(species_name))

    assembly_accessions = {}
    for assembly_id in _search_all_ids("assembly", assembly_term):
        summary = _read_entrez(Entrez.esummary(db="assembly", id=assembly_id))
        assembly_accession = \
            summary['DocumentSummarySet']['DocumentSummary'][0]['AssemblyAccession']

        nucleotide_ids = _search_all_ids(
            "nucleotide", "{}[Assembly]".format(assembly_accession))
        assembly_accessions[assembly_accession] = nucleotide_ids

        for identity in nucleotide_ids:
            gb_file = os.path.join(genome_dir, identity + ".gb.gz")
            if not os.path.exists(gb_file):
                _download_genbank(identity, gb_file)
        print("Downloading is done: " + assembly_accession)
    return assembly_accessions
# Driver: download the genomes of every species listed in the input file and
# write the resulting {species line: {assembly accession: [ids]}} mapping to
# result.txt inside the genome directory.
genome_dir = "streptococcus_genomes"
# exist_ok avoids the check-then-create race of a separate os.path.exists test
os.makedirs(genome_dir, exist_ok=True)
result = {}
# NOTE(review): the file name is spelled 'streprococcus' - confirm it matches
# the species list file on disk.
with open('streprococcus.txt') as fs:
    for line in fs:
        species = line.rstrip()
        name_parts = species.split()
        if len(name_parts) < 2:
            # Blank or incomplete lines carry no "genus species" pair to query
            continue
        result[species] = getStrainSequences(name_parts[0], name_parts[1], genome_dir)
with open(os.path.join(genome_dir, "result.txt"), "w") as fs:
    fs.write(str(result))
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Streptococcus acidominimus | |
Streptococcus agalactiae | |
Streptococcus alactolyticus | |
Streptococcus anginosus | |
Streptococcus australis | |
Streptococcus bovis | |
Streptococcus caballi | |
Streptococcus cameli | |
Streptococcus canis | |
Streptococcus caprae | |
Streptococcus castoreus | |
Streptococcus criceti | |
Streptococcus constellatus | |
Streptococcus cuniculi | |
Streptococcus danieliae | |
Streptococcus dentasini | |
Streptococcus dentiloxodontae | |
Streptococcus dentirousetti | |
Streptococcus devriesei | |
Streptococcus didelphis | |
Streptococcus downei | |
Streptococcus dysgalactiae | |
Streptococcus entericus | |
Streptococcus equi | |
Streptococcus equinus | |
Streptococcus ferus | |
Streptococcus gallinaceus | |
Streptococcus gallolyticus | |
Streptococcus gordonii | |
Streptococcus halichoeri | |
Streptococcus halotolerans | |
Streptococcus henryi | |
Streptococcus himalayensis | |
Streptococcus hongkongensis | |
Streptococcus hyointestinalis | |
Streptococcus hyovaginalis | |
Streptococcus ictaluri | |
Streptococcus infantarius | |
Streptococcus infantis | |
Streptococcus iniae | |
Streptococcus intermedius | |
Streptococcus lactarius | |
Streptococcus loxodontisalivarius | |
Streptococcus lutetiensis | |
Streptococcus macacae | |
Streptococcus marimammalium | |
Streptococcus marmotae | |
Streptococcus massiliensis | |
Streptococcus merionis | |
Streptococcus minor | |
Streptococcus mitis | |
Streptococcus moroccensis | |
Streptococcus mutans | |
Streptococcus oligofermentans | |
Streptococcus oralis | |
Streptococcus oricebi | |
Streptococcus oriloxodontae | |
Streptococcus orisasini | |
Streptococcus orisratti | |
Streptococcus orisuis | |
Streptococcus ovis | |
Streptococcus panodentis | |
Streptococcus pantholopis | |
Streptococcus parasanguinis | |
Streptococcus parasuis | |
Streptococcus parauberis | |
Streptococcus peroris | |
Streptococcus pharyngis | |
Streptococcus phocae | |
Streptococcus pluranimalium | |
Streptococcus plurextorum | |
Streptococcus pneumoniae | |
Streptococcus porci | |
Streptococcus porcinus | |
Streptococcus porcorum | |
Streptococcus pseudopneumoniae | |
Streptococcus pseudoporcinus | |
Streptococcus pyogenes | |
Streptococcus ratti | |
Streptococcus rifensis | |
Streptococcus rubneri | |
Streptococcus rupicaprae | |
Streptococcus salivarius | |
Streptococcus saliviloxodontae | |
Streptococcus sanguinis | |
Streptococcus sinensis | |
Streptococcus sobrinus | |
Streptococcus suis | |
Streptococcus tangierensis | |
Streptococcus thoraltensis | |
Streptococcus troglodytae | |
Streptococcus troglodytidis | |
Streptococcus tigurinus | |
Streptococcus thermophilus | |
Streptococcus uberis | |
Streptococcus urinalis | |
Streptococcus ursoris | |
Streptococcus vestibularis | |
Streptococcus zooepidemicus |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment