Skip to content

Instantly share code, notes, and snippets.

@fbidu
Created October 15, 2017 21:08
Show Gist options
  • Save fbidu/83e6f6a7675c1d9ced8f8b3595a5cf54 to your computer and use it in GitHub Desktop.
Save fbidu/83e6f6a7675c1d9ced8f8b3595a5cf54 to your computer and use it in GitHub Desktop.
Rascunho de script para obter IDs de sequências de DNA/RNA (NGS) armazenadas no SRA (https://trace.ncbi.nlm.nih.gov/Traces/sra/) com base em busca via Entrez
from xml.dom import minidom # Ou qualquer parser XML que você preferir
from Bio import Entrez
def search_sra(query, email='[email protected]', retmax=20):
    """
    Run an Entrez ESearch against the SRA database and return the parsed result.

    Parameters:
        query: Entrez search term (the script uses 'Human AND RNA').
        email: value assigned to ``Entrez.email`` before the request is made.
            NOTE(review): the default looks like a placeholder rather than a
            real address; callers should pass their own contact e-mail.
        retmax: maximum number of records requested from ESearch. Defaults to
            20, the value that used to be hard-coded.

    Returns:
        The object produced by ``Entrez.read`` for the ESearch response. The
        script reads its ``'IdList'`` entry.
    """
    Entrez.email = email
    handle = Entrez.esearch(db='sra',
                            sort='relevance',
                            retmax=str(retmax),
                            retmode='xml',
                            term=query)
    # Always release the underlying connection, even if parsing fails.
    try:
        results = Entrez.read(handle)
    finally:
        handle.close()
    return results
def get_sra_details(id_list, email='[email protected]'):
    """
    Fetch the details of a set of SRA entries as a parsed XML document.

    Parameters:
        id_list: iterable of SRA record IDs. Each item is converted with
            ``str`` before being joined, so integer IDs are accepted too.
        email: value assigned to ``Entrez.email`` before the request is made.
            NOTE(review): the default looks like a placeholder rather than a
            real address; callers should pass their own contact e-mail.

    Returns:
        The ``xml.dom.minidom`` document parsed from the EFetch response.
    """
    ids = ','.join(str(entry_id) for entry_id in id_list)
    Entrez.email = email
    handle = Entrez.efetch(db='sra',
                           retmode='xml',
                           id=ids)
    # According to the original author, the XML returned by SRA is currently
    # malformed, so Entrez.read / Entrez.parse cannot be used; the raw payload
    # is parsed with minidom instead.
    # Always release the underlying connection, even if the read fails.
    try:
        payload = handle.read()
    finally:
        handle.close()
    return minidom.parseString(payload)
def get_run_ids(xml_result):
    """
    Collect the primary ID of every run found in an SRA EFetch XML document.

    Parameters:
        xml_result: a DOM node (e.g. the ``minidom`` document returned by
            ``get_sra_details``) exposing ``getElementsByTagName``.

    Returns:
        A list with the text of the first ``PRIMARY_ID`` of each ``RUN``, in
        document order. According to the original author these are probably in
        the (S|E)RR0000000 format. Packages without a ``RUN_SET`` and runs
        without a usable ``IDENTIFIERS``/``PRIMARY_ID`` are skipped; a document
        without an ``EXPERIMENT_PACKAGE_SET`` yields an empty list.
    """
    # Only the first EXPERIMENT_PACKAGE_SET is considered (assumed unique).
    package_sets = xml_result.getElementsByTagName('EXPERIMENT_PACKAGE_SET')
    if not package_sets:
        return []

    ids = []
    for experiment in package_sets[0].getElementsByTagName('EXPERIMENT_PACKAGE'):
        # Only the first RUN_SET of each package is considered (assumed
        # unique); a package with no RUN_SET contributes nothing.
        run_sets = experiment.getElementsByTagName('RUN_SET')
        if not run_sets:
            continue

        for run in run_sets[0].getElementsByTagName('RUN'):
            # getElementsByTagName searches all descendants, so index 0 is the
            # first IDENTIFIERS element in document order inside this RUN.
            identifiers = run.getElementsByTagName('IDENTIFIERS')
            if not identifiers:
                continue

            primary_ids = identifiers[0].getElementsByTagName('PRIMARY_ID')
            # An empty <PRIMARY_ID/> has no child node to read the value from.
            if not primary_ids or not primary_ids[0].childNodes:
                continue

            ids.append(primary_ids[0].childNodes[0].nodeValue)
    return ids
# Script body: search SRA, then resolve the hits into run identifiers.
srch = search_sra(query='Human AND RNA')
# The record IDs of the matches are stored under the 'IdList' key.
id_list = srch['IdList']
# Download the XML details for those records and pull out the run IDs.
details = get_sra_details(id_list=id_list)
run_ids = get_run_ids(xml_result=details)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment