Created
October 15, 2017 21:08
-
-
Save fbidu/83e6f6a7675c1d9ced8f8b3595a5cf54 to your computer and use it in GitHub Desktop.
Rascunho de script para obter IDs de sequências de DNA/RNA (NGS) armazenadas no SRA (https://trace.ncbi.nlm.nih.gov/Traces/sra/) com base em busca via Entrez.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from xml.dom import minidom # Ou qualquer parser XML que você preferir | |
from Bio import Entrez | |
def search_sra(query, email='[email protected]', retmax=20):
    """
    Search the SRA database through Entrez and return the parsed result.

    Args:
        query: Entrez search term, passed as ``term`` to ``Entrez.esearch``.
        email: Contact address assigned to ``Entrez.email`` before the
            request is sent. Note that this sets a module-wide attribute
            on ``Entrez``.
        retmax: Maximum number of records to request. Defaults to 20,
            the value that used to be hard-coded.

    Returns:
        The object produced by ``Entrez.read`` for the search response.
        The driver code in this file reads the matching IDs from its
        ``'IdList'`` key.
    """
    Entrez.email = email
    handle = Entrez.esearch(db='sra',
                            sort='relevance',
                            retmax=str(retmax),
                            retmode='xml',
                            term=query)
    try:
        return Entrez.read(handle)
    finally:
        # Release the underlying connection even when parsing fails.
        handle.close()
def get_sra_details(id_list, email='[email protected]'):
    """
    Fetch the details of a set of SRA entries.

    Args:
        id_list: Iterable of SRA record IDs. Each item is converted with
            ``str`` and the results are joined with commas for the request.
        email: Contact address assigned to ``Entrez.email`` before the
            request is sent. Note that this sets a module-wide attribute
            on ``Entrez``.

    Returns:
        The ``xml.dom.minidom`` document parsed from the raw response body.
    """
    # str() lets callers pass integer IDs as well as strings.
    ids = ','.join(str(entry_id) for entry_id in id_list)
    Entrez.email = email
    handle = Entrez.efetch(db='sra',
                           retmode='xml',
                           id=ids)
    try:
        raw_xml = handle.read()
    finally:
        # Release the underlying connection even when reading fails.
        handle.close()
    # Per the original author's note: the XML returned by the SRA is
    # currently malformed, so Entrez.read / Entrez.parse cannot be used
    # and the payload is parsed with minidom instead.
    return minidom.parseString(raw_xml)
def _first_element(node, tag_name):
    """Return the first descendant of *node* named *tag_name*, or None."""
    matches = node.getElementsByTagName(tag_name)
    return matches[0] if matches else None


def get_run_ids(xml_result):
    """
    Collect the primary ID of every run found in an SRA details document.

    The document is walked as EXPERIMENT_PACKAGE_SET -> EXPERIMENT_PACKAGE
    -> RUN_SET -> RUN -> IDENTIFIERS -> PRIMARY_ID. At each level where a
    single element is expected, only the first match is used. Any package
    or run that lacks one of these elements (or whose PRIMARY_ID is empty)
    is skipped instead of raising ``IndexError``.

    Args:
        xml_result: A DOM node (for example the document returned by
            ``minidom.parseString``) exposing ``getElementsByTagName``.

    Returns:
        A list with the node value of each run's PRIMARY_ID, in document
        order. The original author expected values shaped like
        ``(S|E)RR0000000``. The list is empty when the document has no
        EXPERIMENT_PACKAGE_SET.
    """
    package_set = _first_element(xml_result, 'EXPERIMENT_PACKAGE_SET')
    if package_set is None:
        return []

    ids = []
    for experiment in package_set.getElementsByTagName('EXPERIMENT_PACKAGE'):
        run_set = _first_element(experiment, 'RUN_SET')
        if run_set is None:
            # Package without any run set: nothing to collect.
            continue
        for run in run_set.getElementsByTagName('RUN'):
            identifiers = _first_element(run, 'IDENTIFIERS')
            if identifiers is None:
                continue
            primary_id = _first_element(identifiers, 'PRIMARY_ID')
            # An empty <PRIMARY_ID/> has no child node to read a value from.
            if primary_id is None or not primary_id.childNodes:
                continue
            ids.append(primary_id.childNodes[0].nodeValue)
    return ids
# Example run: search the SRA, fetch the details of the hits and extract
# their run IDs. Guarded so that importing this module to reuse the
# functions above does not trigger live requests to NCBI.
if __name__ == '__main__':
    srch = search_sra('Human AND RNA')
    id_list = srch['IdList']
    details = get_sra_details(id_list)
    run_ids = get_run_ids(details)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment