Created
November 4, 2016 10:38
-
-
Save diallobakary4/5f5bdb702e3f0ae034a396fb2218031e to your computer and use it in GitHub Desktop.
NCBI dataset extraction using
Eutilities and Python
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
#NCBI data extraction using Eutilities and Python | |
# More details "https://www.ncbi.nlm.nih.gov/books/NBK25498/ | |
__author__ = "Bakary N'tji Diallo" | |
__email__ = "[email protected]" | |
import requests | |
#Eutils search pipeline building | |
# 7 Eutilies cgi | |
eutils_base = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/" #base url to access eutils tools | |
esearch = "esearch.fcgi?" # ESearch – ESummary/EFetch | |
efetch = "efetch.fcgi?" #EFetch to retrieve full records) | |
elink = "elink.fcgi?" # ELink – ESummary/Efetch | |
epost = "epost.fcgi" # EPost – ELink – ESummary/EFetch | |
esummary = "esummary.fcgi" | |
egquery = "egquery.fcgi" | |
einfo = "einfo.fcgi" | |
# Goal: Download all Mycobacterium tuberculosis mRNA sequences in FASTA format. | |
# Solution: First use ESearch to retrieve the GI numbers for these sequences | |
# and post them on the History server, then use multiple EFetch calls | |
# to retrieve the data (in batch if too much) | |
# Input: | |
# Output: A file named "Mtb_mRNAs.fna" containing FASTA data. | |
param = """db=nuccore&term=(Mycobacterium tuberculosis[Organism]) AND "Mycobacterium tuberculosis"[porgn:__txid1773] """ #parameters to specify your search | |
#esearch to have query and webenv params | |
url = eutils_base + esearch + param +"&usehistory=y" | |
print url | |
data = requests.get(url) | |
query_key = str(data.content[data.content.index("<QueryKey>") + 10 : data.content.index("</QueryKey>")]) | |
webenv = str(data.content[data.content.index("<WebEnv>") + 8 : data.content.index("</WebEnv>")]) | |
# detailled url structure | |
url = eutils_base +\ | |
efetch +\ | |
"db=nuccore" \ | |
+ "&query_key=" + query_key + "&WebEnv=" + webenv + \ | |
"&retstart=1&retmax=36" + "&rettype=fasta" + "&retmode=text"\ | |
+"&[email protected]" | |
data = requests.get(url) | |
#Open a .fna file, extract fasta data, and write them in the .fna file. | |
file_name = "Mtb_mRNAs.fna" | |
with open(file_name, "w") as file: | |
file.write(data.content) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment