Created
April 25, 2017 15:33
-
-
Save eweitz/20ee1d7035e103f7533ed34f81bfe35d to your computer and use it in GitHub Desktop.
NCBI EUtils demo: get a chromosome's RefSeq accession given its name and assembly
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
''' | |
This simple script shows how to use NCBI E-Utilies to get a chromosome's | |
RefSeq accession given the chromosome's name and its genome assembly. | |
Example: | |
$ python3 chr_rs_acc_via_eutils.py | |
RefSeq accession for chromosome 6 on genome assembly GRCh38 (GCF_000001405.26): | |
NC_000006.12 | |
''' | |
from urllib.request import urlopen | |
import json | |
chr_name = '6' | |
asm_name = 'GRCh38' | |
debug = False | |
def load_json(response): | |
return json.loads(response.read().decode('utf-8')) | |
# The E-Utilies In Depth: Parameters, Syntax and More: | |
# https://www.ncbi.nlm.nih.gov/books/NBK25499/ | |
eutils = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/' | |
esearch = eutils + 'esearch.fcgi?retmode=json' | |
esummary = eutils + 'esummary.fcgi?retmode=json' | |
elink = eutils + 'elink.fcgi?retmode=json' | |
asm_search = esearch + '&db=assembly&term=%22' + asm_name + '%22[name]' | |
if debug: | |
print('Fetching ' + asm_search) | |
with urlopen(asm_search) as f: | |
data = load_json(f) | |
# Get NCBI Assembly database's internal identifier (uid) for this assembly | |
asm_uid = data['esearchresult']['idlist'][0] | |
asm_summary = esummary + '&db=assembly&id=' + asm_uid | |
if debug: | |
print('Fetching ' + asm_summary) | |
with urlopen(asm_summary) as f: | |
data = load_json(f) | |
# Get RefSeq UID for this assembly | |
rs_uid = data['result'][asm_uid]['rsuid'] | |
asm_acc = data['result'][asm_uid]['assemblyaccession'] | |
# Get a list of IDs for the chromosomes in this genome. | |
# This information does not seem to be available from well-known NCBI databases | |
# like Assembly or Nucleotide, so we use GenColl, a lesser-known NCBI database. | |
qs = '&db=nuccore&linkname=gencoll_nuccore_chr&from_uid=' + rs_uid | |
nuccore_link = elink + qs | |
if debug: | |
print('Fetching ' + nuccore_link) | |
with urlopen(nuccore_link) as f: | |
data = load_json(f) | |
links = ','.join(str(x) for x in data['linksets'][0]['linksetdbs'][0]['links']) | |
nt_summary = esummary + '&db=nucleotide&id=' + links | |
if debug: | |
print('Fetching ' + nt_summary) | |
with urlopen(nt_summary) as f: | |
data = load_json(f) | |
results = data['result'] | |
for key in results: | |
if key == 'uids': | |
continue | |
result = results[key] | |
try: | |
cn_index = result['subtype'].split('|').index('chromosome') | |
except ValueError as e: | |
cn_index = 1 | |
try: | |
this_chr_name = result['subname'].split('|')[cn_index] | |
except IndexError: | |
continue | |
if this_chr_name != None and this_chr_name[:3] == 'chr': | |
# Convert 'chr12' to '12', e.g. for banana (GCF_000313855.2) | |
this_chr_name = this_chr_name[3:] | |
if this_chr_name == chr_name: | |
print( | |
'RefSeq accession for chromosome ' + chr_name + ' on ' + | |
'genome assembly ' + asm_name + ' (' + asm_acc + '):' | |
) | |
print(result['accessionversion']) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment