eweitz · April 25, 2017 15:33
diff --git a/chr_rs_acc_via_eutils.py b/chr_rs_acc_via_eutils.py
 '''
 This simple script shows how to use NCBI E-Utilies to get a chromosome's
 RefSeq accession given the chromosome's name and its genome assembly.

 Example:
 $ python3 chr_rs_acc_via_eutils.py
 RefSeq accession for chromosome 6 on genome assembly GRCh38 (GCF_000001405.26):
 NC_000006.12
 '''

 from urllib.request import urlopen
 import json

 chr_name = '6'
 asm_name = 'GRCh38'

 debug = False

 def load_json(response):
    return json.loads(response.read().decode('utf-8'))

 # The E-Utilies In Depth: Parameters, Syntax and More:
 # https://www.ncbi.nlm.nih.gov/books/NBK25499/
 eutils = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
 esearch = eutils + 'esearch.fcgi?retmode=json'
 esummary = eutils + 'esummary.fcgi?retmode=json'
 elink = eutils + 'elink.fcgi?retmode=json'

 asm_search = esearch +  '&db=assembly&term=%22' + asm_name + '%22[name]'

 if debug:
    print('Fetching ' + asm_search)
 with urlopen(asm_search) as f:
    data = load_json(f)

 # Get NCBI Assembly database's internal identifier (uid) for this assembly
 asm_uid = data['esearchresult']['idlist'][0]
 asm_summary = esummary + '&db=assembly&id=' + asm_uid

 if debug:
    print('Fetching ' + asm_summary)
 with urlopen(asm_summary) as f:
    data = load_json(f)

 # Get RefSeq UID for this assembly
 rs_uid = data['result'][asm_uid]['rsuid']
 asm_acc = data['result'][asm_uid]['assemblyaccession']

 # Get a list of IDs for the chromosomes in this genome.
 # This information does not seem to be available from well-known NCBI databases
 # like Assembly or Nucleotide, so we use GenColl, a lesser-known NCBI database.
 qs = '&db=nuccore&linkname=gencoll_nuccore_chr&from_uid=' + rs_uid
 nuccore_link = elink + qs

 if debug:
    print('Fetching ' + nuccore_link)
 with urlopen(nuccore_link) as f:
    data = load_json(f)

 links = ','.join(str(x) for x in data['linksets'][0]['linksetdbs'][0]['links'])
 nt_summary = esummary + '&db=nucleotide&id=' + links

 if debug:
    print('Fetching ' + nt_summary)
 with urlopen(nt_summary) as f:
    data = load_json(f)

 results = data['result']

 for key in results:
    if key == 'uids':
        continue
    result = results[key]

    try:
        cn_index = result['subtype'].split('|').index('chromosome')
    except ValueError as e:
        cn_index = 1

    try:
        this_chr_name = result['subname'].split('|')[cn_index]
    except IndexError:
        continue

    if this_chr_name != None and this_chr_name[:3] == 'chr':
      # Convert 'chr12' to '12', e.g. for banana (GCF_000313855.2)
      this_chr_name = this_chr_name[3:]

    if this_chr_name == chr_name:
        print(
            'RefSeq accession for chromosome ' + chr_name + ' on ' +
            'genome assembly ' + asm_name + ' (' + asm_acc + '):'
        )
        print(result['accessionversion'])
	'''
	This simple script shows how to use NCBI E-Utilies to get a chromosome's
	RefSeq accession given the chromosome's name and its genome assembly.

	Example:
	$ python3 chr_rs_acc_via_eutils.py
	RefSeq accession for chromosome 6 on genome assembly GRCh38 (GCF_000001405.26):
	NC_000006.12
	'''

	from urllib.request import urlopen
	import json

	chr_name = '6'
	asm_name = 'GRCh38'

	debug = False

	def load_json(response):
	return json.loads(response.read().decode('utf-8'))

	# The E-Utilies In Depth: Parameters, Syntax and More:
	# https://www.ncbi.nlm.nih.gov/books/NBK25499/
	eutils = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
	esearch = eutils + 'esearch.fcgi?retmode=json'
	esummary = eutils + 'esummary.fcgi?retmode=json'
	elink = eutils + 'elink.fcgi?retmode=json'

	asm_search = esearch + '&db=assembly&term=%22' + asm_name + '%22[name]'

	if debug:
	print('Fetching ' + asm_search)
	with urlopen(asm_search) as f:
	data = load_json(f)

	# Get NCBI Assembly database's internal identifier (uid) for this assembly
	asm_uid = data['esearchresult']['idlist'][0]
	asm_summary = esummary + '&db=assembly&id=' + asm_uid

	if debug:
	print('Fetching ' + asm_summary)
	with urlopen(asm_summary) as f:
	data = load_json(f)

	# Get RefSeq UID for this assembly
	rs_uid = data['result'][asm_uid]['rsuid']
	asm_acc = data['result'][asm_uid]['assemblyaccession']

	# Get a list of IDs for the chromosomes in this genome.
	# This information does not seem to be available from well-known NCBI databases
	# like Assembly or Nucleotide, so we use GenColl, a lesser-known NCBI database.
	qs = '&db=nuccore&linkname=gencoll_nuccore_chr&from_uid=' + rs_uid
	nuccore_link = elink + qs

	if debug:
	print('Fetching ' + nuccore_link)
	with urlopen(nuccore_link) as f:
	data = load_json(f)

	links = ','.join(str(x) for x in data['linksets'][0]['linksetdbs'][0]['links'])
	nt_summary = esummary + '&db=nucleotide&id=' + links

	if debug:
	print('Fetching ' + nt_summary)
	with urlopen(nt_summary) as f:
	data = load_json(f)

	results = data['result']

	for key in results:
	if key == 'uids':
	continue
	result = results[key]

	try:
	cn_index = result['subtype'].split('\|').index('chromosome')
	except ValueError as e:
	cn_index = 1

	try:
	this_chr_name = result['subname'].split('\|')[cn_index]
	except IndexError:
	continue

	if this_chr_name != None and this_chr_name[:3] == 'chr':
	# Convert 'chr12' to '12', e.g. for banana (GCF_000313855.2)
	this_chr_name = this_chr_name[3:]

	if this_chr_name == chr_name:
	print(
	'RefSeq accession for chromosome ' + chr_name + ' on ' +
	'genome assembly ' + asm_name + ' (' + asm_acc + '):'
	)
	print(result['accessionversion'])