Skip to content

Instantly share code, notes, and snippets.

@amdevine
Last active March 24, 2022 17:40
Show Gist options
  • Select an option

  • Save amdevine/8789eb7bd2b3e7df4bc5dede8dd542a4 to your computer and use it in GitHub Desktop.

Select an option

Save amdevine/8789eb7bd2b3e7df4bc5dede8dd542a4 to your computer and use it in GitHub Desktop.
Real-time Name Query (CoL, GGBN, GenBank)
"""
This script reproduces the functionality of the Real-Time Name Query web app
found at https://www.globalgeno.me/gaps/live. It is intended to be run as
a script at the command line. The correct usage is:
python real_time_name_query.py <names file path> <NCBI API key>
e.g. python real_time_name_query.py querynames.txt 01234567890ABCDEFGHIK
Names files should be text files formatted with one name on each line.
An NCBI API key can be obtained by creating an account on the NCBI website.
Note:
Species-level queries are not available for GGBN, so any GGBN results obtained for
species names will not be accurate.
"""
import argparse, progressbar, requests
import pandas as pd
def inCOL(tname, timeout=30):
    """Query Catalog of Life for taxonomic name and return boolean.

    Sends ``tname`` to the name-matching endpoint and checks the ``type``
    field of the JSON response.

    Args:
        tname: Taxonomic name to look up.
        timeout: Seconds to wait for the server. ``requests`` applies no
            timeout by default, so without one a single stalled connection
            would hang the whole batch of names.

    Returns:
        True when the response's ``type`` field equals ``'exact'``; False
        for any other value or when the field is missing.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    url = 'http://api.catalogueoflife.org/name/matching'
    params = {'q': tname, 'verbose': 'false'}
    response = requests.get(url, params=params, timeout=timeout)
    # A response without a 'type' field is treated as "no exact match".
    return response.json().get('type', 'none') == 'exact'
def inGGBN(tname, timeout=30):
    """Query GGBN for taxonomic name and return boolean.

    Args:
        tname: Taxonomic name to look up.
        timeout: Seconds to wait for the server. ``requests`` applies no
            timeout by default, so without one a single stalled connection
            would hang the whole batch of names.

    Returns:
        True when the ``nbSamples`` value in the JSON response is greater
        than zero, otherwise False.

    Raises:
        KeyError: If the response has no ``nbSamples`` field.
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    api_url = 'http://data.ggbn.org/ggbn_portal/api/search'
    params = {
        'getCounts': True,
        # NOTE(review): the trailing "*" is presumably a wildcard so that the
        # search also matches longer names starting with tname -- confirm
        # against the GGBN API documentation.
        'name': tname + "*",
    }
    payload = requests.get(api_url, params=params, timeout=timeout).json()
    return int(payload["nbSamples"]) > 0
def inGenBankAny(tname, ncbikey, timeout=30):
    """Query GenBank Eutils for any records, return count.

    Runs an ESearch against the ``nuccore`` database for
    ``<tname>[Organism]`` and reads the record count from the JSON response.

    Args:
        tname: Taxonomic name to search for.
        ncbikey: NCBI API key sent as the ``api_key`` request parameter.
        timeout: Seconds to wait for the server. ``requests`` applies no
            timeout by default, so without one a single stalled connection
            would hang the whole batch of names.

    Returns:
        The record count as an int, or 0 when the response does not contain
        ``esearchresult``/``count``.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    api_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
    params = {
        'db': 'nuccore',
        'retmode': 'json',
        'rettype': 'count',
        'api_key': ncbikey,
        'term': tname + '[Organism]',
    }
    result = requests.get(api_url, params=params, timeout=timeout).json()
    try:
        return int(result['esearchresult']['count'])
    except KeyError:
        # Response without the expected count structure is reported as 0.
        return 0
def _esearch_count(term, ncbikey, timeout):
    """Return the nuccore ESearch record count for ``term`` (0 if absent)."""
    api_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
    params = {
        'db': 'nuccore',
        'retmode': 'json',
        'rettype': 'count',
        'api_key': ncbikey,
        'term': term,
    }
    result = requests.get(api_url, params=params, timeout=timeout).json()
    try:
        return int(result['esearchresult']['count'])
    except KeyError:
        # Response without the expected count structure is reported as 0.
        return 0


def inGenBankBarcode(tname, ncbikey, timeout=30):
    """Query GenBank Eutils for barcode records and return counts
    for coi, cox1, rbcl, matk, its and 12s.

    One ESearch request is issued per marker. Every search term requires the
    ``src country``, ``src specimen voucher`` and ``src pcr primers``
    properties. The four gene markers are matched on the ``[gene]`` field
    and additionally carry the ``500:9999999999[slen]`` filter; ITS and 12S
    are matched on ``[All Fields]`` and do not carry that filter.

    Args:
        tname: Taxonomic name to search for.
        ncbikey: NCBI API key sent as the ``api_key`` request parameter.
        timeout: Seconds to wait for each request. ``requests`` applies no
            timeout by default, so without one a single stalled connection
            would hang the whole batch of names.

    Returns:
        A dict mapping ``'coi'``, ``'cox1'``, ``'rbcl'``, ``'matk'``,
        ``'its'`` and ``'12s'`` to integer record counts (0 when the
        response carries no count).

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    # Property filters shared by every barcode query.
    source_filter = (
        'AND src country[prop] '
        'AND src specimen voucher[prop] '
        'AND src pcr primers[prop]'
    )
    organism = tname + '[Organism] '

    record_counts = {}

    # COI (also searched under the name cox1), rbcL, matK
    for gene in ['coi', 'cox1', 'rbcl', 'matk']:
        term = (
            organism + 'AND ' + gene + '[gene] '
            + 'AND 500:9999999999[slen] '
            + source_filter
        )
        record_counts[gene] = _esearch_count(term, ncbikey, timeout)

    # ITS and 12S are free-text searches rather than gene-field searches.
    free_text_markers = [
        ('its', 'internal transcribed spacer'),
        ('12s', '12s'),
    ]
    for key, phrase in free_text_markers:
        term = organism + 'AND ' + phrase + '[All Fields] ' + source_filter
        record_counts[key] = _esearch_count(term, ncbikey, timeout)

    return record_counts
def processTaxNames(namelist, ncbikey):
    """Look up every name in ``namelist`` in CoL, GGBN, and GenBank.

    For each name the CoL and GGBN lookups and the overall GenBank count are
    always run; the per-marker barcode counts are only requested when the
    overall GenBank count is above zero, and are otherwise filled with 0.
    Progress is shown with a progress bar while iterating.

    Returns a list with one dictionary per name, holding the keys ``name``,
    ``col``, ``ggbn``, ``genbank_any`` and one ``genbank_barcodes_<marker>``
    entry per barcode marker.
    """
    barcode_markers = ('coi', 'cox1', 'rbcl', 'matk', 'its', '12s')
    collected = []
    for taxon in progressbar.progressbar(namelist):
        row = {
            "name": taxon,
            "col": inCOL(taxon),
            "ggbn": inGGBN(taxon),
            "genbank_any": inGenBankAny(taxon, ncbikey),
        }
        if row['genbank_any'] > 0:
            marker_counts = inGenBankBarcode(taxon, ncbikey)
        else:
            # Nothing in GenBank at all, so skip the barcode queries.
            marker_counts = dict.fromkeys(barcode_markers, 0)
        for marker in barcode_markers:
            row['genbank_barcodes_' + marker] = marker_counts[marker]
        collected.append(row)
    return collected
if __name__ == '__main__':
    # Retrieve the names file path and NCBI API key supplied as arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('namesfile', help='.txt file containing taxonomic names to query', type=str)
    parser.add_argument('ncbikey', help='API Key supplied by NCBI', type=str)
    args = parser.parse_args()

    # Read one name per line. Blank lines are skipped: an empty name would
    # otherwise be sent to every service as a query (inGGBN, for example,
    # would search for a bare "*").
    with open(args.namesfile, 'r') as f:
        taxnames = [name for name in (line.strip() for line in f) if name]

    # Retrieve results from name queries
    searchresults = processTaxNames(taxnames, args.ncbikey)

    # Output results as a CSV. Passing the column list to the constructor
    # fixes the column order and also yields a header-only CSV when there
    # are no names, instead of failing on the column selection.
    output_columns = [
        'name',
        'col',
        'ggbn',
        'genbank_any',
        'genbank_barcodes_coi',
        'genbank_barcodes_cox1',
        'genbank_barcodes_rbcl',
        'genbank_barcodes_matk',
        'genbank_barcodes_its',
        'genbank_barcodes_12s',
    ]
    df = pd.DataFrame(searchresults, columns=output_columns)
    df.to_csv('real_time_name_query_results.csv', index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment