Skip to content

Instantly share code, notes, and snippets.

@philippbayer
Created August 8, 2023 06:40
Show Gist options
  • Save philippbayer/97505bba4a2735af79f664727db99e08 to your computer and use it in GitHub Desktop.
Save philippbayer/97505bba4a2735af79f664727db99e08 to your computer and use it in GitHub Desktop.
import os
import sys
import argparse
from statistics import mean
'''
INPUT: tab-delimited blastn output. Assuming that taxonomy ID is in this format:
-outfmt "6 qseqid sseqid staxids sscinames scomnames sskingdoms pident length qlen slen mismatch gapopen gaps qstart qend sstart send stitle evalue bitscore qcovs qcovhsp"
This script also assumes that input has been filtered by 90% identity.
$ awk '{if ($7 > 90) print}' all_results.tsv > all_results.90perc.tsv
And very vague hits have been removed, too:
$ grep -v -e uncultured -e Uncultured -e chloroplast -e Unidentified -e unidentified all_results.90perc.tsv > all_results.90perc.noUnculturedUnidentifiedChloroplast.tsv
This script also assumes that taxonkit is installed and that the data has been downloaded to the default directory.
mamba create -n taxonkit taxonkit -y
conda activate taxonkit
mkdir ~/.taxonkit
cd ~/.taxonkit
wget ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz
tar xzvf taxdump.tar.gz
Usage: python computeLCA.py filtered_blast.tsv > Table_of_TaxonIDs.tsv
Currently, the output looks like this:
ASV_0 6833 cellular organisms;Eukaryota;Opisthokonta;Metazoa;Eumetazoa;Bilateria;Protostomia;Ecdysozoa;Panarthropoda;Arthropoda;Mandibulata;Pancrustacea;Crustacea;Multicrustacea;Hexanauplia;Copepoda;Neocopepoda;Gymnoplea;Calanoida Calanoida
ASV_1 6833 cellular organisms;Eukaryota;Opisthokonta;Metazoa;Eumetazoa;Bilateria;Protostomia;Ecdysozoa;Panarthropoda;Arthropoda;Mandibulata;Pancrustacea;Crustacea;Multicrustacea;Hexanauplia;Copepoda;Neocopepoda;Gymnoplea;Calanoida Calanoida
ASV_2 6833 cellular organisms;Eukaryota;Opisthokonta;Metazoa;Eumetazoa;Bilateria;Protostomia;Ecdysozoa;Panarthropoda;Arthropoda;Mandibulata;Pancrustacea;Crustacea;Multicrustacea;Hexanauplia;Copepoda;Neocopepoda;Gymnoplea;Calanoida Calanoida
ASV_3 6833 cellular organisms;Eukaryota;Opisthokonta;Metazoa;Eumetazoa;Bilateria;Protostomia;Ecdysozoa;Panarthropoda;Arthropoda;Mandibulata;Pancrustacea;Crustacea;Multicrustacea;Hexanauplia;Copepoda;Neocopepoda;Gymnoplea;Calanoida Calanoida
ASV_4 349674 cellular organisms;Eukaryota;Opisthokonta;Metazoa;Eumetazoa;Bilateria;Protostomia;Ecdysozoa;Panarthropoda;Arthropoda;Mandibulata;Pancrustacea;Crustacea;Multicrustacea;Hexanauplia;Copepoda;Neocopepoda;Podoplea;Poecilostomatoida;Oncaeidae;Oncaea Oncaea
ASV_5 349674 cellular organisms;Eukaryota;Opisthokonta;Metazoa;Eumetazoa;Bilateria;Protostomia;Ecdysozoa;Panarthropoda;Arthropoda;Mandibulata;Pancrustacea;Crustacea;Multicrustacea;Hexanauplia;Copepoda;Neocopepoda;Podoplea;Poecilostomatoida;Oncaeidae;Oncaea Oncaea
....
'''
def main():
DATA_DIR = '~/.taxonkit/' # Where taxonkit data is stored.
parser = argparse.ArgumentParser(description='Parses blastn output, computes LCA for all queries using taxonkit.')
parser.add_argument('blastn', metavar='FILE', type=str,
help='''Filename of blastn outfmt 6 output. Ideally
"6 qseqid sseqid staxids sscinames scomnames sskingdoms pident length
qlen slen mismatch gapopen gaps qstart qend sstart send stitle evalue bitscore qcovs qcovhsp"''')
parser.add_argument('--data-dir', metavar='DIR', type=str, default=DATA_DIR,
help=f'Where taxonkit data is stored. Defaults to "{DATA_DIR}".')
args = parser.parse_args()
data_dir = args.data_dir
current_query = None
taxons = None
list_of_queries = []
list_of_percentages = []
list_of_lengths = []
with open(args.blastn) as fh, open('TEMP_TAXONS.txt', 'w') as out:
for line in fh:
ll = line.rstrip().split('\t')
query = ll[0]
taxid = ll[2]
perc = float(ll[6])
length = float(ll[7])
if ';' in taxid:
# in some cases, an NCBI-NT entry has several taxonomy IDs like 1;2;321;5123413.
# It seems that these few entries have the entire taxonomy tree. Just use the last one.
taxid = taxid.split(';')[-1]
#if ll[3] == 'N/A':
# this Taxonomy ID comes from Entrez but it's not in taxonkit. not sure how this happens
#print(ll)
#continue
if current_query != query:
if taxons:
# write out the taxons to temporary files, removing duplicates
out.write(current_query + '\t' + ' '.join(set(taxons)) + '\n')
list_of_percentages.append(mean(percentages))
list_of_lengths.append(mean(lengths))
# printed all for the last query, now start collecting for the new query
taxons = [taxid]
percentages = [perc]
lengths = [length]
current_query = query
list_of_queries.append(current_query)
else:
# current query is same as before, add to list
taxons.append(taxid)
percentages.append(perc)
lengths.append(length)
# print for the very last query, too
if taxons:
#out.write(' '.join(set(taxons)) + '\n')
out.write(current_query + '\t' + ' '.join(set(taxons)) + '\n')
list_of_percentages.append(mean(percentages))
list_of_lengths.append(mean(lengths))
all_taxa = os.popen('taxonkit lca -i 2 TEMP_TAXONS.txt | cut -f 3 | taxonkit lineage -n').readlines()
#all_taxa = os.popen('taxonkit lca -D TEMP_TAXONS.txt | cut -f 2 | taxonkit lineage -n | taxonkit reformat --format "{k};{p};{c};{o};{f};{g};{s}"').read().split('\n')
#all_taxa = list(filter(None, all_taxa)) # the last line is empty for some reason. Extra linebreak?
assert len(all_taxa) == len(list_of_queries), f'ERROR: there are {len(list_of_queries)} queries but {len(all_taxa)} taxa in the TEMP_TAXONS.txt file'
reformatted_taxa = os.popen('taxonkit lca -i 2 TEMP_TAXONS.txt | cut -f 3 | taxonkit lineage -n | taxonkit reformat --format "{o};{f};{g};{s}" > TAXONKIT_RESULTS.txt').read()
reformatted_taxa = open('TAXONKIT_RESULTS.txt').readlines()
for q, t, l, p, x in zip(list_of_queries, all_taxa, reformatted_taxa, list_of_percentages, list_of_lengths):
# 119195 cellular organisms;Eukaryota;Opisthokonta;Metazoa;Eumetazoa;Bilateria;Deuterostomia;Chordata;Craniata;Vertebrata;Gnathostomata;Chondrichthyes;Elasmobranchii;Selachii;Galeomorphii;Galeoidea Galeoidea Eukaryota;Chordata;Chondrichthyes;;;;
newnames = t.split('\t')
# ['55137', 'cellular organisms;Eukaryota;Opisthokonta;Metazoa;Eumetazoa;Bilateria;Deuterostomia;Chordata;Craniata;Vertebrata;Gnathostomata;Chondrichthyes;Elasmobranchii;Batoidea;Myliobatiformes;Myliobatidae;Myliobatinae;Myliobatis', 'Myliobatis']
lineage = newnames[1]
#if 'Chondrichthyes' in lineage:
# lineage = ';'.join(lineage[:16])
#else:
# lineage = ';'.join(lineage[:17])
#186625 cellular organisms;Eukaryota;Opisthokonta;Metazoa;Eumetazoa;Bilateria;Deuterostomia;Chordata;Craniata;Vertebrata;Gnathostomata;Teleostomi;Euteleostomi;Actinopterygii;Actinopteri;Neopterygii;Teleostei;Osteoglossocephalai;Clupeocephala Clupeocephala ;;
family_genus_s = l.rstrip().split('\t')[-1]
newlineage = lineage+';'+family_genus_s
last_one = ''
for i in newlineage.split(';'):
if i:
last_one = i
print(q + '\t' + lineage+';'+family_genus_s + '\t' + last_one + '\t%.2f\t%.0f'%(p, x) )
#print(f'{q}\t{newlist}\t{lastone}')
#os.remove("TEMP_TAXONS.txt")
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment