Skip to content

Instantly share code, notes, and snippets.

@gregcaporaso
Last active December 20, 2015 06:09
Show Gist options
  • Save gregcaporaso/6083538 to your computer and use it in GitHub Desktop.
Save gregcaporaso/6083538 to your computer and use it in GitHub Desktop.
code for converting blast "bl6" file to assignments (e.g., functional, taxonomic, etc).
#!/usr/bin/env python
from os.path import join
query_fp = "/home/caporaso/analysis/short-read-tax-assignment/data/qiime-mock-community/S16S-2/rep_set.fna"
reference_seqs_fp = "/data/gg_13_5_otus/rep_set/97_otus.fasta"
reference_tax_fp = "/data/gg_13_5_otus/taxonomy/97_otu_taxonomy.txt"
input_biom_fp = "/home/caporaso/analysis/short-read-tax-assignment/data/qiime-mock-community/S16S-2/otu_table_mc2_no_pynast_failures.biom"
output_biom_fn = "otu_table_mc2_no_pynast_failures_w_taxa.biom"
output_dir = "/home/caporaso/analysis/short-read-tax-assignment/demo/eval-demo/usearch_v_97/"
uc_to_assignments_cmd = "python /home/caporaso/analysis/short-read-tax-assignment/demo/eval-demo/uc_to_assignments/uc_to_assignments.py"
f = open('usearch_cmds.sh','w')
f.write('#!/usr/bin/env bash\n')
f.write('\n')
for evalue in [1e0, 1e-2, 1e-4, 1e-6, 1e-8, 1e-10]:
for queryfract in [0.25, 0.50, 0.75, 1.0]:
for maxaccepts in [1, 10, 50, 100]:
for confidence in [0.50, 0.75, 1.0]:
run_dir = '%s/usearch_%e.%1.1e.%d.%1.1e/' % (output_dir, evalue, queryfract, maxaccepts, confidence)
uc_fp = join(run_dir, 'results.uc')
taxmap_fp = join(run_dir,'rep_set_tax_assignments.txt')
output_biom_fp = join(run_dir, output_biom_fn)
mkdir_cmd = "mkdir %s" % run_dir
uc_cmd = "usearch -query %s -db %s -evalue %f -uc %s -queryfract %f -maxaccepts %d" % (query_fp, reference_seqs_fp, evalue, uc_fp, queryfract, maxaccepts)
taxmap_cmd = "%s -i %s -t %s -o %s -c %s" % (uc_to_assignments_cmd, uc_fp, reference_tax_fp, taxmap_fp, confidence)
biom_cmd = "add_metadata.py -i %s --observation_mapping_fp %s -o %s --sc_separated taxonomy" % (input_biom_fp, taxmap_fp, output_biom_fp)
summ_cmd = "summarize_taxa.py -i %s -o %s" % (output_biom_fp, run_dir)
f.write('; '.join([mkdir_cmd, uc_cmd, taxmap_cmd, biom_cmd, summ_cmd]))
f.write('\n')
f.close()
#!/usr/bin/env python
# File created on 25 Jul 2013
from __future__ import division
__author__ = "Greg Caporaso"
__copyright__ = "Copyright 2011, The QIIME project"
__credits__ = ["Greg Caporaso"]
__license__ = "GPL"
__version__ = "1.7.0-dev"
__maintainer__ = "Greg Caporaso"
__email__ = "[email protected]"
__status__ = "Development"
from shutil import rmtree
from os.path import exists, join
from cogent.util.unit_test import TestCase, main
from cogent.util.misc import remove_files, create_dir
from qiime.util import (get_qiime_temp_dir,
get_tmp_filename)
from qiime.test import initiate_timeout, disable_timeout
from uc_to_assignments import (bl6_to_assignment,
bl6_to_assignments,
get_consensus_assignment,
parse_id_to_tax)
class Bl6ToTaxTests(TestCase):
def setUp(self):
self.bl61 = bl61.split('\n')
self.bl62 = bl62.split('\n')
self.id_to_tax1 = parse_id_to_tax(id_to_tax1.split('\n'))
def test_get_consensus_assignment(self):
""" """
in1 = [['Ab','Bc','De'],
['Ab','Bc','Fg','Hi'],
['Ab','Bc','Fg','Jk']]
expected = [('Ab',1.0),
('Bc',1.0),
('Fg',2./3.),
(None,0),
(None,0),
(None,0),
(None,0)]
self.assertEqual(get_consensus_assignment(in1,num_levels=7),
expected)
def test_bl6_to_assignment_non_redundant(self):
""" bl6_to_tax_assignments handles non-redundant data as expected """
expected = {'q1':([('A',1.0),('F',1.0),('G',1.0),(None,0.0),(None,0.0),(None,0.0),(None,0.0)],2),
'q2':([('A',1.0),('H',1.0),('I',1.0),('J',1.0),(None,0.0),(None,0.0),(None,0.0)],1),
'q3':([('A',1.0),('H',1.0),('I',1.0),('J',1.0),(None,0.0),(None,0.0),(None,0.0)],3)}
actual = bl6_to_assignment(self.bl61,self.id_to_tax1)
self.assertEqual(actual,expected)
def test_bl6_to_assignment_redundant(self):
""" bl6_to_tax_assignments handles non-redundant data as expected """
expected = {'q1':([('A',1.0),(None,0.0),(None,0.0),(None,0.0),(None,0.0),(None,0.0),(None,0.0)],2),
'q2':([('A',1.0),('H',1.0),('I',1.0),('J',1.0),(None,0.0),(None,0.0),(None,0.0)],1),
'q3':([('A',1.0),('H',1.0),('I',2./3.),('J',2./3.),(None,0.0),(None,0.0),(None,0.0)],3)}
actual = bl6_to_assignment(self.bl62,self.id_to_tax1)
self.assertEqual(actual,expected)
def test_bl6_to_assignments(self):
""" """
expected = {'q1':[['A','F','G'],['A','B','C','D']],
'q2':[['A','H','I','J']],
'q3':[['A','H','I','J'],['A','H','K','L','M'],['A','H','I','J']]}
actual = bl6_to_assignments(self.bl62,self.id_to_tax1)
self.assertEqual(actual,expected)
bl61 = """q1 r1 88.6 70 8 0 1055 1124 616 685 2e-19 86
q1 r1 86.0 50 7 0 1182 1231 747 796 5.6e-10 55
q2 r3 77.4 53 12 0 578 630 156 208 0.0026 33
q3 r3 84.7 59 9 0 760 818 627 685 1.1e-11 60
q3 r3 86.3 51 7 0 235 285 143 193 1.5e-10 57
q3 r3 88.0 50 6 0 875 924 747 796 1.1e-11 60
"""
bl62 = """q1 r1 88.6 70 8 0 1055 1124 616 685 2e-19 86
q1 r2 86.0 50 7 0 1182 1231 747 796 5.6e-10 55
q2 r3 77.4 53 12 0 578 630 156 208 0.0026 33
q3 r3 84.7 59 9 0 760 818 627 685 1.1e-11 60
q3 r5 86.3 51 7 0 235 285 143 193 1.5e-10 57
q3 r6 88.0 50 6 0 875 924 747 796 1.1e-11 60
"""
id_to_tax1 = """r1 A;F;G
r2 A;B;C;D
r3 A;H;I;J
r4 A;B;C;E
r5 A;H;K;L;M
r6 A;H;I;J
"""
if __name__ == "__main__":
main()
#!/usr/bin/env python
# File created on 25 Jul 2013
from __future__ import division
__author__ = "Greg Caporaso"
__copyright__ = "Copyright 2013, The BiPy project"
__credits__ = ["Greg Caporaso"]
__license__ = "GPL"
__version__ = "0.0.0"
__maintainer__ = "Greg Caporaso"
__email__ = "[email protected]"
__status__ = "Development"
from collections import Counter
from qcli import (parse_command_line_parameters,
make_option)
script_info = {}
script_info['brief_description'] = ""
script_info['script_description'] = ""
script_info['script_usage'] = []
script_info['script_usage'].append(("","",""))
script_info['output_description']= ""
script_info['required_options'] = [
make_option('-i','--input_fp',type="existing_filepath",help='the input uc or bl6 filepath'),
make_option('-t','--taxonomy_map_fp',type="existing_filepath",help='the input id-to-taxonomy filepath'),
make_option('-o','--output_fp',type="new_filepath",help='the output filepath'),
make_option('-c','--min_confidence',type="float",
help='the minimum acceptable confidence value to retain taxonomy at a particular level'),
]
script_info['optional_options'] = []
script_info['version'] = __version__
def get_consensus_assignment(assignments,
num_levels=7,
min_fraction=0.5):
assert num_levels > 0, "num_levels must be greater than zero"
num_assignments = len(assignments)
count = 1 / num_assignments
results = []
for level in range(num_levels):
current_level_assignments = Counter()
for e in assignments:
try:
current_level_assignment = e[level]
except IndexError:
pass
else:
current_level_assignments[e[level]] += count
counts = [(v,k) for k, v in current_level_assignments.items() if v > min_fraction]
counts.sort()
try:
max_frac, max_tax = counts[-1]
except IndexError:
max_frac, max_tax = 0.0, None
results.append((max_tax,max_frac))
return results
def bl6_to_assignment(bl6,
id_to_tax,
num_levels=7):
results = bl6_to_assignments(bl6,id_to_tax)
for query_id, all_assignments in results.items():
consensus_assignment = get_consensus_assignment(all_assignments)
results[query_id] = consensus_assignment, len(all_assignments)
return results
def parse_id_to_tax(id_to_tax_f):
""" """
result = {}
for line in id_to_tax_f:
if line.startswith('#') or line == "":
continue
else:
fields = line.split('\t')
id_ = fields[0]
tax = [e.strip() for e in fields[1].split(';')]
result[id_] = tax
return result
def bl6_to_assignments(bl6,
id_to_tax):
results = {}
for line in bl6:
line = line.strip()
if line.startswith('#') or line == "":
continue
else:
fields = line.split('\t')
query_id = fields[0].split()[0]
subject_id = fields[1].split()[0]
tax = id_to_tax[subject_id]
try:
results[query_id].append(tax)
except KeyError:
results[query_id] = [tax]
return results
def uc_to_assignments(uc,
id_to_tax):
results = {}
for line in uc:
line = line.strip()
if line.startswith('#') or line == "":
continue
elif line.startswith('H'):
fields = line.split('\t')
query_id = fields[8].split()[0]
subject_id = fields[9].split()[0]
tax = id_to_tax[subject_id]
try:
results[query_id].append(tax)
except KeyError:
results[query_id] = [tax]
elif line.startswith('N'):
fields = line.split('\t')
query_id = fields[8].split()[0]
try:
results[query_id].append(["None"])
except KeyError:
results[query_id] = [["None"]]
return results
def uc_to_assignment(uc,
id_to_tax,
num_levels=7):
results = uc_to_assignments(uc,id_to_tax)
for query_id, all_assignments in results.items():
consensus_assignment = get_consensus_assignment(all_assignments)
results[query_id] = consensus_assignment, len(all_assignments)
return results
def main():
option_parser, opts, args =\
parse_command_line_parameters(**script_info)
min_confidence = opts.min_confidence
output_f = open(opts.output_fp,'w')
if opts.input_fp.endswith('.bl6'):
result = bl6_to_assignment(open(opts.input_fp,'U'),
parse_id_to_tax(open(opts.taxonomy_map_fp,'U')))
elif opts.input_fp.endswith('.uc'):
result = uc_to_assignment(open(opts.input_fp,'U'),
parse_id_to_tax(open(opts.taxonomy_map_fp,'U')))
else:
option_parser.error("--input_fp must end with either .uc or .bl6")
output_f.write("#OTU ID\ttaxonomy\tconfidence\tn\n")
for id_, r in result.items():
output_confidence = 0.0
output_tax = []
for tax, conf in r[0]:
if conf >= min_confidence:
output_tax.append(tax)
output_confidence = conf
else:
break
output_tax = ';'.join(output_tax)
output_f.write('\t'.join([id_,output_tax,str(output_confidence),str(r[1])]))
output_f.write('\n')
output_f.close()
if __name__ == "__main__":
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment