Last active
December 20, 2015 06:09
-
-
Save gregcaporaso/6083538 to your computer and use it in GitHub Desktop.
code for converting blast "bl6" file to assignments (e.g., functional, taxonomic, etc).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
from os.path import join | |
query_fp = "/home/caporaso/analysis/short-read-tax-assignment/data/qiime-mock-community/S16S-2/rep_set.fna" | |
reference_seqs_fp = "/data/gg_13_5_otus/rep_set/97_otus.fasta" | |
reference_tax_fp = "/data/gg_13_5_otus/taxonomy/97_otu_taxonomy.txt" | |
input_biom_fp = "/home/caporaso/analysis/short-read-tax-assignment/data/qiime-mock-community/S16S-2/otu_table_mc2_no_pynast_failures.biom" | |
output_biom_fn = "otu_table_mc2_no_pynast_failures_w_taxa.biom" | |
output_dir = "/home/caporaso/analysis/short-read-tax-assignment/demo/eval-demo/usearch_v_97/" | |
uc_to_assignments_cmd = "python /home/caporaso/analysis/short-read-tax-assignment/demo/eval-demo/uc_to_assignments/uc_to_assignments.py" | |
f = open('usearch_cmds.sh','w') | |
f.write('#!/usr/bin/env bash\n') | |
f.write('\n') | |
for evalue in [1e0, 1e-2, 1e-4, 1e-6, 1e-8, 1e-10]: | |
for queryfract in [0.25, 0.50, 0.75, 1.0]: | |
for maxaccepts in [1, 10, 50, 100]: | |
for confidence in [0.50, 0.75, 1.0]: | |
run_dir = '%s/usearch_%e.%1.1e.%d.%1.1e/' % (output_dir, evalue, queryfract, maxaccepts, confidence) | |
uc_fp = join(run_dir, 'results.uc') | |
taxmap_fp = join(run_dir,'rep_set_tax_assignments.txt') | |
output_biom_fp = join(run_dir, output_biom_fn) | |
mkdir_cmd = "mkdir %s" % run_dir | |
uc_cmd = "usearch -query %s -db %s -evalue %f -uc %s -queryfract %f -maxaccepts %d" % (query_fp, reference_seqs_fp, evalue, uc_fp, queryfract, maxaccepts) | |
taxmap_cmd = "%s -i %s -t %s -o %s -c %s" % (uc_to_assignments_cmd, uc_fp, reference_tax_fp, taxmap_fp, confidence) | |
biom_cmd = "add_metadata.py -i %s --observation_mapping_fp %s -o %s --sc_separated taxonomy" % (input_biom_fp, taxmap_fp, output_biom_fp) | |
summ_cmd = "summarize_taxa.py -i %s -o %s" % (output_biom_fp, run_dir) | |
f.write('; '.join([mkdir_cmd, uc_cmd, taxmap_cmd, biom_cmd, summ_cmd])) | |
f.write('\n') | |
f.close() | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# File created on 25 Jul 2013 | |
from __future__ import division | |
__author__ = "Greg Caporaso" | |
__copyright__ = "Copyright 2011, The QIIME project" | |
__credits__ = ["Greg Caporaso"] | |
__license__ = "GPL" | |
__version__ = "1.7.0-dev" | |
__maintainer__ = "Greg Caporaso" | |
__email__ = "[email protected]" | |
__status__ = "Development" | |
from shutil import rmtree | |
from os.path import exists, join | |
from cogent.util.unit_test import TestCase, main | |
from cogent.util.misc import remove_files, create_dir | |
from qiime.util import (get_qiime_temp_dir, | |
get_tmp_filename) | |
from qiime.test import initiate_timeout, disable_timeout | |
from uc_to_assignments import (bl6_to_assignment, | |
bl6_to_assignments, | |
get_consensus_assignment, | |
parse_id_to_tax) | |
class Bl6ToTaxTests(TestCase): | |
def setUp(self): | |
self.bl61 = bl61.split('\n') | |
self.bl62 = bl62.split('\n') | |
self.id_to_tax1 = parse_id_to_tax(id_to_tax1.split('\n')) | |
def test_get_consensus_assignment(self): | |
""" """ | |
in1 = [['Ab','Bc','De'], | |
['Ab','Bc','Fg','Hi'], | |
['Ab','Bc','Fg','Jk']] | |
expected = [('Ab',1.0), | |
('Bc',1.0), | |
('Fg',2./3.), | |
(None,0), | |
(None,0), | |
(None,0), | |
(None,0)] | |
self.assertEqual(get_consensus_assignment(in1,num_levels=7), | |
expected) | |
def test_bl6_to_assignment_non_redundant(self): | |
""" bl6_to_tax_assignments handles non-redundant data as expected """ | |
expected = {'q1':([('A',1.0),('F',1.0),('G',1.0),(None,0.0),(None,0.0),(None,0.0),(None,0.0)],2), | |
'q2':([('A',1.0),('H',1.0),('I',1.0),('J',1.0),(None,0.0),(None,0.0),(None,0.0)],1), | |
'q3':([('A',1.0),('H',1.0),('I',1.0),('J',1.0),(None,0.0),(None,0.0),(None,0.0)],3)} | |
actual = bl6_to_assignment(self.bl61,self.id_to_tax1) | |
self.assertEqual(actual,expected) | |
def test_bl6_to_assignment_redundant(self): | |
""" bl6_to_tax_assignments handles non-redundant data as expected """ | |
expected = {'q1':([('A',1.0),(None,0.0),(None,0.0),(None,0.0),(None,0.0),(None,0.0),(None,0.0)],2), | |
'q2':([('A',1.0),('H',1.0),('I',1.0),('J',1.0),(None,0.0),(None,0.0),(None,0.0)],1), | |
'q3':([('A',1.0),('H',1.0),('I',2./3.),('J',2./3.),(None,0.0),(None,0.0),(None,0.0)],3)} | |
actual = bl6_to_assignment(self.bl62,self.id_to_tax1) | |
self.assertEqual(actual,expected) | |
def test_bl6_to_assignments(self): | |
""" """ | |
expected = {'q1':[['A','F','G'],['A','B','C','D']], | |
'q2':[['A','H','I','J']], | |
'q3':[['A','H','I','J'],['A','H','K','L','M'],['A','H','I','J']]} | |
actual = bl6_to_assignments(self.bl62,self.id_to_tax1) | |
self.assertEqual(actual,expected) | |
bl61 = """q1 r1 88.6 70 8 0 1055 1124 616 685 2e-19 86 | |
q1 r1 86.0 50 7 0 1182 1231 747 796 5.6e-10 55 | |
q2 r3 77.4 53 12 0 578 630 156 208 0.0026 33 | |
q3 r3 84.7 59 9 0 760 818 627 685 1.1e-11 60 | |
q3 r3 86.3 51 7 0 235 285 143 193 1.5e-10 57 | |
q3 r3 88.0 50 6 0 875 924 747 796 1.1e-11 60 | |
""" | |
bl62 = """q1 r1 88.6 70 8 0 1055 1124 616 685 2e-19 86 | |
q1 r2 86.0 50 7 0 1182 1231 747 796 5.6e-10 55 | |
q2 r3 77.4 53 12 0 578 630 156 208 0.0026 33 | |
q3 r3 84.7 59 9 0 760 818 627 685 1.1e-11 60 | |
q3 r5 86.3 51 7 0 235 285 143 193 1.5e-10 57 | |
q3 r6 88.0 50 6 0 875 924 747 796 1.1e-11 60 | |
""" | |
id_to_tax1 = """r1 A;F;G | |
r2 A;B;C;D | |
r3 A;H;I;J | |
r4 A;B;C;E | |
r5 A;H;K;L;M | |
r6 A;H;I;J | |
""" | |
if __name__ == "__main__": | |
main() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# File created on 25 Jul 2013 | |
from __future__ import division | |
__author__ = "Greg Caporaso" | |
__copyright__ = "Copyright 2013, The BiPy project" | |
__credits__ = ["Greg Caporaso"] | |
__license__ = "GPL" | |
__version__ = "0.0.0" | |
__maintainer__ = "Greg Caporaso" | |
__email__ = "[email protected]" | |
__status__ = "Development" | |
from collections import Counter | |
from qcli import (parse_command_line_parameters, | |
make_option) | |
script_info = {} | |
script_info['brief_description'] = "" | |
script_info['script_description'] = "" | |
script_info['script_usage'] = [] | |
script_info['script_usage'].append(("","","")) | |
script_info['output_description']= "" | |
script_info['required_options'] = [ | |
make_option('-i','--input_fp',type="existing_filepath",help='the input uc or bl6 filepath'), | |
make_option('-t','--taxonomy_map_fp',type="existing_filepath",help='the input id-to-taxonomy filepath'), | |
make_option('-o','--output_fp',type="new_filepath",help='the output filepath'), | |
make_option('-c','--min_confidence',type="float", | |
help='the minimum acceptable confidence value to retain taxonomy at a particular level'), | |
] | |
script_info['optional_options'] = [] | |
script_info['version'] = __version__ | |
def get_consensus_assignment(assignments, | |
num_levels=7, | |
min_fraction=0.5): | |
assert num_levels > 0, "num_levels must be greater than zero" | |
num_assignments = len(assignments) | |
count = 1 / num_assignments | |
results = [] | |
for level in range(num_levels): | |
current_level_assignments = Counter() | |
for e in assignments: | |
try: | |
current_level_assignment = e[level] | |
except IndexError: | |
pass | |
else: | |
current_level_assignments[e[level]] += count | |
counts = [(v,k) for k, v in current_level_assignments.items() if v > min_fraction] | |
counts.sort() | |
try: | |
max_frac, max_tax = counts[-1] | |
except IndexError: | |
max_frac, max_tax = 0.0, None | |
results.append((max_tax,max_frac)) | |
return results | |
def bl6_to_assignment(bl6, | |
id_to_tax, | |
num_levels=7): | |
results = bl6_to_assignments(bl6,id_to_tax) | |
for query_id, all_assignments in results.items(): | |
consensus_assignment = get_consensus_assignment(all_assignments) | |
results[query_id] = consensus_assignment, len(all_assignments) | |
return results | |
def parse_id_to_tax(id_to_tax_f): | |
""" """ | |
result = {} | |
for line in id_to_tax_f: | |
if line.startswith('#') or line == "": | |
continue | |
else: | |
fields = line.split('\t') | |
id_ = fields[0] | |
tax = [e.strip() for e in fields[1].split(';')] | |
result[id_] = tax | |
return result | |
def bl6_to_assignments(bl6, | |
id_to_tax): | |
results = {} | |
for line in bl6: | |
line = line.strip() | |
if line.startswith('#') or line == "": | |
continue | |
else: | |
fields = line.split('\t') | |
query_id = fields[0].split()[0] | |
subject_id = fields[1].split()[0] | |
tax = id_to_tax[subject_id] | |
try: | |
results[query_id].append(tax) | |
except KeyError: | |
results[query_id] = [tax] | |
return results | |
def uc_to_assignments(uc, | |
id_to_tax): | |
results = {} | |
for line in uc: | |
line = line.strip() | |
if line.startswith('#') or line == "": | |
continue | |
elif line.startswith('H'): | |
fields = line.split('\t') | |
query_id = fields[8].split()[0] | |
subject_id = fields[9].split()[0] | |
tax = id_to_tax[subject_id] | |
try: | |
results[query_id].append(tax) | |
except KeyError: | |
results[query_id] = [tax] | |
elif line.startswith('N'): | |
fields = line.split('\t') | |
query_id = fields[8].split()[0] | |
try: | |
results[query_id].append(["None"]) | |
except KeyError: | |
results[query_id] = [["None"]] | |
return results | |
def uc_to_assignment(uc, | |
id_to_tax, | |
num_levels=7): | |
results = uc_to_assignments(uc,id_to_tax) | |
for query_id, all_assignments in results.items(): | |
consensus_assignment = get_consensus_assignment(all_assignments) | |
results[query_id] = consensus_assignment, len(all_assignments) | |
return results | |
def main(): | |
option_parser, opts, args =\ | |
parse_command_line_parameters(**script_info) | |
min_confidence = opts.min_confidence | |
output_f = open(opts.output_fp,'w') | |
if opts.input_fp.endswith('.bl6'): | |
result = bl6_to_assignment(open(opts.input_fp,'U'), | |
parse_id_to_tax(open(opts.taxonomy_map_fp,'U'))) | |
elif opts.input_fp.endswith('.uc'): | |
result = uc_to_assignment(open(opts.input_fp,'U'), | |
parse_id_to_tax(open(opts.taxonomy_map_fp,'U'))) | |
else: | |
option_parser.error("--input_fp must end with either .uc or .bl6") | |
output_f.write("#OTU ID\ttaxonomy\tconfidence\tn\n") | |
for id_, r in result.items(): | |
output_confidence = 0.0 | |
output_tax = [] | |
for tax, conf in r[0]: | |
if conf >= min_confidence: | |
output_tax.append(tax) | |
output_confidence = conf | |
else: | |
break | |
output_tax = ';'.join(output_tax) | |
output_f.write('\t'.join([id_,output_tax,str(output_confidence),str(r[1])])) | |
output_f.write('\n') | |
output_f.close() | |
if __name__ == "__main__": | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment