gregcaporaso · December 20, 2015 06:09
diff --git a/generate_usearch_cmds.py b/generate_usearch_cmds.py
 #!/usr/bin/env python

 from os.path import join

 query_fp = "/home/caporaso/analysis/short-read-tax-assignment/data/qiime-mock-community/S16S-2/rep_set.fna"
 reference_seqs_fp = "/data/gg_13_5_otus/rep_set/97_otus.fasta"
 reference_tax_fp = "/data/gg_13_5_otus/taxonomy/97_otu_taxonomy.txt"
 input_biom_fp = "/home/caporaso/analysis/short-read-tax-assignment/data/qiime-mock-community/S16S-2/otu_table_mc2_no_pynast_failures.biom"
 output_biom_fn = "otu_table_mc2_no_pynast_failures_w_taxa.biom"
 output_dir = "/home/caporaso/analysis/short-read-tax-assignment/demo/eval-demo/usearch_v_97/"
 uc_to_assignments_cmd = "python /home/caporaso/analysis/short-read-tax-assignment/demo/eval-demo/uc_to_assignments/uc_to_assignments.py"

 f = open('usearch_cmds.sh','w')
 f.write('#!/usr/bin/env bash\n')
 f.write('\n')

 for evalue in [1e0, 1e-2, 1e-4, 1e-6, 1e-8, 1e-10]:
    for queryfract in [0.25, 0.50, 0.75, 1.0]:
        for maxaccepts in [1, 10, 50, 100]:
            for confidence in [0.50, 0.75, 1.0]:
                run_dir = '%s/usearch_%e.%1.1e.%d.%1.1e/' % (output_dir, evalue, queryfract, maxaccepts, confidence)
                uc_fp = join(run_dir, 'results.uc')
                taxmap_fp = join(run_dir,'rep_set_tax_assignments.txt')
                output_biom_fp = join(run_dir, output_biom_fn) 
                mkdir_cmd = "mkdir %s" % run_dir
                uc_cmd = "usearch -query %s -db %s -evalue %f -uc %s -queryfract %f -maxaccepts %d" % (query_fp, reference_seqs_fp, evalue, uc_fp, queryfract, maxaccepts)
                taxmap_cmd = "%s -i %s -t %s -o %s -c %s" % (uc_to_assignments_cmd, uc_fp, reference_tax_fp, taxmap_fp, confidence)
                biom_cmd = "add_metadata.py -i %s --observation_mapping_fp %s -o %s --sc_separated taxonomy" % (input_biom_fp, taxmap_fp, output_biom_fp)
                summ_cmd = "summarize_taxa.py -i %s -o %s" % (output_biom_fp, run_dir)
                f.write('; '.join([mkdir_cmd, uc_cmd, taxmap_cmd, biom_cmd, summ_cmd]))
                f.write('\n')
 f.close()


diff --git a/test_uc_to_assignments.py b/test_uc_to_assignments.py
 #!/usr/bin/env python
 # File created on 25 Jul 2013
 from __future__ import division

 __author__ = "Greg Caporaso"
 __copyright__ = "Copyright 2011, The QIIME project"
 __credits__ = ["Greg Caporaso"]
 __license__ = "GPL"
 __version__ = "1.7.0-dev"
 __maintainer__ = "Greg Caporaso"
 __email__ = "[email protected]"
 __status__ = "Development"


 from shutil import rmtree
 from os.path import exists, join
 from cogent.util.unit_test import TestCase, main
 from cogent.util.misc import remove_files, create_dir
 from qiime.util import (get_qiime_temp_dir, 
                        get_tmp_filename)
 from qiime.test import initiate_timeout, disable_timeout
 from uc_to_assignments import (bl6_to_assignment,
                                bl6_to_assignments,
                                get_consensus_assignment,
                                parse_id_to_tax)

 class Bl6ToTaxTests(TestCase):
    
    def setUp(self):
        
        self.bl61 = bl61.split('\n')
        self.bl62 = bl62.split('\n')
        self.id_to_tax1 = parse_id_to_tax(id_to_tax1.split('\n'))

    def test_get_consensus_assignment(self):
        """ """
        in1 = [['Ab','Bc','De'],
               ['Ab','Bc','Fg','Hi'],
               ['Ab','Bc','Fg','Jk']]
        expected = [('Ab',1.0),
                    ('Bc',1.0),
                    ('Fg',2./3.),
                    (None,0),
                    (None,0),
                    (None,0),
                    (None,0)]
        self.assertEqual(get_consensus_assignment(in1,num_levels=7),
                         expected)

    def test_bl6_to_assignment_non_redundant(self):
        """ bl6_to_tax_assignments handles non-redundant data as expected """
        expected = {'q1':([('A',1.0),('F',1.0),('G',1.0),(None,0.0),(None,0.0),(None,0.0),(None,0.0)],2),
                    'q2':([('A',1.0),('H',1.0),('I',1.0),('J',1.0),(None,0.0),(None,0.0),(None,0.0)],1),
                    'q3':([('A',1.0),('H',1.0),('I',1.0),('J',1.0),(None,0.0),(None,0.0),(None,0.0)],3)}
        actual = bl6_to_assignment(self.bl61,self.id_to_tax1)
        self.assertEqual(actual,expected)

    def test_bl6_to_assignment_redundant(self):
        """ bl6_to_tax_assignments handles non-redundant data as expected """
        expected = {'q1':([('A',1.0),(None,0.0),(None,0.0),(None,0.0),(None,0.0),(None,0.0),(None,0.0)],2),
                    'q2':([('A',1.0),('H',1.0),('I',1.0),('J',1.0),(None,0.0),(None,0.0),(None,0.0)],1),
                    'q3':([('A',1.0),('H',1.0),('I',2./3.),('J',2./3.),(None,0.0),(None,0.0),(None,0.0)],3)}
        actual = bl6_to_assignment(self.bl62,self.id_to_tax1)
        self.assertEqual(actual,expected)
    
    def test_bl6_to_assignments(self):
        """ """
        expected = {'q1':[['A','F','G'],['A','B','C','D']],
                    'q2':[['A','H','I','J']],
                    'q3':[['A','H','I','J'],['A','H','K','L','M'],['A','H','I','J']]}
        actual = bl6_to_assignments(self.bl62,self.id_to_tax1)
        self.assertEqual(actual,expected)

 bl61 = """q1	r1	88.6	70	8	0	1055	1124	616	685	2e-19	86
 q1	r1	86.0	50	7	0	1182	1231	747	796	5.6e-10	55
 q2	r3	77.4	53	12	0	578	630	156	208	0.0026	33
 q3	r3	84.7	59	9	0	760	818	627	685	1.1e-11	60
 q3	r3	86.3	51	7	0	235	285	143	193	1.5e-10	57
 q3	r3	88.0	50	6	0	875	924	747	796	1.1e-11	60
 """

 bl62 = """q1	r1	88.6	70	8	0	1055	1124	616	685	2e-19	86
 q1	r2	86.0	50	7	0	1182	1231	747	796	5.6e-10	55
 q2	r3	77.4	53	12	0	578	630	156	208	0.0026	33
 q3	r3	84.7	59	9	0	760	818	627	685	1.1e-11	60
 q3	r5	86.3	51	7	0	235	285	143	193	1.5e-10	57
 q3	r6	88.0	50	6	0	875	924	747	796	1.1e-11	60
 """

 id_to_tax1 = """r1	A;F;G
 r2	A;B;C;D
 r3	A;H;I;J
 r4	A;B;C;E
 r5	A;H;K;L;M
 r6	A;H;I;J
 """




 if __name__ == "__main__":
    main()
diff --git a/uc_to_assignments.py b/uc_to_assignments.py
 #!/usr/bin/env python
 # File created on 25 Jul 2013
 from __future__ import division

 __author__ = "Greg Caporaso"
 __copyright__ = "Copyright 2013, The BiPy project"
 __credits__ = ["Greg Caporaso"]
 __license__ = "GPL"
 __version__ = "0.0.0"
 __maintainer__ = "Greg Caporaso"
 __email__ = "[email protected]"
 __status__ = "Development"

 from collections import Counter
 from qcli import (parse_command_line_parameters, 
                  make_option)

 script_info = {}
 script_info['brief_description'] = ""
 script_info['script_description'] = ""
 script_info['script_usage'] = []
 script_info['script_usage'].append(("","",""))
 script_info['output_description']= ""
 script_info['required_options'] = [
 make_option('-i','--input_fp',type="existing_filepath",help='the input uc or bl6 filepath'),
 make_option('-t','--taxonomy_map_fp',type="existing_filepath",help='the input id-to-taxonomy filepath'),
 make_option('-o','--output_fp',type="new_filepath",help='the output filepath'),
 make_option('-c','--min_confidence',type="float",
             help='the minimum acceptable confidence value to retain taxonomy at a particular level'),
 ]
 script_info['optional_options'] = []
 script_info['version'] = __version__

 def get_consensus_assignment(assignments,
                             num_levels=7,
                             min_fraction=0.5):
    assert num_levels > 0, "num_levels must be greater than zero"
    num_assignments = len(assignments)
    count = 1 / num_assignments
    results = []
    for level in range(num_levels):
        current_level_assignments = Counter()
        for e in assignments:
            try:
                current_level_assignment = e[level]
            except IndexError:
                pass
            else:
                current_level_assignments[e[level]] += count
        counts = [(v,k) for k, v in current_level_assignments.items() if v > min_fraction]
        counts.sort()
        try:
            max_frac, max_tax = counts[-1]
        except IndexError:
            max_frac, max_tax = 0.0, None
        results.append((max_tax,max_frac))
    return results
        

 def bl6_to_assignment(bl6,
                      id_to_tax,
                      num_levels=7):
    results = bl6_to_assignments(bl6,id_to_tax)
    for query_id, all_assignments in results.items():
        consensus_assignment = get_consensus_assignment(all_assignments)
        results[query_id] = consensus_assignment, len(all_assignments)
    return results

 def parse_id_to_tax(id_to_tax_f):
    """ """
    result = {}
    for line in id_to_tax_f:
        if line.startswith('#') or line == "":
            continue
        else:
            fields = line.split('\t')
            id_ = fields[0]
            tax = [e.strip() for e in fields[1].split(';')]
            result[id_] = tax
    return result

 def bl6_to_assignments(bl6,
                       id_to_tax):
    results = {}
    for line in bl6:
        line = line.strip()
        if line.startswith('#') or line == "":
            continue
        else:
            fields = line.split('\t')
            query_id = fields[0].split()[0]
            subject_id = fields[1].split()[0]
            tax = id_to_tax[subject_id]
            try:
                results[query_id].append(tax)
            except KeyError:
                results[query_id] = [tax]
    return results

 def uc_to_assignments(uc,
                      id_to_tax):
    results = {}
    for line in uc:
        line = line.strip()
        if line.startswith('#') or line == "":
            continue
        elif line.startswith('H'):
            fields = line.split('\t')
            query_id = fields[8].split()[0]
            subject_id = fields[9].split()[0]
            tax = id_to_tax[subject_id]
            try:
                results[query_id].append(tax)
            except KeyError:
                results[query_id] = [tax]
        elif line.startswith('N'):
            fields = line.split('\t')
            query_id = fields[8].split()[0]
            try:
                results[query_id].append(["None"])
            except KeyError:
                results[query_id] = [["None"]]
    return results

 def uc_to_assignment(uc,
                      id_to_tax,
                      num_levels=7):
    results = uc_to_assignments(uc,id_to_tax)
    for query_id, all_assignments in results.items():
        consensus_assignment = get_consensus_assignment(all_assignments)
        results[query_id] = consensus_assignment, len(all_assignments)
    return results

 def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)
    
    min_confidence = opts.min_confidence
    output_f = open(opts.output_fp,'w')
    
    if opts.input_fp.endswith('.bl6'):
        result = bl6_to_assignment(open(opts.input_fp,'U'),
                                   parse_id_to_tax(open(opts.taxonomy_map_fp,'U')))
    elif opts.input_fp.endswith('.uc'):
        result = uc_to_assignment(open(opts.input_fp,'U'),
                                   parse_id_to_tax(open(opts.taxonomy_map_fp,'U')))
    else:
        option_parser.error("--input_fp must end with either .uc or .bl6")
    
    output_f.write("#OTU ID\ttaxonomy\tconfidence\tn\n")
    for id_, r in result.items():
        output_confidence = 0.0
        output_tax = []
        for tax, conf in r[0]:
            if conf >= min_confidence:
                output_tax.append(tax)
                output_confidence = conf
            else:
                break
        output_tax = ';'.join(output_tax)
        output_f.write('\t'.join([id_,output_tax,str(output_confidence),str(r[1])]))
        output_f.write('\n')
    output_f.close()
    


 if __name__ == "__main__":
    main()
	#!/usr/bin/env python

	from os.path import join

	query_fp = "/home/caporaso/analysis/short-read-tax-assignment/data/qiime-mock-community/S16S-2/rep_set.fna"
	reference_seqs_fp = "/data/gg_13_5_otus/rep_set/97_otus.fasta"
	reference_tax_fp = "/data/gg_13_5_otus/taxonomy/97_otu_taxonomy.txt"
	input_biom_fp = "/home/caporaso/analysis/short-read-tax-assignment/data/qiime-mock-community/S16S-2/otu_table_mc2_no_pynast_failures.biom"
	output_biom_fn = "otu_table_mc2_no_pynast_failures_w_taxa.biom"
	output_dir = "/home/caporaso/analysis/short-read-tax-assignment/demo/eval-demo/usearch_v_97/"
	uc_to_assignments_cmd = "python /home/caporaso/analysis/short-read-tax-assignment/demo/eval-demo/uc_to_assignments/uc_to_assignments.py"

	f = open('usearch_cmds.sh','w')
	f.write('#!/usr/bin/env bash\n')
	f.write('\n')

	for evalue in [1e0, 1e-2, 1e-4, 1e-6, 1e-8, 1e-10]:
	for queryfract in [0.25, 0.50, 0.75, 1.0]:
	for maxaccepts in [1, 10, 50, 100]:
	for confidence in [0.50, 0.75, 1.0]:
	run_dir = '%s/usearch_%e.%1.1e.%d.%1.1e/' % (output_dir, evalue, queryfract, maxaccepts, confidence)
	uc_fp = join(run_dir, 'results.uc')
	taxmap_fp = join(run_dir,'rep_set_tax_assignments.txt')
	output_biom_fp = join(run_dir, output_biom_fn)
	mkdir_cmd = "mkdir %s" % run_dir
	uc_cmd = "usearch -query %s -db %s -evalue %f -uc %s -queryfract %f -maxaccepts %d" % (query_fp, reference_seqs_fp, evalue, uc_fp, queryfract, maxaccepts)
	taxmap_cmd = "%s -i %s -t %s -o %s -c %s" % (uc_to_assignments_cmd, uc_fp, reference_tax_fp, taxmap_fp, confidence)
	biom_cmd = "add_metadata.py -i %s --observation_mapping_fp %s -o %s --sc_separated taxonomy" % (input_biom_fp, taxmap_fp, output_biom_fp)
	summ_cmd = "summarize_taxa.py -i %s -o %s" % (output_biom_fp, run_dir)
	f.write('; '.join([mkdir_cmd, uc_cmd, taxmap_cmd, biom_cmd, summ_cmd]))
	f.write('\n')
	f.close()
	#!/usr/bin/env python
	# File created on 25 Jul 2013
	from __future__ import division

	__author__ = "Greg Caporaso"
	__copyright__ = "Copyright 2011, The QIIME project"
	__credits__ = ["Greg Caporaso"]
	__license__ = "GPL"
	__version__ = "1.7.0-dev"
	__maintainer__ = "Greg Caporaso"
	__email__ = "[email protected]"
	__status__ = "Development"


	from shutil import rmtree
	from os.path import exists, join
	from cogent.util.unit_test import TestCase, main
	from cogent.util.misc import remove_files, create_dir
	from qiime.util import (get_qiime_temp_dir,
	get_tmp_filename)
	from qiime.test import initiate_timeout, disable_timeout
	from uc_to_assignments import (bl6_to_assignment,
	bl6_to_assignments,
	get_consensus_assignment,
	parse_id_to_tax)

	class Bl6ToTaxTests(TestCase):

	def setUp(self):

	self.bl61 = bl61.split('\n')
	self.bl62 = bl62.split('\n')
	self.id_to_tax1 = parse_id_to_tax(id_to_tax1.split('\n'))

	def test_get_consensus_assignment(self):
	""" """
	in1 = [['Ab','Bc','De'],
	['Ab','Bc','Fg','Hi'],
	['Ab','Bc','Fg','Jk']]
	expected = [('Ab',1.0),
	('Bc',1.0),
	('Fg',2./3.),
	(None,0),
	(None,0),
	(None,0),
	(None,0)]
	self.assertEqual(get_consensus_assignment(in1,num_levels=7),
	expected)

	def test_bl6_to_assignment_non_redundant(self):
	""" bl6_to_tax_assignments handles non-redundant data as expected """
	expected = {'q1':([('A',1.0),('F',1.0),('G',1.0),(None,0.0),(None,0.0),(None,0.0),(None,0.0)],2),
	'q2':([('A',1.0),('H',1.0),('I',1.0),('J',1.0),(None,0.0),(None,0.0),(None,0.0)],1),
	'q3':([('A',1.0),('H',1.0),('I',1.0),('J',1.0),(None,0.0),(None,0.0),(None,0.0)],3)}
	actual = bl6_to_assignment(self.bl61,self.id_to_tax1)
	self.assertEqual(actual,expected)

	def test_bl6_to_assignment_redundant(self):
	""" bl6_to_tax_assignments handles non-redundant data as expected """
	expected = {'q1':([('A',1.0),(None,0.0),(None,0.0),(None,0.0),(None,0.0),(None,0.0),(None,0.0)],2),
	'q2':([('A',1.0),('H',1.0),('I',1.0),('J',1.0),(None,0.0),(None,0.0),(None,0.0)],1),
	'q3':([('A',1.0),('H',1.0),('I',2./3.),('J',2./3.),(None,0.0),(None,0.0),(None,0.0)],3)}
	actual = bl6_to_assignment(self.bl62,self.id_to_tax1)
	self.assertEqual(actual,expected)

	def test_bl6_to_assignments(self):
	""" """
	expected = {'q1':[['A','F','G'],['A','B','C','D']],
	'q2':[['A','H','I','J']],
	'q3':[['A','H','I','J'],['A','H','K','L','M'],['A','H','I','J']]}
	actual = bl6_to_assignments(self.bl62,self.id_to_tax1)
	self.assertEqual(actual,expected)

	bl61 = """q1 r1 88.6 70 8 0 1055 1124 616 685 2e-19 86
	q1 r1 86.0 50 7 0 1182 1231 747 796 5.6e-10 55
	q2 r3 77.4 53 12 0 578 630 156 208 0.0026 33
	q3 r3 84.7 59 9 0 760 818 627 685 1.1e-11 60
	q3 r3 86.3 51 7 0 235 285 143 193 1.5e-10 57
	q3 r3 88.0 50 6 0 875 924 747 796 1.1e-11 60
	"""

	bl62 = """q1 r1 88.6 70 8 0 1055 1124 616 685 2e-19 86
	q1 r2 86.0 50 7 0 1182 1231 747 796 5.6e-10 55
	q2 r3 77.4 53 12 0 578 630 156 208 0.0026 33
	q3 r3 84.7 59 9 0 760 818 627 685 1.1e-11 60
	q3 r5 86.3 51 7 0 235 285 143 193 1.5e-10 57
	q3 r6 88.0 50 6 0 875 924 747 796 1.1e-11 60
	"""

	id_to_tax1 = """r1 A;F;G
	r2 A;B;C;D
	r3 A;H;I;J
	r4 A;B;C;E
	r5 A;H;K;L;M
	r6 A;H;I;J
	"""




	if __name__ == "__main__":
	main()