Skip to content

Instantly share code, notes, and snippets.

@gregcaporaso
Last active August 29, 2015 13:59
Show Gist options
  • Save gregcaporaso/10506167 to your computer and use it in GitHub Desktop.
Save gregcaporaso/10506167 to your computer and use it in GitHub Desktop.
quick-and-dirty sortmerna-based consensus taxonomy assigner

This is a first pass at doing rRNA read assignment with sortmerna. This code is completely untested, but provides the framework for starting to test this as an alternative to QIIME's existing taxonomy assigners.

To use this code you'll need:

  • QIIME 1.8.0 or later
  • scikit-bio (latest GitHub version)
  • The sortmerna binaries in your PATH environment variable.
git clone https://gist.github.com/10506167.git smr-tax
mkdir tiny-test
python -c "from qiime.test import write_test_data ; write_test_data('./tiny-test')"
python smr-tax/assign-taxonomy-sortmerna.py -i tiny-test/seqs -r tiny-test/refseqs -t tiny-test/refseqs_tax -o assigned_tax

The resulting assigned_tax/assignments.tsv will contain the taxonomy assignments, assigned_tax/log.blast will contain the blast-formatted raw output of sortmerna, and assigned_tax/cmds.log will contain the exact sortmerna commands that were run.

By default, the sortmerna binary calls would look like:

indexdb_rna --ref tiny-test/refseqs,/Users/caporaso/temp/tmphbWyNB/ref.idx -v
sortmerna --ref tiny-test/refseqs,/Users/caporaso/temp/tmphbWyNB/ref.idx --reads tiny-test/seqs --blast 1 --aligned assigned_tax/log --min_lis 2 --best 3 -v

If you want to experiment with parameters that are not passed in those commands, you can use the corresponding --indexdb_params and --sortmerna_params options. For example:

python smr-tax/assign-taxonomy-sortmerna.py -i tiny-test/seqs -r tiny-test/refseqs -t tiny-test/refseqs_tax -o assigned_tax_ --sortmerna_params "-F" --indexdb_params "-m 4000 -L 19"
#!/usr/bin/env python
# File created on 11 Apr 2014
from __future__ import division
__author__ = "Greg Caporaso"
__copyright__ = "Copyright 2014, The QIIME Project"
__credits__ = ["Greg Caporaso"]
__license__ = "GPL"
__version__ = "1.8.0-dev"
__maintainer__ = "Greg Caporaso"
__email__ = "[email protected]"
from tempfile import mkdtemp
from collections import defaultdict, Counter
from os.path import join
from shutil import rmtree
from skbio.parse.sequences import parse_fasta
from qiime.util import (parse_command_line_parameters, make_option,
load_qiime_config, get_qiime_temp_dir,
qiime_system_call, create_dir, remove_files)
from qiime.assign_taxonomy import TaxonAssigner
# load the user's QIIME config so the reference sequence and id-to-taxonomy
# files can default to the site-wide settings
qiime_config = load_qiime_config()
default_reference_seqs_fp = qiime_config['assign_taxonomy_reference_seqs_fp']
default_id_to_taxonomy_fp = qiime_config['assign_taxonomy_id_to_taxonomy_fp']
# script_info drives QIIME's parse_command_line_parameters: it declares the
# CLI interface (required/optional options, usage examples, version).
script_info = {}
script_info['brief_description'] = ""
script_info['script_description'] = ""
# Members of the tuple in script_usage are (title, description, example call)
script_info['script_usage'] = [("", "", "")]
script_info['output_description'] = ""
script_info['required_options'] = [
    make_option('-i', '--input_query_fp', type='existing_filepath',
                help='the query sequences'),
    make_option('-o', '--output_dir', type='new_dirpath',
                help='directory to store output and log files')
]
script_info['optional_options'] = [
    make_option('-t', '--id_to_taxonomy_fp', type="existing_filepath",
                help='Path to tab-delimited file mapping sequences to their '
                     'taxonomy. Each assigned taxonomy is provided as a '
                     'semicolon-separated string. [default: %s]'
                     % default_id_to_taxonomy_fp,
                default=default_id_to_taxonomy_fp),
    make_option('-r', '--reference_seqs_fp', type="existing_filepath",
                help='Path to reference sequences. These '
                     'are indexed to create the reference database. '
                     '[default: %s; REQUIRED if -b is not provided]'
                     % default_reference_seqs_fp,
                default=default_reference_seqs_fp),
    make_option('-b', '--reference_seqs_idx_fp',
                help='Path to pre-indexed reference sequences. These '
                     'are indexed to create the reference database. '
                     '[default: computed on-the-fly]', default=None),
    # fixed typo in the two help strings below: "as passed" -> "are passed"
    make_option('--sortmerna_params', default=None,
                help='string of parameters to pass to sortmerna'
                     ' [default: no additional parameters are passed]'),
    make_option('--indexdb_params', default=None,
                help='string of parameters to pass to indexdb_rna'
                     ' [default: no additional parameters are passed]'),
    make_option('--min_consensus_fraction', default=0.51, type=float,
                help='Minimum fraction of database hits that must have a '
                     'specific taxonomic assignment to assign that taxonomy '
                     'to a query [default: %default]'),
    make_option('--min_percent_id', default=0.0, type=float,
                help='Minimum percent identity to consider an alignment '
                     'to be a hit [default: all alignments are considered hits]'),
    make_option('--best', default=None, type=int,
                help='sortmerna\'s --best parameter [default: %default]'),
    make_option('--num_alignments', default=None, type=int,
                help='sortmerna\'s --num_alignments parameter [default: %default]'),
    make_option('--min_lis', default=2, type=int,
                help='sortmerna\'s min_lis parameter [default: %default]'),
]
script_info['version'] = __version__
def call_cmd(cmd, HALT_EXEC):
    """ Run cmd (or just print it, if HALT_EXEC) and return it.

        cmd: the shell command to execute
        HALT_EXEC: if True, print the command and exit with status 0
         without running it (useful for debugging the exact calls)

        Exits the program with status 1, after printing the command's
        stdout and stderr, if the command fails.
    """
    if HALT_EXEC:
        print(cmd)
        exit(0)
    else:
        stdout, stderr, exit_status = qiime_system_call(cmd)
        if exit_status != 0:
            # report the command that actually failed -- the original
            # message always blamed indexdb_rna, even when the failing
            # command was sortmerna
            print("Command failed: %s\nSTDOUT\n%s\nSTDERR\n%s\n"
                  % (cmd, stdout, stderr))
            exit(1)
    return cmd
def sortmerna_indexdb(input_fp, output_fp, params="", HALT_EXEC=False):
    """ Build and run an indexdb_rna call to index a reference database.

        input_fp: fasta file of reference sequences to index
        output_fp: base path for the resulting index files
        params: extra command-line parameters appended verbatim
        HALT_EXEC: passed through to call_cmd (print-and-exit if True)
    """
    fragments = ["indexdb_rna --ref %s,%s" % (input_fp, output_fp),
                 "-v",
                 params]
    return call_cmd(" ".join(fragments), HALT_EXEC)
def sortmerna_map(query_fp, refseqs_fasta_fp, refseqs_index_fp, blast_output_fp,
                  min_lis=2, best=None, num_alignments=None, params="",
                  HALT_EXEC=False):
    """ Build and run a sortmerna call mapping queries to the reference db.

        Exactly one of best / num_alignments is expected; when best is
        given, min_lis is passed along with it (sortmerna requires it).
        Output is blast-formatted (--blast 1) and written under
        blast_output_fp.
    """
    extra = params
    if best is not None:
        extra = " ".join([extra, "--best", str(best),
                          "--min_lis", str(min_lis)])
    elif num_alignments is not None:
        extra = " ".join([extra, "--num_alignments", str(num_alignments)])
    fragments = ["sortmerna",
                 "--ref %s,%s" % (refseqs_fasta_fp, refseqs_index_fp),
                 "--reads %s" % query_fp,
                 "--blast 1",
                 "--aligned %s" % blast_output_fp,
                 "-v",
                 extra]
    return call_cmd(" ".join(fragments), HALT_EXEC)
def blast_to_tax_assignments(blast_output_f, id_to_taxonomy_map, min_percent_id=0.0):
    """ Map each query id to the taxonomies of its database hits.

        blast_output_f: iterable of blast-formatted (tab-separated) lines;
         fields used are query id (0), subject id (1), percent id (2)
        id_to_taxonomy_map: dict mapping subject id to a
         semicolon-separated taxonomy string
        min_percent_id: minimum percent identity for an alignment to
         count as a hit

        Returns a defaultdict mapping query id to a list of assignments,
        each assignment being a list of taxonomy levels.
    """
    result = defaultdict(list)
    for line in blast_output_f:
        fields = line.split('\t')
        query_id = fields[0]
        subject_id = fields[1]
        percent_id = float(fields[2])
        # use >= so an alignment at exactly min_percent_id counts as a hit,
        # matching the documented contract ("Minimum percent identity to
        # consider an alignment to be a hit"); the original strict >
        # excluded the boundary value
        if percent_id >= min_percent_id:
            subject_tax = id_to_taxonomy_map[subject_id]
            result[query_id].append(
                [level.strip() for level in subject_tax.split(';')])
    return result
def tax_assignments_to_consensus_assignments(query_to_assignments,
                                             min_consensus_fraction=0.51):
    """ Collapse each query's list of assignments to a consensus assignment.

        query_to_assignments: dict mapping query id to a list of taxonomy
         assignments (each a list of levels); mutated in place so each
         value becomes the consensus tuple from get_consensus_assignment
        min_consensus_fraction: minimum agreement required at each level

        Returns the (mutated) input dict.
    """
    # iterate over a snapshot of the items: .iteritems() is Python-2-only,
    # and reassigning values while iterating a live dict view is fragile
    for query_id, assignments in list(query_to_assignments.items()):
        # this call will get cleaned up
        query_to_assignments[query_id] = get_consensus_assignment(
            assignments, min_consensus_fraction=min_consensus_fraction)
    return query_to_assignments
def get_consensus_assignment(assignments, unassignable_label='unassigned',
                             min_consensus_fraction=0.51):
    """ compute the consensus assignment from a list of assignments

        assignments: list of taxonomy assignments, each a list of
         per-level taxon names (e.g., [['k__A', 'p__B'], ...])
        unassignable_label: taxonomy reported when no consensus exists
        min_consensus_fraction: minimum fraction of assignments that must
         agree at a level for that level to appear in the consensus

        Returns a tuple of (consensus taxonomy as a list of levels,
        consensus fraction at the deepest retained level, number of
        input assignments).

        This code was pulled almost exactly from QIIME's
        UclustConsensusTaxonAssigner._get_consensus_assignment method.
    """
    num_input_assignments = len(assignments)
    if num_input_assignments == 0:
        # an empty assignment list previously crashed below (min() of an
        # empty sequence, then division by zero); report it as
        # unassignable instead
        return ([unassignable_label], 1.0, 0)
    consensus_assignment = []
    # if the assignments don't all have the same number
    # of levels, the resulting assignment will have a max number
    # of levels equal to the number of levels in the assignment
    # with the fewest number of levels. this is to avoid
    # a case where, for example, there are n assignments, one of
    # which has 7 levels, and the other n-1 assignments have 6 levels.
    # A 7th level in the result would be misleading because it
    # would appear to the user as though it was the consensus
    # across all n assignments.
    num_levels = min([len(a) for a in assignments])
    # iterate over the assignment levels
    for level in range(num_levels):
        # count the different taxonomic assignments at the current level.
        # the counts are computed based on the current level and all higher
        # levels to reflect that, for example, 'p__A; c__B; o__C' and
        # 'p__X; c__Y; o__C' represent different taxa at the o__ level (since
        # they are different at the p__ and c__ levels).
        current_level_assignments = \
            Counter([tuple(e[:level + 1]) for e in assignments])
        # identify the most common taxonomic assignment, and compute the
        # fraction of assignments that contained it. it's safe to compute the
        # fraction using num_assignments because the deepest level we'll
        # ever look at here is num_levels (see above comment on how that
        # is decided).
        tax, max_count = current_level_assignments.most_common(1)[0]
        max_consensus_fraction = max_count / num_input_assignments
        # check whether the most common taxonomic assignment is observed
        # in at least min_consensus_fraction of the sequences
        if max_consensus_fraction >= min_consensus_fraction:
            # if so, append the current level only (e.g., 'o__C' if tax is
            # 'p__A; c__B; o__C'), and continue on to the next level
            consensus_assignment.append((tax[-1], max_consensus_fraction))
        else:
            # if not, there is no assignment at this level, and we're
            # done iterating over levels
            break
    # construct the results
    # determine the number of levels in the consensus assignment
    consensus_assignment_depth = len(consensus_assignment)
    if consensus_assignment_depth > 0:
        # if it's greater than 0, generate a list of the
        # taxa assignments at each level
        assignment_result = [a[0] for a in consensus_assignment]
        # and assign the consensus_fraction_result as the
        # consensus fraction at the deepest level
        consensus_fraction_result = \
            consensus_assignment[consensus_assignment_depth - 1][1]
    else:
        # if there are zero assignments, indicate that the taxa is
        # unknown
        assignment_result = [unassignable_label]
        # and assign the consensus_fraction_result to 1.0 (this is
        # somewhat arbitrary, but could be interpreted as all of the
        # assignments suggest an unknown taxonomy)
        consensus_fraction_result = 1.0
    return (
        assignment_result, consensus_fraction_result, num_input_assignments
    )
def main():
    """ Assign taxonomy to query sequences via sortmerna + consensus calling.

        Indexes the reference db (unless a pre-built index is given), maps
        queries with sortmerna, collapses the blast-formatted hits to
        consensus taxonomy assignments, and writes assignments.tsv plus a
        log of the exact commands run to the output directory.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    input_query_fp = opts.input_query_fp
    id_to_taxonomy_fp = opts.id_to_taxonomy_fp
    reference_seqs_fp = opts.reference_seqs_fp
    reference_seqs_idx_fp = opts.reference_seqs_idx_fp
    output_dir = opts.output_dir
    indexdb_params = opts.indexdb_params or ""
    sortmerna_params = opts.sortmerna_params or ""
    min_lis = opts.min_lis
    best = opts.best
    num_alignments = opts.num_alignments
    min_consensus_fraction = opts.min_consensus_fraction
    min_percent_id = opts.min_percent_id

    # exactly one of --best / --num_alignments must be provided
    if (best is None) and (num_alignments is None):
        option_parser.error("Either --best or --num_alignments must be passed.")
    elif (best is not None) and (num_alignments is not None):
        option_parser.error("--best and --num_alignments cannot both be passed.")

    create_dir(output_dir)
    dirs_to_remove = []
    qiime_temp_dir = get_qiime_temp_dir()

    output_fp = join(output_dir, 'assignments.tsv')
    command_log_fp = join(output_dir, 'cmds.log')
    output_f = open(output_fp, 'w')
    command_log_f = open(command_log_fp, 'w')

    id_to_taxonomy_f = open(id_to_taxonomy_fp, 'U')
    id_to_taxonomy_map = \
        TaxonAssigner._parse_id_to_taxonomy_file(id_to_taxonomy_f)
    id_to_taxonomy_f.close()

    if reference_seqs_idx_fp is None:
        # no pre-built index was provided: index the reference sequences
        # into a temp dir that is removed at the end of the run
        index_dir = mkdtemp(dir=qiime_temp_dir)
        create_dir(index_dir)  # mkdtemp already created it; harmless no-op
        dirs_to_remove.append(index_dir)
        reference_seqs_idx_fp = join(index_dir, 'ref.idx')
        cmd = sortmerna_indexdb(reference_seqs_fp, reference_seqs_idx_fp,
                                params=indexdb_params)
        command_log_f.write(cmd)
        command_log_f.write('\n')

    # sortmerna appends the '.blast' extension to the --aligned basename
    blast_output_basename = join(output_dir, 'log')
    blast_output_fp = '.'.join([blast_output_basename, 'blast'])
    cmd = sortmerna_map(input_query_fp, reference_seqs_fp,
                        reference_seqs_idx_fp, blast_output_basename,
                        min_lis=min_lis, best=best,
                        num_alignments=num_alignments,
                        params=sortmerna_params)
    command_log_f.write(cmd)
    command_log_f.write('\n')

    # close the blast output handle when done (it was leaked previously)
    blast_output_f = open(blast_output_fp)
    query_to_assignments = blast_to_tax_assignments(
        blast_output_f, id_to_taxonomy_map, min_percent_id=min_percent_id)
    blast_output_f.close()
    results = tax_assignments_to_consensus_assignments(
        query_to_assignments, min_consensus_fraction=min_consensus_fraction)

    output_f.write('#OTU ID\ttaxonomy\tconfidence\tnum hits\n')
    # this is ugly... we need results for all input sequences, but if there
    # are no hits from sortmerna for a sequence we won't have it in results.
    # we'll want to find a nicer solution than having to iterate over the input
    # file. maybe sortmerna can create a failures file...
    query_f = open(input_query_fp)
    for query_id, _ in parse_fasta(query_f):
        query_id = query_id.split()[0]
        assignment = results[query_id]
        if len(assignment) == 0:
            assignment = [['Unassigned'], 0.0, 0]
        output_f.write('\t'.join([query_id, '; '.join(assignment[0]),
                                  str(assignment[1]), str(assignment[2])]))
        output_f.write('\n')
    query_f.close()

    # clean up time...
    output_f.close()
    command_log_f.close()
    # explicit loop rather than map(): under Python 3 map() is lazy, so the
    # temporary index directories would never actually be removed
    for dir_to_remove in dirs_to_remove:
        rmtree(dir_to_remove)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment