Skip to content

Instantly share code, notes, and snippets.

@gregcaporaso
Last active August 29, 2015 13:59
Show Gist options
  • Save gregcaporaso/10506167 to your computer and use it in GitHub Desktop.
Save gregcaporaso/10506167 to your computer and use it in GitHub Desktop.
quick-and-dirty sortmerna-based consensus taxonomy assigner

This is a first pass at doing rRNA read assignment with sortmerna. This code is completely untested, but provides the framework for starting to test this as an alternative to QIIME's existing taxonomy assigners.

To use this code you'll need:

  • QIIME 1.8.0 or later
  • scikit-bio (latest GitHub version)
  • The sortmerna binaries in your PATH environment variable.
git clone https://gist.github.com/10506167.git smr-tax
mkdir tiny-test
python -c "from qiime.test import write_test_data ; write_test_data('./tiny-test')"
python smr-tax/assign-taxonomy-sortmerna.py -i tiny-test/seqs -r tiny-test/refseqs -t tiny-test/refseqs_tax -o assigned_tax

The resulting assigned_tax/assignments.tsv will contain the taxonomy assignments, assigned_tax/log.blast will contain the blast-formatted raw output of sortmerna, and assigned_tax/cmds.log will contain the exact sortmerna commands that were run.

By default, the sortmerna binary calls would look like:

indexdb_rna --ref tiny-test/refseqs,/Users/caporaso/temp/tmphbWyNB/ref.idx -v
sortmerna --ref tiny-test/refseqs,/Users/caporaso/temp/tmphbWyNB/ref.idx --reads tiny-test/seqs --blast 1 --aligned assigned_tax/log --min_lis 2 --best 3 -v

If you want to experiment with parameters that are not passed in those commands, you can use the corresponding --indexdb_params and --sortmerna_params options. For example:

python smr-tax/assign-taxonomy-sortmerna.py -i tiny-test/seqs -r tiny-test/refseqs -t tiny-test/refseqs_tax -o assigned_tax_ --sortmerna_params "-F" --indexdb_params "-m 4000 -L 19"
#!/usr/bin/env python
# File created on 11 Apr 2014
from __future__ import division
__author__ = "Greg Caporaso"
__copyright__ = "Copyright 2014, The QIIME Project"
__credits__ = ["Greg Caporaso"]
__license__ = "GPL"
__version__ = "1.8.0-dev"
__maintainer__ = "Greg Caporaso"
__email__ = "[email protected]"
from tempfile import mkdtemp
from collections import defaultdict, Counter
from os.path import join
from shutil import rmtree
from skbio.parse.sequences import parse_fasta
from qiime.util import (parse_command_line_parameters, make_option,
load_qiime_config, get_qiime_temp_dir,
qiime_system_call, create_dir, remove_files)
from qiime.assign_taxonomy import TaxonAssigner
# load the user's QIIME config so the reference sequence and id-to-taxonomy
# files can default to the site-wide settings
qiime_config = load_qiime_config()
default_reference_seqs_fp = qiime_config['assign_taxonomy_reference_seqs_fp']
default_id_to_taxonomy_fp = qiime_config['assign_taxonomy_id_to_taxonomy_fp']
# script_info drives QIIME's parse_command_line_parameters: it declares the
# CLI interface (required/optional options, usage examples, version).
script_info = {}
script_info['brief_description'] = ""
script_info['script_description'] = ""
# Members of the tuple in script_usage are (title, description, example call)
script_info['script_usage'] = [("", "", "")]
script_info['output_description'] = ""
script_info['required_options'] = [
    make_option('-i', '--input_query_fp', type='existing_filepath',
                help='the query sequences'),
    make_option('-o', '--output_dir', type='new_dirpath',
                help='directory to store output and log files')
]
script_info['optional_options'] = [
    make_option('-t', '--id_to_taxonomy_fp', type="existing_filepath",
                help='Path to tab-delimited file mapping sequences to their '
                     'taxonomy. Each assigned taxonomy is provided as a '
                     'semicolon-separated string. [default: %s]'
                     % default_id_to_taxonomy_fp,
                default=default_id_to_taxonomy_fp),
    make_option('-r', '--reference_seqs_fp', type="existing_filepath",
                help='Path to reference sequences. These '
                     'are indexed to create the reference database. '
                     '[default: %s; REQUIRED if -b is not provided]'
                     % default_reference_seqs_fp,
                default=default_reference_seqs_fp),
    make_option('-b', '--reference_seqs_idx_fp',
                help='Path to pre-indexed reference sequences. These '
                     'are indexed to create the reference database. '
                     '[default: computed on-the-fly]', default=None),
    # fixed typo in the two help strings below: "as passed" -> "are passed"
    make_option('--sortmerna_params', default=None,
                help='string of parameters to pass to sortmerna'
                     ' [default: no additional parameters are passed]'),
    make_option('--indexdb_params', default=None,
                help='string of parameters to pass to indexdb_rna'
                     ' [default: no additional parameters are passed]'),
    make_option('--min_consensus_fraction', default=0.51, type=float,
                help='Minimum fraction of database hits that must have a '
                     'specific taxonomic assignment to assign that taxonomy '
                     'to a query [default: %default]'),
    make_option('--min_percent_id', default=0.0, type=float,
                help='Minimum percent identity to consider an alignment '
                     'to be a hit [default: all alignments are considered hits]'),
    make_option('--best', default=None, type=int,
                help='sortmerna\'s --best parameter [default: %default]'),
    make_option('--num_alignments', default=None, type=int,
                help='sortmerna\'s --num_alignments parameter [default: %default]'),
    make_option('--min_lis', default=2, type=int,
                help='sortmerna\'s min_lis parameter [default: %default]'),
]
script_info['version'] = __version__
def call_cmd(cmd, HALT_EXEC):
    """ Run cmd (or just print it, if HALT_EXEC) and return it.

        cmd: the shell command to execute
        HALT_EXEC: if True, print the command and exit with status 0
         without running it (useful for debugging the exact calls)

        Exits the program with status 1, after printing the command's
        stdout and stderr, if the command fails.
    """
    if HALT_EXEC:
        print(cmd)
        exit(0)
    else:
        stdout, stderr, exit_status = qiime_system_call(cmd)
        if exit_status != 0:
            # report the command that actually failed -- the original
            # message always blamed indexdb_rna, even when the failing
            # command was sortmerna
            print("Command failed: %s\nSTDOUT\n%s\nSTDERR\n%s\n"
                  % (cmd, stdout, stderr))
            exit(1)
    return cmd
def sortmerna_indexdb(input_fp, output_fp, params="", HALT_EXEC=False):
    """ Build and run an indexdb_rna call to index a reference database.

        input_fp: fasta file of reference sequences to index
        output_fp: base path for the resulting index files
        params: extra command-line parameters appended verbatim
        HALT_EXEC: passed through to call_cmd (print-and-exit if True)
    """
    fragments = ["indexdb_rna --ref %s,%s" % (input_fp, output_fp),
                 "-v",
                 params]
    return call_cmd(" ".join(fragments), HALT_EXEC)
def sortmerna_map(query_fp, refseqs_fasta_fp, refseqs_index_fp, blast_output_fp,
                  min_lis=2, best=None, num_alignments=None, params="",
                  HALT_EXEC=False):
    """ Build and run a sortmerna call mapping queries to the reference db.

        Exactly one of best / num_alignments is expected; when best is
        given, min_lis is passed along with it (sortmerna requires it).
        Output is blast-formatted (--blast 1) and written under
        blast_output_fp.
    """
    extra = params
    if best is not None:
        extra = " ".join([extra, "--best", str(best),
                          "--min_lis", str(min_lis)])
    elif num_alignments is not None:
        extra = " ".join([extra, "--num_alignments", str(num_alignments)])
    fragments = ["sortmerna",
                 "--ref %s,%s" % (refseqs_fasta_fp, refseqs_index_fp),
                 "--reads %s" % query_fp,
                 "--blast 1",
                 "--aligned %s" % blast_output_fp,
                 "-v",
                 extra]
    return call_cmd(" ".join(fragments), HALT_EXEC)
def blast_to_tax_assignments(blast_output_f, id_to_taxonomy_map, min_percent_id=0.0):
    """ Map each query id to the taxonomies of its database hits.

        blast_output_f: iterable of blast-formatted (tab-separated) lines;
         fields used are query id (0), subject id (1), percent id (2)
        id_to_taxonomy_map: dict mapping subject id to a
         semicolon-separated taxonomy string
        min_percent_id: minimum percent identity for an alignment to
         count as a hit

        Returns a defaultdict mapping query id to a list of assignments,
        each assignment being a list of taxonomy levels.
    """
    result = defaultdict(list)
    for line in blast_output_f:
        fields = line.split('\t')
        query_id = fields[0]
        subject_id = fields[1]
        percent_id = float(fields[2])
        # use >= so an alignment at exactly min_percent_id counts as a hit,
        # matching the documented contract ("Minimum percent identity to
        # consider an alignment to be a hit"); the original strict >
        # excluded the boundary value
        if percent_id >= min_percent_id:
            subject_tax = id_to_taxonomy_map[subject_id]
            result[query_id].append(
                [level.strip() for level in subject_tax.split(';')])
    return result
def tax_assignments_to_consensus_assignments(query_to_assignments,
                                             min_consensus_fraction=0.51):
    """ Collapse each query's list of assignments to a consensus assignment.

        query_to_assignments: dict mapping query id to a list of taxonomy
         assignments (each a list of levels); mutated in place so each
         value becomes the consensus tuple from get_consensus_assignment
        min_consensus_fraction: minimum agreement required at each level

        Returns the (mutated) input dict.
    """
    # iterate over a snapshot of the items: .iteritems() is Python-2-only,
    # and reassigning values while iterating a live dict view is fragile
    for query_id, assignments in list(query_to_assignments.items()):
        # this call will get cleaned up
        query_to_assignments[query_id] = get_consensus_assignment(
            assignments, min_consensus_fraction=min_consensus_fraction)
    return query_to_assignments
def get_consensus_assignment(assignments, unassignable_label='unassigned',
                             min_consensus_fraction=0.51):
    """ compute the consensus assignment from a list of assignments

        assignments: list of taxonomy assignments, each a list of
         per-level taxon names (e.g., [['k__A', 'p__B'], ...])
        unassignable_label: taxonomy reported when no consensus exists
        min_consensus_fraction: minimum fraction of assignments that must
         agree at a level for that level to appear in the consensus

        Returns a tuple of (consensus taxonomy as a list of levels,
        consensus fraction at the deepest retained level, number of
        input assignments).

        This code was pulled almost exactly from QIIME's
        UclustConsensusTaxonAssigner._get_consensus_assignment method.
    """
    num_input_assignments = len(assignments)
    if num_input_assignments == 0:
        # an empty assignment list previously crashed below (min() of an
        # empty sequence, then division by zero); report it as
        # unassignable instead
        return ([unassignable_label], 1.0, 0)
    consensus_assignment = []
    # if the assignments don't all have the same number
    # of levels, the resulting assignment will have a max number
    # of levels equal to the number of levels in the assignment
    # with the fewest number of levels. this is to avoid
    # a case where, for example, there are n assignments, one of
    # which has 7 levels, and the other n-1 assignments have 6 levels.
    # A 7th level in the result would be misleading because it
    # would appear to the user as though it was the consensus
    # across all n assignments.
    num_levels = min([len(a) for a in assignments])
    # iterate over the assignment levels
    for level in range(num_levels):
        # count the different taxonomic assignments at the current level.
        # the counts are computed based on the current level and all higher
        # levels to reflect that, for example, 'p__A; c__B; o__C' and
        # 'p__X; c__Y; o__C' represent different taxa at the o__ level (since
        # they are different at the p__ and c__ levels).
        current_level_assignments = \
            Counter([tuple(e[:level + 1]) for e in assignments])
        # identify the most common taxonomic assignment, and compute the
        # fraction of assignments that contained it. it's safe to compute the
        # fraction using num_assignments because the deepest level we'll
        # ever look at here is num_levels (see above comment on how that
        # is decided).
        tax, max_count = current_level_assignments.most_common(1)[0]
        max_consensus_fraction = max_count / num_input_assignments
        # check whether the most common taxonomic assignment is observed
        # in at least min_consensus_fraction of the sequences
        if max_consensus_fraction >= min_consensus_fraction:
            # if so, append the current level only (e.g., 'o__C' if tax is
            # 'p__A; c__B; o__C'), and continue on to the next level
            consensus_assignment.append((tax[-1], max_consensus_fraction))
        else:
            # if not, there is no assignment at this level, and we're
            # done iterating over levels
            break
    # construct the results
    # determine the number of levels in the consensus assignment
    consensus_assignment_depth = len(consensus_assignment)
    if consensus_assignment_depth > 0:
        # if it's greater than 0, generate a list of the
        # taxa assignments at each level
        assignment_result = [a[0] for a in consensus_assignment]
        # and assign the consensus_fraction_result as the
        # consensus fraction at the deepest level
        consensus_fraction_result = \
            consensus_assignment[consensus_assignment_depth - 1][1]
    else:
        # if there are zero assignments, indicate that the taxa is
        # unknown
        assignment_result = [unassignable_label]
        # and assign the consensus_fraction_result to 1.0 (this is
        # somewhat arbitrary, but could be interpreted as all of the
        # assignments suggest an unknown taxonomy)
        consensus_fraction_result = 1.0
    return (
        assignment_result, consensus_fraction_result, num_input_assignments
    )
def main():
    """ Assign taxonomy to query sequences via sortmerna + consensus calling.

        Indexes the reference db (unless a pre-built index is given), maps
        queries with sortmerna, collapses the blast-formatted hits to
        consensus taxonomy assignments, and writes assignments.tsv plus a
        log of the exact commands run to the output directory.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    input_query_fp = opts.input_query_fp
    id_to_taxonomy_fp = opts.id_to_taxonomy_fp
    reference_seqs_fp = opts.reference_seqs_fp
    reference_seqs_idx_fp = opts.reference_seqs_idx_fp
    output_dir = opts.output_dir
    indexdb_params = opts.indexdb_params or ""
    sortmerna_params = opts.sortmerna_params or ""
    min_lis = opts.min_lis
    best = opts.best
    num_alignments = opts.num_alignments
    min_consensus_fraction = opts.min_consensus_fraction
    min_percent_id = opts.min_percent_id

    # exactly one of --best / --num_alignments must be provided
    if (best is None) and (num_alignments is None):
        option_parser.error("Either --best or --num_alignments must be passed.")
    elif (best is not None) and (num_alignments is not None):
        option_parser.error("--best and --num_alignments cannot both be passed.")

    create_dir(output_dir)
    dirs_to_remove = []
    qiime_temp_dir = get_qiime_temp_dir()

    output_fp = join(output_dir, 'assignments.tsv')
    command_log_fp = join(output_dir, 'cmds.log')
    output_f = open(output_fp, 'w')
    command_log_f = open(command_log_fp, 'w')

    id_to_taxonomy_f = open(id_to_taxonomy_fp, 'U')
    id_to_taxonomy_map = \
        TaxonAssigner._parse_id_to_taxonomy_file(id_to_taxonomy_f)
    id_to_taxonomy_f.close()

    if reference_seqs_idx_fp is None:
        # no pre-built index was provided: index the reference sequences
        # into a temp dir that is removed at the end of the run
        index_dir = mkdtemp(dir=qiime_temp_dir)
        create_dir(index_dir)  # mkdtemp already created it; harmless no-op
        dirs_to_remove.append(index_dir)
        reference_seqs_idx_fp = join(index_dir, 'ref.idx')
        cmd = sortmerna_indexdb(reference_seqs_fp, reference_seqs_idx_fp,
                                params=indexdb_params)
        command_log_f.write(cmd)
        command_log_f.write('\n')

    # sortmerna appends the '.blast' extension to the --aligned basename
    blast_output_basename = join(output_dir, 'log')
    blast_output_fp = '.'.join([blast_output_basename, 'blast'])
    cmd = sortmerna_map(input_query_fp, reference_seqs_fp,
                        reference_seqs_idx_fp, blast_output_basename,
                        min_lis=min_lis, best=best,
                        num_alignments=num_alignments,
                        params=sortmerna_params)
    command_log_f.write(cmd)
    command_log_f.write('\n')

    # close the blast output handle when done (it was leaked previously)
    blast_output_f = open(blast_output_fp)
    query_to_assignments = blast_to_tax_assignments(
        blast_output_f, id_to_taxonomy_map, min_percent_id=min_percent_id)
    blast_output_f.close()
    results = tax_assignments_to_consensus_assignments(
        query_to_assignments, min_consensus_fraction=min_consensus_fraction)

    output_f.write('#OTU ID\ttaxonomy\tconfidence\tnum hits\n')
    # this is ugly... we need results for all input sequences, but if there
    # are no hits from sortmerna for a sequence we won't have it in results.
    # we'll want to find a nicer solution than having to iterate over the input
    # file. maybe sortmerna can create a failures file...
    query_f = open(input_query_fp)
    for query_id, _ in parse_fasta(query_f):
        query_id = query_id.split()[0]
        assignment = results[query_id]
        if len(assignment) == 0:
            assignment = [['Unassigned'], 0.0, 0]
        output_f.write('\t'.join([query_id, '; '.join(assignment[0]),
                                  str(assignment[1]), str(assignment[2])]))
        output_f.write('\n')
    query_f.close()

    # clean up time...
    output_f.close()
    command_log_f.close()
    # explicit loop rather than map(): under Python 3 map() is lazy, so the
    # temporary index directories would never actually be removed
    for dir_to_remove in dirs_to_remove:
        rmtree(dir_to_remove)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment