Created
January 5, 2023 00:07
-
-
Save tsibley/40204dad9405d948ebc47b90921942a3 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Build a tree using a variety of methods. | |
""" | |
import os | |
import re | |
import shlex | |
import shutil | |
import sys | |
import time | |
import uuid | |
import Bio | |
from Bio.Seq import MutableSeq | |
from Bio import Phylo | |
import numpy as np | |
from treetime.vcf_utils import read_vcf | |
from pathlib import Path | |
from .io import read_sequences, run_shell_command, shquote | |
from .utils import nthreads_value, load_mask_sites | |
DEFAULT_ARGS = { | |
"fasttree": "-nt -nosupport", | |
"raxml": "-f d -m GTRCAT -c 25 -p 235813", | |
# For compat with older versions of iqtree, we avoid the newish -fast | |
# option alias and instead spell out its component parts: | |
# | |
# -ninit 2 | |
# -n 2 | |
# -me 0.05 | |
# | |
# This may need to be updated in the future if we want to stay in lock-step | |
# with -fast, although there's probably no particular reason we have to. | |
# Refer to the handling of -fast in utils/tools.cpp: | |
# https://github.com/Cibiv/IQ-TREE/blob/44753aba/utils/tools.cpp#L2926-L2936 | |
# Increasing threads (nt) can cause IQtree to run longer, hence use AUTO by default | |
# Auto makes IQtree chose the optimal number of threads | |
# Redo prevents IQtree errors when a run was aborted and restarted | |
"iqtree": "-ninit 2 -n 2 -me 0.05 -nt AUTO -redo", | |
} | |
class ConflictingArgumentsException(Exception): | |
"""Exception when user-provided tree builder arguments conflict with the | |
requested tree builder's hardcoded defaults (e.g., the path to the | |
alignment, etc.). | |
""" | |
pass | |
def check_conflicting_args(tree_builder_args, defaults): | |
"""Checks the given user-provided tree builder arguments for hardcoded default | |
arguments and raise an exception with a list of any that are found. | |
Arguments | |
--------- | |
tree_builder_args : str | |
User-provided tree builder arguments | |
defaults : list or tuple | |
List of hardcoded default arguments (e.g., ['-nt']) | |
Raises | |
------ | |
ConflictingArgumentsException | |
When any user-provided arguments match those in the defaults. | |
>>> defaults = ("-ntmax", "-m", "-s") | |
>>> check_conflicting_args("-czb -n 2", defaults) | |
>>> check_conflicting_args("-czb -ntmax 2", defaults) | |
Traceback (most recent call last): | |
... | |
augur.tree.ConflictingArgumentsException: The following tree builder arguments conflict with hardcoded defaults. Remove these arguments and try again: -ntmax | |
""" | |
# Parse tree builder argument string into a list of shell arguments. This | |
# allows us to search a list of arguments instead of a string where we might | |
# find partial prefix matches to some of the given defaults. | |
tree_builder_args_list = shlex.split(tree_builder_args) | |
conflicting_args = [ | |
default | |
for default in defaults | |
if default in tree_builder_args_list | |
] | |
if len(conflicting_args) > 0: | |
raise ConflictingArgumentsException( | |
f"The following tree builder arguments conflict with hardcoded defaults. Remove these arguments and try again: {', '.join(conflicting_args)}" | |
) | |
def find_executable(names, default = None): | |
""" | |
Return the path to the first executable found in PATH from the given list | |
of names. | |
Raises a (hopefully helpful) error if no executable is found. Provide a | |
value for the "default" parameter to instead return a value. | |
""" | |
exe = next(filter(shutil.which, names), default) | |
if exe is None: | |
print("Unable to find any of %s in PATH=%s" % (names, os.environ["PATH"])) | |
print("\nHint: You can install the missing program using conda or homebrew or apt-get.\n") | |
raise Exception | |
return exe | |
def build_raxml(aln_file, out_file, clean_up=True, nthreads=1, tree_builder_args=None): | |
''' | |
build tree using RAxML with parameters '-f d -m GTRCAT -c 25 -p 235813 -n tre" | |
''' | |
raxml = find_executable([ | |
# Users who symlink/install as "raxml" can pick a specific version, | |
# otherwise we have our own search order based on expected parallelism. | |
# The variants we look for are not exhaustive, but based on what's | |
# provided by conda and Ubuntu's raxml packages. This may want to be | |
# adjusted in the future depending on use. | |
"raxml", | |
"raxmlHPC-PTHREADS-AVX2", | |
"raxmlHPC-PTHREADS-AVX", | |
"raxmlHPC-PTHREADS-SSE3", | |
"raxmlHPC-PTHREADS", | |
"raxmlHPC-AVX2", | |
"raxmlHPC-AVX", | |
"raxmlHPC-SSE3", | |
"raxmlHPC", | |
]) | |
# RAxML outputs files appended with this random string: | |
# RAxML_bestTree.4ed91a, RAxML_info.4ed91a, RAxML_parsimonyTree.4ed91a, RAxML_result.4ed91a | |
random_string = uuid.uuid4().hex[0:6] | |
# Check tree builder arguments for conflicts with hardcoded defaults. | |
check_conflicting_args(tree_builder_args, ("-T", "-n", "-s")) | |
call = [raxml,"-T",str(nthreads), "-n %s -s"%(random_string), shquote(aln_file), tree_builder_args, "> RAxML_log.%s"%(random_string)] | |
cmd = " ".join(call) | |
print("Building a tree via:\n\t" + cmd + | |
"\n\tStamatakis, A: RAxML Version 8: A tool for Phylogenetic Analysis and Post-Analysis of Large Phylogenies." | |
"\n\tIn Bioinformatics, 2014\n") | |
try: | |
run_shell_command(cmd, raise_errors = True) | |
shutil.copy("RAxML_bestTree.%s"%(random_string), out_file) | |
T = Phylo.read(out_file, 'newick') | |
if clean_up: | |
os.remove("RAxML_bestTree.%s"%(random_string)) | |
os.remove("RAxML_info.%s"%(random_string)) | |
os.remove("RAxML_parsimonyTree.%s"%(random_string)) | |
os.remove("RAxML_result.%s"%(random_string)) | |
except: | |
print("ERROR: TREE BUILDING FAILED") | |
if os.path.isfile("RAxML_log.%s"%(random_string)): | |
print("Please see the log file for more details: {}".format("RAxML_log.%s"%(random_string))) | |
T=None | |
return T | |
def build_fasttree(aln_file, out_file, clean_up=True, nthreads=1, tree_builder_args=None): | |
''' | |
build tree using fasttree with parameters "-nt" | |
''' | |
log_file = out_file + ".log" | |
fasttree = find_executable([ | |
# Search order is based on expected parallelism and accuracy | |
# (double-precision versions). | |
"FastTreeDblMP", | |
"FastTreeDbl", | |
"FastTreeMP", | |
"fasttreeMP", # Ubuntu lowercases | |
"FastTree", | |
"fasttree" | |
]) | |
# By default FastTree with OpenMP support uses all available cores. | |
# However, it respects the standard OpenMP environment variable controlling | |
# this as described at <http://www.microbesonline.org/fasttree/#OpenMP>. | |
# | |
# We always set it, regardless of it the found FastTree executable contains | |
# "MP" in the name, because the generic "FastTree" or "fasttree" variants | |
# might be OpenMP-enabled too. | |
extra_env = { | |
"OMP_NUM_THREADS": str(nthreads), | |
} | |
call = [fasttree, tree_builder_args, shquote(aln_file), "1>", shquote(out_file), "2>", shquote(log_file)] | |
cmd = " ".join(call) | |
print("Building a tree via:\n\t" + cmd + | |
"\n\tPrice et al: FastTree 2 - Approximately Maximum-Likelihood Trees for Large Alignments." + | |
"\n\tPLoS ONE 5(3): e9490. https://doi.org/10.1371/journal.pone.0009490\n") | |
try: | |
run_shell_command(cmd, raise_errors = True, extra_env = extra_env) | |
T = Phylo.read(out_file, 'newick') | |
except: | |
print("ERROR: TREE BUILDING FAILED") | |
if os.path.isfile(log_file): | |
print("Please see the log file for more details: {}".format(log_file)) | |
T=None | |
return T | |
def build_iqtree(aln_file, out_file, substitution_model="GTR", clean_up=True, nthreads=1, tree_builder_args=None): | |
''' | |
build tree using IQ-Tree with parameters "-fast" | |
arguments: | |
aln_file file name of input aligment | |
out_file file name to write tree to | |
''' | |
iqtree = find_executable([ | |
"iqtree2", | |
"iqtree" | |
]) | |
# create a dictionary for characters that IQ-tree changes. | |
# we remove those prior to tree-building and reinstantiate later | |
def random_string(n): | |
from string import ascii_uppercase as letters | |
return "".join([letters[i] for i in np.random.randint(len(letters), size=n)]) | |
prefix = "DELIM" | |
escape_dict = {c:f'_{prefix}-{random_string(20)}_' for c in '/|()*'} | |
reverse_escape_dict = {v:k for k,v in escape_dict.items()} | |
# IQ-TREE uses the POSIX isalnum() function, which is dependent on the | |
# current locale. We achieve parity with that by using Python's re.LOCALE | |
# flag with a byte (instead of str) pattern that uses \w. | |
# | |
# See IQ-TREE function renameBool: https://github.com/iqtree/iqtree2/blob/3bbc304263cb2f85574a9163e8f2e5c5b597a147/utils/tools.cpp#L585 | |
# | |
# Note that this considers [/|] unsafe as well, even though IQ-TREE accepts | |
# them as-is. | |
unsafe_chars = re.compile(rb'[^\w.-]', re.LOCALE) | |
def escaper(match) -> bytes: | |
char = match[0].decode("utf-8") | |
string = match.string.decode("utf-8") | |
# chars not in escape_dict might not be properly handled in treetime | |
if char not in escape_dict: | |
print(f"WARNING: Potentially offending character {char!r} detected in taxon name {string!r}. " | |
f"We recommend replacing offending characters with '_' in the alignment file to avoid issues downstream.") | |
escape_dict[char] = f'_{prefix}-{random_string(20)}_' | |
reverse_escape_dict[escape_dict[char]] = char | |
return escape_dict[char].encode("utf-8") | |
# IQ-tree messes with taxon names. Hence remove offending characters, reinstaniate later | |
tmp_aln_file = aln_file.replace(".fasta", "-delim.fasta") | |
log_file = tmp_aln_file.replace(".fasta", ".iqtree.log") | |
num_seqs = 0 | |
with open(tmp_aln_file, 'w', encoding='utf-8') as ofile, open(aln_file, encoding='utf-8') as ifile: | |
for line in ifile: | |
if line.startswith(">"): | |
num_seqs += 1 | |
defline = re.split(r'(\s+)', line[1:], maxsplit=1) | |
try: | |
id, ws, desc = defline | |
except ValueError: | |
id, ws, desc = defline, "", "" | |
line = ">" + unsafe_chars.sub(escaper, id.encode("utf-8")).decode("utf-8") + ws + desc | |
ofile.write(line) | |
# Check tree builder arguments for conflicts with hardcoded defaults. | |
check_conflicting_args(tree_builder_args, ("-ntmax", "-s", "-m")) | |
if substitution_model.lower() != "auto": | |
call = [iqtree, "-ntmax", str(nthreads), "-s", shquote(tmp_aln_file), | |
"-m", substitution_model, tree_builder_args, ">", log_file] | |
else: | |
call = [iqtree, "-ntmax", str(nthreads), "-s", shquote(tmp_aln_file), tree_builder_args, ">", shquote(log_file)] | |
cmd = " ".join(call) | |
print("Building a tree via:\n\t" + cmd + | |
"\n\tNguyen et al: IQ-TREE: A fast and effective stochastic algorithm for estimating maximum likelihood phylogenies." | |
"\n\tMol. Biol. Evol., 32:268-274. https://doi.org/10.1093/molbev/msu300\n") | |
if substitution_model.lower() == "auto": | |
print(f"Conducting a model test... see '{shquote(log_file)}' for the result. You can specify this with --substitution-model in future runs.") | |
try: | |
run_shell_command(cmd, raise_errors = True) | |
T = Phylo.read(tmp_aln_file+".treefile", 'newick') | |
shutil.copyfile(tmp_aln_file+".treefile", out_file) | |
for n in T.find_clades(terminal=True): | |
tmp_name = n.name | |
for v,c in reverse_escape_dict.items(): | |
tmp_name = tmp_name.replace(v,c) | |
n.name = tmp_name | |
#this allows the user to check intermediate output, as tree.nwk will be | |
if clean_up: | |
if os.path.isfile(tmp_aln_file): | |
os.remove(tmp_aln_file) | |
for ext in [".bionj",".ckp.gz",".iqtree",".mldist",".model.gz",".treefile",".uniqueseq.phy",".model"]: | |
if os.path.isfile(tmp_aln_file + ext): | |
os.remove(tmp_aln_file + ext) | |
except: | |
print("ERROR: TREE BUILDING FAILED") | |
if os.path.isfile(log_file): | |
print("Please see the log file for more details: {}".format(log_file)) | |
T=None | |
return T | |
def write_out_informative_fasta(compress_seq, alignment, stripFile=None): | |
from Bio import SeqIO | |
from Bio.SeqRecord import SeqRecord | |
from Bio.Seq import Seq | |
sequences = compress_seq['sequences'] | |
ref = compress_seq['reference'] | |
positions = compress_seq['positions'] | |
#If want to exclude sites from initial treebuild, read in here | |
strip_pos = load_mask_sites(stripFile) if stripFile else [] | |
#Get sequence names | |
seqNames = list(sequences.keys()) | |
#Check non-ref sites to see if informative | |
printPositionMap = False #If true, prints file mapping Fasta position to real position | |
sites = [] | |
pos = [] | |
for key in positions: | |
if key not in strip_pos: | |
pattern = [] | |
for k in sequences.keys(): | |
#looping try/except is faster than list comprehension | |
try: | |
pattern.append(sequences[k][key]) | |
except KeyError: | |
pattern.append(ref[key]) | |
origPattern = list(pattern) | |
if '-' in pattern or 'N' in pattern: | |
#remove gaps/Ns to see if otherwise informative | |
pattern = [value for value in origPattern if value != '-' and value != 'N'] | |
un = np.unique(pattern, return_counts=True) | |
#If not all - or N, not all same base, and >1 differing base, append | |
if len(un[0])!=0 and len(un[0])!=1 and not (len(un[0])==2 and min(un[1])==1): | |
sites.append(origPattern) | |
pos.append("\t".join([str(len(pos)+1),str(key)])) | |
#Rotate and convert to SeqRecord | |
sites = np.asarray(sites) | |
if len(sites.shape)!=2: | |
print("ERROR: NO VALID SITES REMAIN AFTER IGNORING UNCALLED SITES") | |
raise Exception | |
align = np.rot90(sites) | |
seqNamesCorr = list(reversed(seqNames)) | |
toFasta = [ SeqRecord(id=seqNamesCorr[i], seq=Seq("".join(align[i])), description='') for i in range(len(sequences.keys()))] | |
fasta_file = os.path.join(os.path.dirname(alignment), 'informative_sites.fasta') | |
#now output this as fasta to read into raxml or iqtree | |
SeqIO.write(toFasta, fasta_file, 'fasta') | |
#If want a position map, print: | |
if printPositionMap: | |
with open(fasta_file+".positions.txt", 'w', encoding='utf-8') as the_file: | |
the_file.write("\n".join(pos)) | |
return fasta_file | |
def mask_sites_in_multiple_sequence_alignment(alignment_file, excluded_sites_file): | |
"""Creates a new multiple sequence alignment FASTA file from which the given | |
excluded sites have been removed and returns the filename of the new | |
alignment. | |
Parameters | |
---------- | |
alignment_file : str | |
path to the original multiple sequence alignment file | |
excluded_sites_file : str | |
path to a text file containing each nucleotide position to exclude with one position per line | |
Returns | |
------- | |
str | |
path to the new FASTA file from which sites have been excluded | |
""" | |
# Read 1-based excluded sites and store as 0-based sites. | |
excluded_sites = load_mask_sites(excluded_sites_file) | |
# Return the original alignment file, if no excluded sites were found. | |
if len(excluded_sites) == 0: | |
return alignment_file | |
# Load alignment as FASTA generator to prevent loading the whole alignment | |
# into memory. | |
alignment = read_sequences(alignment_file) | |
# Write the masked alignment to disk one record at a time. | |
alignment_file_path = Path(alignment_file) | |
masked_alignment_file = str(alignment_file_path.parent / ("masked_%s" % alignment_file_path.name)) | |
with open(masked_alignment_file, "w", encoding='utf-8') as oh: | |
for record in alignment: | |
# Convert to a mutable sequence to enable masking with Ns. | |
sequence = MutableSeq(str(record.seq)) | |
# Replace all excluded sites with Ns. | |
for site in excluded_sites: | |
sequence[site] = "N" | |
record.seq = sequence | |
Bio.SeqIO.write(record, oh, "fasta") | |
# Return the new alignment FASTA filename. | |
return masked_alignment_file | |
def register_parser(parent_subparsers): | |
parser = parent_subparsers.add_parser("tree", help=__doc__) | |
parser.add_argument('--alignment', '-a', required=True, help="alignment in fasta or VCF format") | |
parser.add_argument('--method', default='iqtree', choices=["fasttree", "raxml", "iqtree"], help="tree builder to use") | |
parser.add_argument('--output', '-o', type=str, help='file name to write tree to') | |
parser.add_argument('--substitution-model', default="GTR", | |
help='substitution model to use. Specify \'auto\' to run ModelTest. Currently, only available for IQTREE.') | |
parser.add_argument('--nthreads', type=nthreads_value, default=1, | |
help="maximum number of threads to use; specifying the value 'auto' will cause the number of available CPU cores on your system, if determinable, to be used") | |
parser.add_argument('--vcf-reference', type=str, help='fasta file of the sequence the VCF was mapped to') | |
parser.add_argument('--exclude-sites', type=str, help='file name of one-based sites to exclude for raw tree building (BED format in .bed files, second column in tab-delimited files, or one position per line)') | |
parser.add_argument('--tree-builder-args', type=str, help=f"""arguments to pass to the tree builder either augmenting or overriding the default arguments (except for input alignment path, number of threads, and substitution model). | |
Use the assignment operator (e.g., --tree-builder-args="-czb" for IQ-TREE) to avoid unexpected errors. | |
FastTree defaults: "{DEFAULT_ARGS['fasttree']}". | |
RAxML defaults: "{DEFAULT_ARGS['raxml']}". | |
IQ-TREE defaults: "{DEFAULT_ARGS['iqtree']}". | |
""") | |
parser.add_argument('--override-default-args', action="store_true", help="override default tree builder arguments with the values provided by the user in `--tree-builder-args` instead of augmenting the existing defaults.") | |
parser.epilog = """For example, to build a tree with IQ-TREE, use the following format: | |
augur tree --method iqtree --alignment <alignment> --substitution-model <model> --output <tree> --tree-builder-args="<extra arguments>" | |
""" | |
return parser | |
def run(args): | |
# check alignment type, set flags, read in if VCF | |
is_vcf = False | |
ref = None | |
if any([args.alignment.lower().endswith(x) for x in ['.vcf', '.vcf.gz']]): | |
# Prepare a multiple sequence alignment from the given variants VCF and | |
# reference FASTA. | |
if not args.vcf_reference: | |
print("ERROR: a reference Fasta is required with VCF-format alignments") | |
return 1 | |
compress_seq = read_vcf(args.alignment, args.vcf_reference) | |
sequences = compress_seq['sequences'] | |
ref = compress_seq['reference'] | |
is_vcf = True | |
aln = sequences | |
elif args.exclude_sites: | |
# Mask excluded sites from the given multiple sequence alignment. | |
aln = mask_sites_in_multiple_sequence_alignment(args.alignment, args.exclude_sites) | |
else: | |
# Use the multiple sequence alignment as is. | |
aln = args.alignment | |
start = time.time() | |
if args.output: | |
tree_fname = args.output | |
else: | |
tree_fname = '.'.join(args.alignment.split('.')[:-1]) + '.nwk' | |
# construct reduced alignment if needed | |
if is_vcf: | |
variable_fasta = write_out_informative_fasta(compress_seq, args.alignment, stripFile=args.exclude_sites) | |
fasta = variable_fasta | |
else: | |
fasta = aln | |
if args.substitution_model and not args.method=='iqtree': | |
print("Cannot specify model unless using IQTree. Model specification ignored.") | |
# Allow users to keep default args, override them, or augment them. | |
if args.tree_builder_args is None: | |
tree_builder_args = DEFAULT_ARGS[args.method] | |
elif args.override_default_args: | |
tree_builder_args = args.tree_builder_args | |
else: | |
tree_builder_args = f"{DEFAULT_ARGS[args.method]} {args.tree_builder_args}" | |
try: | |
if args.method=='raxml': | |
T = build_raxml(fasta, tree_fname, nthreads=args.nthreads, tree_builder_args=tree_builder_args) | |
elif args.method=='iqtree': | |
T = build_iqtree(fasta, tree_fname, args.substitution_model, nthreads=args.nthreads, tree_builder_args=tree_builder_args) | |
elif args.method=='fasttree': | |
T = build_fasttree(fasta, tree_fname, nthreads=args.nthreads, tree_builder_args=tree_builder_args) | |
except ConflictingArgumentsException as error: | |
print(f"ERROR:", error, file=sys.stderr) | |
return 1 | |
end = time.time() | |
print("\nBuilding original tree took {} seconds".format(str(end-start))) | |
if T: | |
import json | |
tree_success = Phylo.write(T, tree_fname, 'newick', format_branch_length='%1.8f') | |
else: | |
return 1 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment