Skip to content

Instantly share code, notes, and snippets.

@tsibley
Last active January 5, 2023 00:11
Show Gist options
  • Save tsibley/8b55c1b721b382cab4c6276442528778 to your computer and use it in GitHub Desktop.
Save tsibley/8b55c1b721b382cab4c6276442528778 to your computer and use it in GitHub Desktop.
escaping.diff
diff --git a/augur/tree.py b/augur/tree.py
index 820e653f..462e6135 100644
--- a/augur/tree.py
+++ b/augur/tree.py
@@ -3,6 +3,7 @@ Build a tree using a variety of methods.
"""
import os
+import re
import shlex
import shutil
import sys
@@ -224,6 +225,28 @@ def build_iqtree(aln_file, out_file, substitution_model="GTR", clean_up=True, nt
escape_dict = {c:f'_{prefix}-{random_string(20)}_' for c in '/|()*'}
reverse_escape_dict = {v:k for k,v in escape_dict.items()}
+ # IQ-TREE uses the POSIX isalnum() function, which is dependent on the
+ # current locale. We achieve parity with that by using Python's re.LOCALE
+ # flag with a byte (instead of str) pattern that uses \w.
+ #
+ # See IQ-TREE function renameBool: https://github.com/iqtree/iqtree2/blob/3bbc304263cb2f85574a9163e8f2e5c5b597a147/utils/tools.cpp#L585
+ #
+ # Note that this considers [/|] unsafe as well, even though IQ-TREE accepts
+ # them as-is.
+ unsafe_chars = re.compile(rb'[^\w.-]', re.LOCALE)
+
+ def escaper(match) -> bytes:
+     """
+     Return the UTF-8 encoded placeholder for the unsafe character in *match*.
+
+     Intended as the replacement callback for ``unsafe_chars.sub()``.  A
+     character that has no placeholder yet triggers a warning and gets a
+     fresh placeholder registered in both ``escape_dict`` and
+     ``reverse_escape_dict``.
+     """
+     # NOTE(review): unsafe_chars matches a single byte at a time, so a lone
+     # byte out of a multi-byte UTF-8 sequence would fail to decode here --
+     # confirm how non-ASCII taxon names are expected to behave.
+     char = match[0].decode("utf-8")
+     string = match.string.decode("utf-8")
+
+     placeholder = escape_dict.get(char)
+     if placeholder is None:
+         # chars not in escape_dict might not be properly handled in treetime
+         print(f"WARNING: Potentially offending character {char!r} detected in taxon name {string!r}. "
+               f"We recommend replacing offending characters with '_' in the alignment file to avoid issues downstream.")
+         placeholder = f'_{prefix}-{random_string(20)}_'
+         escape_dict[char] = placeholder
+         reverse_escape_dict[placeholder] = char
+
+     return placeholder.encode("utf-8")
# IQ-tree messes with taxon names. Hence remove offending characters, reinstantiate later
tmp_aln_file = aln_file.replace(".fasta", "-delim.fasta")
@@ -231,13 +254,21 @@ def build_iqtree(aln_file, out_file, substitution_model="GTR", clean_up=True, nt
num_seqs = 0
with open(tmp_aln_file, 'w', encoding='utf-8') as ofile, open(aln_file, encoding='utf-8') as ifile:
for line in ifile:
- tmp_line = line
if line.startswith(">"):
num_seqs += 1
- for c,v in escape_dict.items():
- tmp_line = tmp_line.replace(c,v)
- ofile.write(tmp_line)
+ # Escape unsafe chars only in the id part of the defline;
+ # IQ-TREE doesn't care about the rest of the defline (the
+ # description part).  The capturing group makes re.split
+ # return [id, whitespace, description], so the original
+ # separator is written back unchanged.
+ defline = re.split(r'(\s+)', line[1:], maxsplit=1)
+ try:
+     seq_id, ws, desc = defline
+ except ValueError:
+     # No whitespace at all (e.g. a final defline without a
+     # trailing newline): re.split returned a one-element
+     # list, so take the string out of it instead of binding
+     # the list itself, which has no .encode().
+     seq_id, ws, desc = defline[0], "", ""
+
+ line = ">" + unsafe_chars.sub(escaper, seq_id.encode("utf-8")).decode("utf-8") + ws + desc
+
+ ofile.write(line)
# Check tree builder arguments for conflicts with hardcoded defaults.
check_conflicting_args(tree_builder_args, ("-ntmax", "-s", "-m"))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment