Last active
January 5, 2023 00:11
-
-
Save tsibley/8b55c1b721b382cab4c6276442528778 to your computer and use it in GitHub Desktop.
escaping.diff
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
diff --git a/augur/tree.py b/augur/tree.py | |
index 820e653f..462e6135 100644 | |
--- a/augur/tree.py | |
+++ b/augur/tree.py | |
@@ -3,6 +3,7 @@ Build a tree using a variety of methods. | |
""" | |
import os | |
+import re | |
import shlex | |
import shutil | |
import sys | |
@@ -224,6 +225,28 @@ def build_iqtree(aln_file, out_file, substitution_model="GTR", clean_up=True, nt | |
escape_dict = {c:f'_{prefix}-{random_string(20)}_' for c in '/|()*'} | |
reverse_escape_dict = {v:k for k,v in escape_dict.items()} | |
+ # IQ-TREE uses the POSIX isalnum() function, which is dependent on the | |
+ # current locale. We achieve parity with that by using Python's re.LOCALE | |
+ # flag with a byte (instead of str) pattern that uses \w. | |
+ # | |
+ # See IQ-TREE function renameBool: https://github.com/iqtree/iqtree2/blob/3bbc304263cb2f85574a9163e8f2e5c5b597a147/utils/tools.cpp#L585 | |
+ # | |
+ # Note that this considers [/|] unsafe as well, even though IQ-TREE accepts | |
+ # them as-is. | |
+ # | |
+ # The first alternative consumes a whole multi-byte UTF-8 sequence (lead | |
+ # byte plus its continuation bytes) so that every match decodes to exactly | |
+ # one character. Matching such bytes one at a time makes the decode in | |
+ # escaper() raise UnicodeDecodeError whenever the locale does not classify | |
+ # them as alphanumeric (e.g. the C and UTF-8 locales). | |
+ unsafe_chars = re.compile(rb'[\xc2-\xf4][\x80-\xbf]+|[^\w.-]', re.LOCALE) | |
+ |
+ def escaper(match) -> bytes: | |
+ # Replacement callback for unsafe_chars.sub(): return the placeholder for | |
+ # the matched character, registering a new placeholder (and its reverse | |
+ # mapping) the first time a character is seen. | |
+ char = match[0].decode("utf-8") | |
+ string = match.string.decode("utf-8") | |
+ |
+ # chars not in escape_dict might not be properly handled in treetime | |
+ if char not in escape_dict: | |
+ print(f"WARNING: Potentially offending character {char!r} detected in taxon name {string!r}. " | |
+ f"We recommend replacing offending characters with '_' in the alignment file to avoid issues downstream.") | |
+ escape_dict[char] = f'_{prefix}-{random_string(20)}_' | |
+ reverse_escape_dict[escape_dict[char]] = char | |
+ |
+ return escape_dict[char].encode("utf-8") | |
# IQ-tree messes with taxon names. Hence remove offending characters, reinstaniate later | |
tmp_aln_file = aln_file.replace(".fasta", "-delim.fasta") | |
@@ -231,13 +254,21 @@ def build_iqtree(aln_file, out_file, substitution_model="GTR", clean_up=True, nt | |
num_seqs = 0 | |
with open(tmp_aln_file, 'w', encoding='utf-8') as ofile, open(aln_file, encoding='utf-8') as ifile: | |
for line in ifile: | |
- tmp_line = line | |
if line.startswith(">"): | |
num_seqs += 1 | |
- for c,v in escape_dict.items(): | |
- tmp_line = tmp_line.replace(c,v) | |
- ofile.write(tmp_line) | |
+ # Escape unsafe chars only in the id part of the defline; | |
+ # IQ-TREE doesn't care about the rest of the defline (the | |
+ # description part). | |
+ # | |
+ # re.split() with a capturing group yields [id, ws, desc], or just | |
+ # [id] when the defline contains no whitespace at all (e.g. a final | |
+ # line without a trailing newline), so pad the result to three fields. | |
+ seq_id, ws, desc = (re.split(r'(\s+)', line[1:], maxsplit=1) + ["", ""])[:3] | |
+ |
+ line = ">" + unsafe_chars.sub(escaper, seq_id.encode("utf-8")).decode("utf-8") + ws + desc | |
+ | |
+ ofile.write(line) | |
# Check tree builder arguments for conflicts with hardcoded defaults. | |
check_conflicting_args(tree_builder_args, ("-ntmax", "-s", "-m")) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment