Created
December 28, 2019 19:14
-
-
Save mtholder/3588c927224a1ce8184805cd5b2bb8d7 to your computer and use it in GitHub Desktop.
Suppresses nodes from an OT tree if they do not have an ID listed in a separate file. Takes two arguments: the Newick tree filepath and the path to a file with one ID per line.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/bin/env python3 | |
| import sys | |
| import dendropy | |
| import re | |
# Captures the digits of the last "ott<digits>" at the end of a node label.
id_extractor = re.compile(r'.*ott([0-9]+)$')


def normalize_retained_id(raw_line):
    """Return the bare ID string found on one line of the ID file.

    Surrounding whitespace and trailing commas are dropped, then a leading
    "ott" prefix (any letter case) is removed. Whitespace is stripped
    *before* the prefix check so that an indented "ott123" is still
    recognized. An empty string is returned for a line that holds no ID.
    """
    token = raw_line.strip().rstrip(',').strip()
    if token.lower().startswith('ott'):
        token = token[3:].strip()
    return token


def read_ids_to_retain(id_filepath):
    """Read the ID file and return a frozenset of the ID strings to keep.

    Lines that normalize to an empty string (e.g. blank lines) are skipped.
    The file is closed as soon as it has been read.
    """
    with open(id_filepath, 'r') as id_file:
        cleaned = (normalize_retained_id(line) for line in id_file)
        return frozenset(token for token in cleaned if token)


def find_nodes_to_suppress(tree, to_retain_ids):
    """Return, in postorder, the nodes of `tree` that should be suppressed.

    A node with a taxon is flagged when the ID parsed from its label is not
    in `to_retain_ids`; a node without a taxon is always flagged. The script
    exits if a label does not match `id_extractor`, or if a node without a
    taxon has no children.
    """
    to_suppress = []
    for node in tree.postorder_node_iter():
        if node.taxon:
            m = id_extractor.match(node.taxon.label)
            if not m:
                sys.exit('name not matching pattern: {}'.format(node.taxon.label))
            if m.group(1) not in to_retain_ids:
                sys.stderr.write('flagging for suppression due to non-retained ID: {}\n'.format(node.taxon.label))
                to_suppress.append(node)
        else:
            # Explicit check instead of `assert`, which is removed under -O.
            if node.num_child_nodes() == 0:
                sys.exit('unlabeled leaf node found in tree')
            to_suppress.append(node)
    return to_suppress


def main(argv):
    """Suppress non-retained nodes and write the tree to stdout as newick.

    `argv[1]` is the newick tree filepath and `argv[2]` is the path to the
    file listing the IDs to retain, one per line.
    """
    if len(argv) < 3:
        sys.exit('usage: {} newick-tree-filepath file-with-ids-one-per-line'.format(argv[0]))
    tree_filepath = argv[1]
    id_filepath = argv[2]
    tree = dendropy.Tree.get(path=tree_filepath, schema='newick', suppress_internal_node_taxa=False)
    to_retain_ids = read_ids_to_retain(id_filepath)
    for node in find_nodes_to_suppress(tree, to_retain_ids):
        # NOTE(review): Edge.collapse() presumably rejects edges leading to
        # leaf nodes, so a flagged leaf would abort here -- confirm against
        # the DendroPy docs and decide whether leaves should be pruned.
        if node.edge:
            node.edge.collapse()
    tree.write_to_stream(sys.stdout, schema="newick")


if __name__ == '__main__':
    main(sys.argv)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment