Created
December 28, 2019 21:47
-
-
Save mtholder/85ca405244c73ee4b68261e97d74d756 to your computer and use it in GitHub Desktop.
Script to be executed from the subproblems directory of an Open Tree synth tree to create subproblem_size_summary.json (the JSON summary is printed to standard output).
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/bin/env python | |
| import json | |
| import sys | |
| import subprocess | |
| import itertools | |
| DEBUG = False | |
def count_tips_and_inf_splits(fn):
    """Summarise the degree distribution of every tree reported for `fn`.

    Runs the external `otc-degree-distribution` command on `fn` (its stderr
    is discarded) and parses the text it prints on stdout.

    Parsing rules, as implemented below:
      * blank lines are ignored;
      * a line starting with 'Out-degree' opens a new table and closes the
        previous one (presumably one table per tree in the file -- confirm
        against the otc tool's documentation);
      * every other line must hold exactly two whitespace-separated
        integers: an out-degree and a node count.

    Returns a list with one `[num_tips, num_splits]` pair per table, where
    `num_tips` is the count reported for out-degree 0 and `num_splits` is
    the summed count of nodes whose out-degree is > 1 and strictly less than
    `num_tips`.

    Raises whatever `subprocess.check_output` raises when the command fails,
    `ValueError` for a malformed data line, and `AssertionError` when a row
    with out-degree > 1 is seen before a positive tip count was recorded.
    """
    raw = subprocess.check_output(['otc-degree-distribution', fn],
                                  stderr=subprocess.DEVNULL)
    # A trailing sentinel header makes the loop flush the final table.
    report = '{}Out-degree\n'.format(raw.decode('utf-8'))

    tallies = []
    current = None
    for row in report.split('\n'):
        stripped = row.strip()
        if not stripped:
            continue
        if row.startswith('Out-degree'):
            # Header line: store the finished tally (if any) and start anew.
            if current:
                tallies.append(current)
            current = [0, 0]
            continue
        degree, n_nodes = [int(token) for token in stripped.split()]
        if degree == 0:
            current[0] = n_nodes
        elif degree > 1:
            # The tip count must already be known to judge this row.
            assert current[0] > 0
            if degree < current[0]:
                current[1] += n_nodes

    # Progress/diagnostic output goes to stderr so stdout stays clean.
    message = report if DEBUG else ' analyzed {} ...\n'.format(fn)
    sys.stderr.write(message)
    return tallies
# ---------------------------------------------------------------------------
# Load the two JSON inputs expected in the current working directory.
# ---------------------------------------------------------------------------
inpjson = 'index.json'
with open(inpjson, 'r', encoding='utf-8') as index_stream:
    index_blob = json.load(index_stream)
with open('num_tips_for_ott_internals_in_labelled_tree.json', 'r', encoding='utf-8') as tips_stream:
    nsynth_tip_blob = json.load(tips_stream)

# ---------------------------------------------------------------------------
# Build one record per subproblem.
#   tree_list     : input tree ids in order of first appearance
#   tree_to_index : tree id -> its position in tree_list
#   sub_obj       : subproblem id -> summary dict
# ---------------------------------------------------------------------------
tree_list = []
tree_to_index = {}
sub_obj = {}
for subprob_tree_file, inp_tree_list, num_leaves in index_blob["subproblems"]["sorted_by_num_phylo_inputs"]:
    # File names are expected to look like "ott<...>.tre".
    assert subprob_tree_file.startswith("ott")
    assert subprob_tree_file.endswith(".tre")
    subprob_id = subprob_tree_file[:-len(".tre")]
    assert subprob_id not in sub_obj

    tree_ind_list = []
    taxonomy_found = False
    for tree_id in inp_tree_list:
        # The taxonomy entry must be the last element of the list.
        assert not taxonomy_found
        if tree_id.lower() == 'taxonomy':
            taxonomy_found = True
            continue
        # Register tree ids that have not been seen before.
        if tree_id not in tree_to_index:
            tree_to_index[tree_id] = len(tree_list)
            tree_list.append(tree_id)
        tree_ind_list.append(tree_to_index[tree_id])
    assert taxonomy_found

    sub_obj[subprob_id] = {
        'num_leaves': num_leaves,
        'num_synth_leaves': nsynth_tip_blob[subprob_id],
        'tree_indices': tree_ind_list,
        'tree_tips_and_inf_splits': count_tips_and_inf_splits(subprob_tree_file),
    }
    if DEBUG:
        # Debug runs stop after the first subproblem.
        break

# The taxonomy gets the index that follows all phylogenetic input trees.
taxonomy_index = len(tree_list)
tree_list.append('TAXONOMY')
# ---------------------------------------------------------------------------
# Compress every subproblem record to
#   [num_leaves, num_synth_leaves, [[num_tips, num_splits, tree_index], ...]]
# and print the whole summary as JSON on stdout.
# ---------------------------------------------------------------------------
compressed_obj = {}
for subprob_key, record in sub_obj.items():
    indices = record['tree_indices']
    tip_split_pairs = record['tree_tips_and_inf_splits']
    # The taxonomy is appended as the final entry of the index list
    # (this extends the list stored in sub_obj in place).
    indices.append(taxonomy_index)
    assert len(indices) == len(tip_split_pairs)
    concat = [
        [num_tips, num_splits, tree_index]
        for tree_index, (num_tips, num_splits)
        in itertools.zip_longest(indices, tip_split_pairs)
    ]
    compressed_obj[subprob_key] = [
        record['num_leaves'],
        record['num_synth_leaves'],
        concat,
    ]

out = {
    "subproblems": compressed_obj,
    "tree_ids": tree_list,
}

# Compact separators by default; indented output only when debugging.
jkwargs = {'separators': (',', ':')}
if DEBUG:
    jkwargs["indent"] = 2
print(json.dumps(out, **jkwargs))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment