Skip to content

Instantly share code, notes, and snippets.

@mtholder
Created December 28, 2019 21:47
Show Gist options
  • Select an option

  • Save mtholder/85ca405244c73ee4b68261e97d74d756 to your computer and use it in GitHub Desktop.

Select an option

Save mtholder/85ca405244c73ee4b68261e97d74d756 to your computer and use it in GitHub Desktop.
Executed from the subproblems directory of an Open Tree synth tree to create subproblem_size_summary.json (the script prints the JSON to stdout).
#!/bin/env python
import json
import sys
import subprocess
import itertools
DEBUG = False
def _parse_degree_distribution(text):
    """Parse degree-distribution text into ``[num_tips, num_inf_splits]`` pairs.

    The text is treated as a sequence of sections.  A line starting with
    ``Out-degree`` opens a new section; every other non-blank line must hold
    two whitespace-separated integers, ``<out_degree> <count>``.
    NOTE(review): this layout is inferred from how the lines are parsed here;
    confirm against the actual ``otc-degree-distribution`` output.

    For each section:
      * the count on the out-degree-0 row is stored as the number of tips;
      * counts of rows whose out-degree is > 1 and strictly smaller than the
        number of tips are summed as the number of informative splits.

    Returns a list with one ``[num_tips, num_inf_splits]`` list per section,
    in the order the sections appear.
    """
    all_pairs = []
    cur_pair = None
    for line in text.split('\n'):
        stripped = line.strip()
        if not stripped:
            continue
        if line.startswith('Out-degree'):
            # A header closes the previous section (if any) and opens a new one.
            if cur_pair is not None:
                all_pairs.append(cur_pair)
            cur_pair = [0, 0]
            continue
        out_degree, count = [int(i) for i in stripped.split()]
        if out_degree == 0:
            cur_pair[0] = count
        elif out_degree > 1:
            # The tip row (out-degree 0) must already have been seen,
            # because the comparison below needs the tip count.
            assert cur_pair[0] > 0
            if out_degree < cur_pair[0]:
                cur_pair[1] += count
    # Flush the last section explicitly rather than gluing a sentinel header
    # onto the text, which would corrupt the final data line whenever the
    # text does not end with a newline.
    if cur_pair is not None:
        all_pairs.append(cur_pair)
    return all_pairs


def count_tips_and_inf_splits(fn):
    """Run ``otc-degree-distribution`` on *fn* and summarize each section.

    The tool's stdout is decoded as UTF-8 (its stderr is discarded) and
    parsed by ``_parse_degree_distribution``.  A progress line -- or, when
    the module-level ``DEBUG`` flag is set, the raw tool output -- is
    written to stderr.

    Returns a list of ``[num_tips, num_inf_splits]`` lists, one per section
    of the tool's output.

    Raises ``subprocess.CalledProcessError`` if the tool exits non-zero.
    """
    raw = subprocess.check_output(['otc-degree-distribution', fn],
                                  stderr=subprocess.DEVNULL).decode('utf-8')
    all_pairs = _parse_degree_distribution(raw)
    if DEBUG:
        sys.stderr.write(raw)
    else:
        sys.stderr.write(' analyzed {} ...\n'.format(fn))
    return all_pairs
# Load the two JSON inputs, both read from the current working directory.
inpjson = 'index.json'
with open(inpjson, 'r', encoding='utf-8') as inp:
    index_blob = json.load(inp)
# Looked up below by subproblem id to fill in 'num_synth_leaves'.
with open('num_tips_for_ott_internals_in_labelled_tree.json', 'r', encoding='utf-8') as inp:
    nsynth_tip_blob = json.load(inp)
# Build one summary record per subproblem listed in the index.
tree_list = []       # distinct input tree ids, in first-seen order
tree_to_index = {}   # tree id -> its position in tree_list
sub_obj = {}         # subproblem id -> summary record
for blob in index_blob["subproblems"]["sorted_by_num_phylo_inputs"]:
    subprob_tree_file, inp_tree_list, num_leaves = blob
    assert subprob_tree_file.startswith("ott"), subprob_tree_file
    assert subprob_tree_file.endswith(".tre"), subprob_tree_file
    subprob_id = subprob_tree_file[:-4]  # drop the ".tre" suffix
    assert subprob_id not in sub_obj, 'duplicate subproblem {}'.format(subprob_id)
    tree_ind_list = []
    taxonomy_found = False
    for tree_id in inp_tree_list:
        # Checked at the top of every iteration: once the taxonomy entry has
        # been seen nothing may follow it, i.e. it must be the last input.
        assert not taxonomy_found, 'taxonomy is not the last input of {}'.format(subprob_id)
        if tree_id.lower() == 'taxonomy':
            taxonomy_found = True
            continue
        # Intern the tree id: reuse its index, or assign the next free one.
        inp_tree_ind = tree_to_index.get(tree_id)
        if inp_tree_ind is None:
            inp_tree_ind = len(tree_list)
            tree_to_index[tree_id] = inp_tree_ind
            tree_list.append(tree_id)
        tree_ind_list.append(inp_tree_ind)
    assert taxonomy_found, 'no taxonomy input for {}'.format(subprob_id)
    curr = {
        'num_leaves': num_leaves,
        'num_synth_leaves': nsynth_tip_blob[subprob_id],
        'tree_indices': tree_ind_list,
        'tree_tips_and_inf_splits': count_tips_and_inf_splits(subprob_tree_file),
    }
    sub_obj[subprob_id] = curr
    # In debug mode only the first subproblem is processed.
    if DEBUG:
        break
# The taxonomy gets the index just past all phylogenetic input trees.
taxonomy_index = len(tree_list)
tree_list.append('TAXONOMY')
# Compact form: subproblem id -> [num_leaves, num_synth_leaves, rows], where
# each row is [num_tips, num_inf_splits, tree_index].
compressed_obj = {}
for key, value in sub_obj.items():
    nl = value['num_leaves']
    nsl = value['num_synth_leaves']
    # Build a new list so the record stored in sub_obj is left untouched.
    til = value['tree_indices'] + [taxonomy_index]
    ttais = value['tree_tips_and_inf_splits']
    # One (tips, splits) pair is expected per input tree plus the taxonomy.
    assert len(til) == len(ttais), 'tree/pair count mismatch for {}'.format(key)
    # zip_longest (not zip) so a length mismatch is never silently truncated
    # should the assert above be compiled out with -O.
    concat = [
        [num_tips, num_splits, tree_index]
        for tree_index, (num_tips, num_splits) in itertools.zip_longest(til, ttais)
    ]
    compressed_obj[key] = [nl, nsl, concat]
# Write the summary to stdout as JSON: compact separators always, plus
# 2-space indentation when DEBUG is set.
out = {
    "subproblems": compressed_obj,
    "tree_ids": tree_list,
}
jkwargs = {'separators': (',', ':')}
if DEBUG:
    jkwargs["indent"] = 2
print(json.dumps(out, **jkwargs))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment