Last active
August 15, 2016 17:37
-
-
Save wasade/60db4059e3b7e42bb648db1d10ef72d6 to your computer and use it in GitHub Desktop.
Block decomposition method for unifrac + rapid subsetting of biom direct from disk + joblib + balanced parentheses tree
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import bp | |
import biom | |
import skbio | |
import sys | |
import joblib | |
import h5py | |
class biom_counts_wrapper(object):
    """Lazily materialize sample blocks of a BIOM HDF5 table from disk.

    Only the sample and observation ID vectors are held in memory; the
    counts for a given set of samples are loaded on demand via
    ``__getitem__``, so the full table never needs to be resident.
    """
    def __init__(self, table_f):
        self.table_f = table_f
        # Pull just the ID vectors up front; close the file immediately.
        with h5py.File(self.table_f, 'r') as grp:
            raw_samples = grp['sample/ids'][:]
            raw_observations = grp['observation/ids'][:]
        self.ids = np.asarray([b.decode('ascii') for b in raw_samples])
        self.otu_ids = np.asarray([b.decode('ascii') for b in raw_observations])

    def __getitem__(self, indices_to_take):
        """Return a dense (samples x observations) int array for the
        samples at ``indices_to_take``, read fresh from disk."""
        with h5py.File(self.table_f, 'r') as grp:
            subset = biom.Table.from_hdf5(grp,
                                          ids=self.ids[indices_to_take],
                                          subset_with_metadata=False)
        # todense() yields np.matrix; convert to a plain ndarray and put
        # samples on the rows (matrix_data is observations x samples).
        dense = subset.matrix_data.todense().astype(int)
        return np.asarray(dense.T)

    def __len__(self):
        # Number of samples in the table.
        return len(self.ids)
class monkey:
    """Duck-typed stand-in for an skbio tree backed by a balanced
    parentheses (bp) structure, parsed lazily from a newick file.

    Only the two entry points skbio's UniFrac path touches are provided:
    ``to_array`` (for counts/tree vectorization) and ``shear``.
    """
    def __init__(self, t_fp):
        # Path to the newick file; parsing is deferred until shear().
        self.fp = t_fp
        self.bp = None

    def to_array(self, **kwargs):
        """Needed for skbio.diversity._util._vectorize_counts_and_tree.

        Raises AttributeError if shear() has not yet populated self.bp.
        """
        if self.bp is None:
            raise AttributeError("should not happen")
        return bp.to_skbio_treearray(self.bp)

    def shear(self, tips):
        """Punch through to bp's shear/collapse, returning a new wrapper.

        Parses the newick file on first use. Fix: the original leaked the
        file handle via open(...).read(); use a context manager instead.
        """
        if self.bp is None:
            with open(self.fp) as fh:
                self.bp = bp.parse_newick(fh.read())
        new_bp = self.bp.shear(set(tips)).collapse()
        new_obj = self.__class__(self.fp)
        new_obj.bp = new_bp
        return new_obj
# Module-level wiring: argv[1] is the newick tree path (parsed lazily),
# argv[2] is the BIOM HDF5 table (IDs are read from disk immediately).
tree_full = monkey(sys.argv[1])
biom_full = biom_counts_wrapper(sys.argv[2])
def mapper(func, kw_gen):
    """Apply ``func(**kw)`` to each kwargs dict in ``kw_gen`` in parallel.

    The worker count comes from argv[3]; results are returned in order,
    matching the map_f contract expected by block_beta_diversity.
    """
    n_jobs = int(sys.argv[3])
    with joblib.Parallel(n_jobs=n_jobs) as pool:
        return pool(joblib.delayed(func)(**kwargs) for kwargs in kw_gen)
if __name__ == '__main__':
    from skbio.diversity._block import block_beta_diversity
    from skbio.diversity import beta_diversity

    # Fix: the block/normal invocations were written out three times
    # (once per mode plus the validate branch); deduplicate into two
    # helpers so the argument lists cannot drift apart.
    def _run_block():
        # Blocked, parallel UniFrac; k=64 samples per block (see notes).
        return block_beta_diversity('unweighted_unifrac', biom_full,
                                    ids=biom_full.ids,
                                    otu_ids=biom_full.otu_ids,
                                    tree=tree_full,
                                    validate=False, k=64, map_f=mapper)

    def _run_normal():
        # Reference computation: materialize the full counts matrix and
        # shear the tree to the observed OTUs up front.
        counts = biom_full[np.arange(len(biom_full.ids))]
        return beta_diversity('unweighted_unifrac', counts,
                              ids=biom_full.ids,
                              otu_ids=biom_full.otu_ids,
                              tree=tree_full.shear(biom_full.otu_ids),
                              validate=False)

    mode = sys.argv[4]  # 'block', 'normal', or anything else to validate
    if mode == 'block':
        result = _run_block()
    elif mode == 'normal':
        result = _run_normal()
    else:
        # Validate: block decomposition must agree with the normal path.
        bkres = _run_block()
        normal = _run_normal()
        import numpy.testing as npt
        npt.assert_almost_equal(bkres.data, normal.data)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
...likely possible to avoid reparsing the tree on each individual block. but this works right now, and at 4 cores, appears to be 2x faster than skbio by itself when benching on 2048 random AG samples (gg 97%). Compute isn't necessarily impressive, but memory is: a 10x reduction, so anticipate a (relatively) small footprint even with large trees.
note: k=64 is 64 samples per block. this seems to be the sweet spot from unifrac benching previously where, basically, 64 samples seemed to be a good trade-off between the time to construct the counts array and the average per-pairwise calculation. seems to hold here as well, where k=32 and k=128 are both slower. This could easily be dependent on the size of the tree and the richness of the samples, so possibly k=64 is overfit to AG data.