Last active
December 11, 2019 10:32
-
-
Save adamrp/7591573 to your computer and use it in GitHub Desktop.
Given a biom table, iterate over the SampleData. Set observations that represent less than a certain fraction of the sample's total abundance to zero.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
from argparse import ArgumentParser | |
from numpy import array | |
from biom import load_table, Table | |
__author__ = "Adam Robbins-Pianka" | |
__copyright__ = "Copyright 2013" | |
__credits__ = ["Adam Robbins-Pianka"] | |
__license__ = "BSD" | |
__version__ = "unversioned" | |
__maintainer__ = "Adam Robbins-Pianka" | |
__email__ = "[email protected]" | |
parser = ArgumentParser() | |
parser.add_argument('-i', '--input_biom', help='The path to the input biom ' | |
'file to be filtered.', required=True) | |
parser.add_argument('-n', '--abundance_threshold', help='The minimum ' | |
'abundance of an OTU in a sample in order to be retained.', type=float, | |
required=True) | |
parser.add_argument('-o', '--output_biom', help='The path to the output file.', | |
required=True) | |
parser.add_argument('-f', '--abundance_as_fraction', help='Treat the value ' | |
'passed for -n (--abundance_threshold) as a fraction rather than an ' | |
'absolute count.', action='store_true', required=False, default=False) | |
def main(): | |
args = parser.parse_args() | |
input_fp = args.input_biom | |
output_fp = args.output_biom | |
threshold = args.abundance_threshold | |
as_fraction = args.abundance_as_fraction | |
if as_fraction: | |
if not 0 <= threshold <= 1: | |
raise ValueError("The value passed for -n " | |
"(--abundance_as_fraction) must be in the " | |
"interval [0, 1]") | |
if not as_fraction: | |
if not str(threshold).replace('.','',1).isdigit(): | |
raise ValueError("If you want to express the minimum threshold as " | |
"a fraction of the total sequences in a sample, " | |
"use -n in combination with -f. Otherwise, if " | |
"you want to express the minimum threshold as an " | |
"absolute sequence count minimum, the value " | |
"passed for -n must be an integer.") | |
threshold = int(threshold) | |
input_table = load_table(input_fp) | |
new_data = [] | |
append_new_data = new_data.append | |
for abundances in input_table.iter_data(): | |
if as_fraction: | |
abundance_fractions = abundances.astype(float)/sum(abundances) | |
indices = [i for (i, j) in enumerate(abundance_fractions>threshold) | |
if not j] | |
else: | |
indices = [i for (i, j) in enumerate(abundances>threshold) | |
if not j] | |
item_set = abundances.itemset | |
for index in indices: | |
item_set(index, 0) | |
append_new_data(abundances) | |
new_data = array(new_data).transpose() | |
new_table = Table(new_data, | |
input_table.ids('observation'), | |
input_table.ids(), | |
input_table.metadata(axis='observation'), | |
input_table.metadata()) | |
with open(output_fp, 'w') as output_fd: | |
new_table.to_json('one-time generation', output_fd) | |
if __name__ == '__main__': | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment