Last active
December 15, 2015 04:59
-
-
Save gregcaporaso/5206006 to your computer and use it in GitHub Desktop.
A quick script to generate a table of samples derived by subsampling each sample of a larger BIOM table c times to n sequences per sample.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# File created on 20 Mar 2013 | |
from __future__ import division | |
__author__ = "Greg Caporaso" | |
__copyright__ = "Copyright 2011, The QIIME project" | |
__credits__ = ["Greg Caporaso"] | |
__license__ = "GPL" | |
__version__ = "1.6.0-dev" | |
__maintainer__ = "Greg Caporaso" | |
__email__ = "[email protected]" | |
__status__ = "Development" | |
from numpy import array | |
from biom.parse import parse_biom_table, table_factory | |
from qiime.util import parse_command_line_parameters, make_option | |
from cogent.maths.stats.rarefaction import subsample_random | |
script_info = {} | |
script_info['brief_description'] = "A hack to illustrate how to generate a table of samples derived by subsampling each sample of a larger BIOM table c times to n sequences per sample." | |
script_info['script_description'] = "" | |
script_info['script_usage'] = [("","Subsample each sample in o.biom to exactly 4 sequences per sample twice.","%prog -i o.biom -o o.ss.biom -c 2 -n 4")] | |
script_info['output_description']= "" | |
script_info['required_options'] = [ | |
make_option('-i','--input_fp',type="existing_filepath",help='the input filepath'), | |
make_option('-o','--output_fp',type="new_filepath",help='the output filepath'), | |
make_option('-n','--num_output_seqs_per_sample',type="int",help='number of output sequences per sample'), | |
make_option('-c','--num_output_samples',type="int",help='number of output samples'), | |
] | |
script_info['optional_options'] = [] | |
script_info['version'] = __version__ | |
def subsampled_replicates(input_t,num_output_seqs_per_sample,num_output_samples): | |
oids = [] | |
osv = [] | |
osmds = [] | |
for sv, si, smd in input_t.iterSamples(): | |
for i in range(num_output_samples): | |
oids.append('%s.%d' % (si,i)) | |
osv.append(subsample_random(sv,num_output_seqs_per_sample)) | |
osmds.append(smd) | |
ot = table_factory(array(osv).T, | |
oids, | |
input_t.ObservationIds, | |
osmds, | |
input_t.ObservationMetadata) | |
return ot | |
def main(): | |
option_parser, opts, args =\ | |
parse_command_line_parameters(**script_info) | |
it = parse_biom_table(open(opts.input_fp,'U')) | |
ot = subsampled_replicates(it,opts.num_output_seqs_per_sample,opts.num_output_samples) | |
open(opts.output_fp,'w').write(ot.getBiomFormatJsonString('https://gist.github.com/gregcaporaso/5206006')) | |
if __name__ == "__main__": | |
main() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#OTU ID s1 s2 s3 | |
1 1 5 0 | |
2 0 0 3 | |
3 2 1 4 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{"id": "None","format": "Biological Observation Matrix 1.0.0","format_url": "http://biom-format.org","type": "OTU table","generated_by": "BIOM-Format 1.1.2","date": "2013-03-20T12:01:13.993277","matrix_type": "sparse","matrix_element_type": "float","shape": [3, 3],"data": [[0,0,1.0],[0,1,5.0],[1,2,3.0],[2,0,2.0],[2,1,1.0],[2,2,4.0]],"rows": [{"id": "1", "metadata": null},{"id": "2", "metadata": null},{"id": "3", "metadata": null}],"columns": [{"id": "s1", "metadata": null},{"id": "s2", "metadata": null},{"id": "s3", "metadata": null}]} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{"id": "None","format": "Biological Observation Matrix 1.0.0","format_url": "http://biom-format.org","type": "OTU table","generated_by": "https://gist.github.com/gregcaporaso/5206006","date": "2013-03-21T17:02:04.751357","matrix_type": "sparse","matrix_element_type": "float","shape": [3, 6],"data": [[0,0,1.0],[0,1,1.0],[0,2,3.0],[0,3,4.0],[1,4,2.0],[1,5,1.0],[2,0,2.0],[2,1,2.0],[2,2,1.0],[2,4,2.0],[2,5,3.0]],"rows": [{"id": "1", "metadata": null},{"id": "2", "metadata": null},{"id": "3", "metadata": null}],"columns": [{"id": "s1.0", "metadata": null},{"id": "s1.1", "metadata": null},{"id": "s2.0", "metadata": null},{"id": "s2.1", "metadata": null},{"id": "s3.0", "metadata": null},{"id": "s3.1", "metadata": null}]} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# File created on 20 Mar 2013 | |
from __future__ import division | |
__author__ = "Greg Caporaso" | |
__copyright__ = "Copyright 2011, The QIIME project" | |
__credits__ = ["Greg Caporaso"] | |
__license__ = "GPL" | |
__version__ = "1.6.0-dev" | |
__maintainer__ = "Greg Caporaso" | |
__email__ = "[email protected]" | |
__status__ = "Development" | |
from shutil import rmtree | |
from os.path import exists, join | |
from biom.parse import parse_biom_table | |
from cogent.util.unit_test import TestCase, main | |
from cogent.util.misc import remove_files, create_dir | |
from qiime.util import (get_qiime_temp_dir, | |
get_tmp_filename) | |
from qiime.test import initiate_timeout, disable_timeout | |
from biom_subsampled_replicates import subsampled_replicates | |
class BiomSubsampledReplicatesTests(TestCase): | |
def setUp(self): | |
self.files_to_remove = [] | |
self.dirs_to_remove = [] | |
self.t = parse_biom_table(biom_str.split('\n')) | |
def test_subsampled_replicates(self): | |
actual = subsampled_replicates(self.t,3,2) | |
# correct output sample ids | |
self.assertEqualItems(actual.SampleIds, | |
['s1.0','s1.1','s2.0','s2.1','s3.0','s3.1']) | |
for d, sid, md in actual.iterSamples(): | |
# for each output sample, confirm that each sample has a count | |
# of three observations | |
self.assertEqual(d.sum(),3) | |
orig_sid = sid.split('.')[0] | |
# for each count, confirm that it's <= the original count | |
for i,e in enumerate(d): | |
self.assertTrue(e <= self.t.sampleData(orig_sid)[i]) | |
# when seqs/sample is greater than what is present, the original | |
# vector is returned | |
actual = subsampled_replicates(self.t,4,2) | |
self.assertEqual(actual.sampleData('s1.0'),self.t.sampleData('s1')) | |
self.assertEqual(actual.sampleData('s1.1'),self.t.sampleData('s1')) | |
self.assertEqual(actual.sampleData('s2.0').sum(),4) | |
self.assertEqual(actual.sampleData('s2.1').sum(),4) | |
self.assertEqual(actual.sampleData('s3.0').sum(),4) | |
self.assertEqual(actual.sampleData('s3.1').sum(),4) | |
biom_str = """{"id": "None","format": "Biological Observation Matrix 1.0.0","format_url": "http://biom-format.org","type": "OTU table","generated_by": "BIOM-Format 1.1.2","date": "2013-03-20T12:01:13.993277","matrix_type": "sparse","matrix_element_type": "float","shape": [3, 3],"data": [[0,0,1.0],[0,1,5.0],[1,2,3.0],[2,0,2.0],[2,1,1.0],[2,2,4.0]],"rows": [{"id": "1", "metadata": null},{"id": "2", "metadata": null},{"id": "3", "metadata": null}],"columns": [{"id": "s1", "metadata": null},{"id": "s2", "metadata": null},{"id": "s3", "metadata": null}]} | |
""" | |
if __name__ == "__main__": | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment