Skip to content

Instantly share code, notes, and snippets.

@gregcaporaso
Last active December 15, 2015 04:59
Show Gist options
  • Save gregcaporaso/5206006 to your computer and use it in GitHub Desktop.
Save gregcaporaso/5206006 to your computer and use it in GitHub Desktop.
A quick script to generate a table of samples derived by subsampling each sample of a larger BIOM table c times to n sequences per sample.
#!/usr/bin/env python
# File created on 20 Mar 2013
from __future__ import division
__author__ = "Greg Caporaso"
__copyright__ = "Copyright 2011, The QIIME project"
__credits__ = ["Greg Caporaso"]
__license__ = "GPL"
__version__ = "1.6.0-dev"
__maintainer__ = "Greg Caporaso"
__email__ = "[email protected]"
__status__ = "Development"
from numpy import array
from biom.parse import parse_biom_table, table_factory
from qiime.util import parse_command_line_parameters, make_option
from cogent.maths.stats.rarefaction import subsample_random
script_info = {}
script_info['brief_description'] = "A hack to illustrate how to generate a table of samples derived by subsampling each sample of a larger BIOM table c times to n sequences per sample."
script_info['script_description'] = ""
script_info['script_usage'] = [("","Subsample each sample in o.biom to exactly 4 sequences per sample twice.","%prog -i o.biom -o o.ss.biom -c 2 -n 4")]
script_info['output_description']= ""
script_info['required_options'] = [
make_option('-i','--input_fp',type="existing_filepath",help='the input filepath'),
make_option('-o','--output_fp',type="new_filepath",help='the output filepath'),
make_option('-n','--num_output_seqs_per_sample',type="int",help='number of output sequences per sample'),
make_option('-c','--num_output_samples',type="int",help='number of output samples'),
]
script_info['optional_options'] = []
script_info['version'] = __version__
def subsampled_replicates(input_t,num_output_seqs_per_sample,num_output_samples):
oids = []
osv = []
osmds = []
for sv, si, smd in input_t.iterSamples():
for i in range(num_output_samples):
oids.append('%s.%d' % (si,i))
osv.append(subsample_random(sv,num_output_seqs_per_sample))
osmds.append(smd)
ot = table_factory(array(osv).T,
oids,
input_t.ObservationIds,
osmds,
input_t.ObservationMetadata)
return ot
def main():
option_parser, opts, args =\
parse_command_line_parameters(**script_info)
it = parse_biom_table(open(opts.input_fp,'U'))
ot = subsampled_replicates(it,opts.num_output_seqs_per_sample,opts.num_output_samples)
open(opts.output_fp,'w').write(ot.getBiomFormatJsonString('https://gist.github.com/gregcaporaso/5206006'))
if __name__ == "__main__":
main()
#OTU ID s1 s2 s3
1 1 5 0
2 0 0 3
3 2 1 4
{"id": "None","format": "Biological Observation Matrix 1.0.0","format_url": "http://biom-format.org","type": "OTU table","generated_by": "BIOM-Format 1.1.2","date": "2013-03-20T12:01:13.993277","matrix_type": "sparse","matrix_element_type": "float","shape": [3, 3],"data": [[0,0,1.0],[0,1,5.0],[1,2,3.0],[2,0,2.0],[2,1,1.0],[2,2,4.0]],"rows": [{"id": "1", "metadata": null},{"id": "2", "metadata": null},{"id": "3", "metadata": null}],"columns": [{"id": "s1", "metadata": null},{"id": "s2", "metadata": null},{"id": "s3", "metadata": null}]}
{"id": "None","format": "Biological Observation Matrix 1.0.0","format_url": "http://biom-format.org","type": "OTU table","generated_by": "https://gist.github.com/gregcaporaso/5206006","date": "2013-03-21T17:02:04.751357","matrix_type": "sparse","matrix_element_type": "float","shape": [3, 6],"data": [[0,0,1.0],[0,1,1.0],[0,2,3.0],[0,3,4.0],[1,4,2.0],[1,5,1.0],[2,0,2.0],[2,1,2.0],[2,2,1.0],[2,4,2.0],[2,5,3.0]],"rows": [{"id": "1", "metadata": null},{"id": "2", "metadata": null},{"id": "3", "metadata": null}],"columns": [{"id": "s1.0", "metadata": null},{"id": "s1.1", "metadata": null},{"id": "s2.0", "metadata": null},{"id": "s2.1", "metadata": null},{"id": "s3.0", "metadata": null},{"id": "s3.1", "metadata": null}]}
#!/usr/bin/env python
# File created on 20 Mar 2013
from __future__ import division
__author__ = "Greg Caporaso"
__copyright__ = "Copyright 2011, The QIIME project"
__credits__ = ["Greg Caporaso"]
__license__ = "GPL"
__version__ = "1.6.0-dev"
__maintainer__ = "Greg Caporaso"
__email__ = "[email protected]"
__status__ = "Development"
from shutil import rmtree
from os.path import exists, join
from biom.parse import parse_biom_table
from cogent.util.unit_test import TestCase, main
from cogent.util.misc import remove_files, create_dir
from qiime.util import (get_qiime_temp_dir,
get_tmp_filename)
from qiime.test import initiate_timeout, disable_timeout
from biom_subsampled_replicates import subsampled_replicates
class BiomSubsampledReplicatesTests(TestCase):
def setUp(self):
self.files_to_remove = []
self.dirs_to_remove = []
self.t = parse_biom_table(biom_str.split('\n'))
def test_subsampled_replicates(self):
actual = subsampled_replicates(self.t,3,2)
# correct output sample ids
self.assertEqualItems(actual.SampleIds,
['s1.0','s1.1','s2.0','s2.1','s3.0','s3.1'])
for d, sid, md in actual.iterSamples():
# for each output sample, confirm that each sample has a count
# of three observations
self.assertEqual(d.sum(),3)
orig_sid = sid.split('.')[0]
# for each count, confirm that it's <= the original count
for i,e in enumerate(d):
self.assertTrue(e <= self.t.sampleData(orig_sid)[i])
# when seqs/sample is greater than what is present, the original
# vector is returned
actual = subsampled_replicates(self.t,4,2)
self.assertEqual(actual.sampleData('s1.0'),self.t.sampleData('s1'))
self.assertEqual(actual.sampleData('s1.1'),self.t.sampleData('s1'))
self.assertEqual(actual.sampleData('s2.0').sum(),4)
self.assertEqual(actual.sampleData('s2.1').sum(),4)
self.assertEqual(actual.sampleData('s3.0').sum(),4)
self.assertEqual(actual.sampleData('s3.1').sum(),4)
biom_str = """{"id": "None","format": "Biological Observation Matrix 1.0.0","format_url": "http://biom-format.org","type": "OTU table","generated_by": "BIOM-Format 1.1.2","date": "2013-03-20T12:01:13.993277","matrix_type": "sparse","matrix_element_type": "float","shape": [3, 3],"data": [[0,0,1.0],[0,1,5.0],[1,2,3.0],[2,0,2.0],[2,1,1.0],[2,2,4.0]],"rows": [{"id": "1", "metadata": null},{"id": "2", "metadata": null},{"id": "3", "metadata": null}],"columns": [{"id": "s1", "metadata": null},{"id": "s2", "metadata": null},{"id": "s3", "metadata": null}]}
"""
if __name__ == "__main__":
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment