Created
January 2, 2013 15:33
-
-
Save gregcaporaso/4435371 to your computer and use it in GitHub Desktop.
Given an input sequence file, splits the sequences randomly into n different files. This is useful for generating files that can be used to test computationally expensive analysis processes: the analysis can be run iteratively on each input sequence set, providing preliminary results based on random subsets of the data as the analysis progresses.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# File created on 02 Jan 2013 | |
from __future__ import division | |
__author__ = "Greg Caporaso" | |
__copyright__ = "Copyright 2011, The QIIME project" | |
__credits__ = ["Greg Caporaso"] | |
__license__ = "GPL" | |
__version__ = "1.6.0" | |
__maintainer__ = "Greg Caporaso" | |
__email__ = "[email protected]" | |
__status__ = "Release" | |
from os.path import split, splitext, join | |
from random import choice | |
from cogent.parse.fasta import MinimalFastaParser | |
from qiime.util import (parse_command_line_parameters, | |
make_option, | |
qiime_open, | |
create_dir) | |
# Command-line interface metadata. main() passes this dict as keyword
# arguments to parse_command_line_parameters.
script_info = {}
script_info['brief_description'] = "Randomly split the sequences in a file into n files."
script_info['script_description'] = (
    "Given an input sequence file, splits sequences randomly into n "
    "different files. This is useful for generating files that can be used "
    "to test computationally expensive analysis processes: the analysis can "
    "be run iteratively on each input sequence set, providing preliminary "
    "results based on random subsets of the data as the analysis progresses.")
script_info['script_usage'] = [("","Split seqs.fna into 20 different files and write each to a new file in partitioned_seqs/.","%prog -i seqs.fna -o partitioned_seqs/ -n 20")]
script_info['output_description'] = (
    "n sequence files written to the output directory, named "
    "<input_basename>_<i><input_extension> for i from 1 to n. Each input "
    "sequence is written to exactly one randomly chosen output file, so the "
    "files are only roughly equal in size and some may be empty.")
script_info['required_options'] = [
    make_option('-i','--input_fp',type="existing_filepath",help='the input sequence file'),
    make_option('-o','--output_dir',type="new_dirpath",help='the output sequence directory'),
    make_option('-n','--n',type="int",
                help='the number of roughly-equal-sized output files to generate'),
]
script_info['optional_options'] = []
script_info['version'] = __version__
def partition_seqs(input_seqs, output_fs):
    """Write each sequence record to one randomly selected output file.

    input_seqs: iterable of (sequence id, sequence) 2-tuples.
    output_fs: sequence of writable file-like objects; one is picked
     with random.choice for every record.

    Each record is written in FASTA form: a '>' header line holding the
     id, followed by a line holding the sequence.
    """
    fasta_record = '>%s\n%s\n'
    for record in input_seqs:
        destination = choice(output_fs)
        destination.write(fasta_record % record)
def main():
    """Randomly partition the input sequence file into opts.n output files.

    Parses the command line described by script_info, creates the output
     directory, opens one output file per partition (named
     <input_basename>_<i><input_extension>, with i counting from 1), and
     hands the parsed input sequences to partition_seqs.

    Exits through option_parser.error if -n is less than 1. All file
     handles opened here are closed even if an error occurs part-way.
    """
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    # random.choice cannot pick from an empty list, so reject n < 1 up front
    # (before creating anything on disk) rather than failing with a traceback
    # on the first sequence.
    if opts.n < 1:
        option_parser.error(
            'The number of output files (-n) must be at least 1; got %d.'
            % opts.n)

    input_basename, input_ext = splitext(split(opts.input_fp)[1])
    create_dir(opts.output_dir)

    output_fs = []
    input_f = None
    try:
        for i in range(1, opts.n + 1):
            # Format only the pieces, not a template containing the user's
            # paths, so a literal '%' in the directory or file name is safe.
            output_fn = '%s_%d%s' % (input_basename, i, input_ext)
            output_fs.append(open(join(opts.output_dir, output_fn), 'w'))
        # NOTE(review): assumes qiime_open returns a file-like object with a
        # close() method -- confirm against qiime.util.
        input_f = qiime_open(opts.input_fp)
        partition_seqs(MinimalFastaParser(input_f), output_fs)
    finally:
        # Release every handle that was successfully opened, even when
        # opening a later file or writing a record failed.
        if input_f is not None:
            input_f.close()
        for output_f in output_fs:
            output_f.close()
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment