walterst · December 17, 2018 16:08
diff --git a/random_subsample_fastq.py b/random_subsample_fastq.py
 #!/usr/bin/env

 from sys import argv
 from random import random

 #from gzip import open as gz_open
 from glob import glob

 import gzip
 import os

 from os.path import abspath, basename, exists, dirname, join, splitext, isfile


 from skbio.util import remove_files, create_dir
 from cogent.parse.fastq import MinimalFastqParser

 threshold = 0.03 # random value between 0 and 1 to retain read, using 3% now
 input_dir = argv[1]
 output_dir = argv[2]

 create_dir(output_dir)

 fastq_files = []

 fastq_files = glob(input_dir + "/*.fastq.gz")
        


 for curr_file in fastq_files:

    if curr_file.endswith('.gz'):
        query_reads = gzip.open(curr_file, "rb")
    else:
        query_reads = open(curr_file, "U")
        
    curr_outfile = join(output_dir, basename(curr_file).replace(".fastq.gz", ".fastq"))
    

    out_f = open(curr_outfile, "w")
    for read_data in MinimalFastqParser(query_reads, strict=False):
        if random() < threshold:
            curr_read = "@%s\n" % read_data[0]
            curr_read += "%s\n" % read_data[1]
            curr_read += "+\n"
            curr_read += "%s\n" % read_data[2]
            out_f.write(curr_read)
            
    query_reads.close()
    out_f.close()
	#!/usr/bin/env

	from sys import argv
	from random import random

	#from gzip import open as gz_open
	from glob import glob

	import gzip
	import os

	from os.path import abspath, basename, exists, dirname, join, splitext, isfile


	from skbio.util import remove_files, create_dir
	from cogent.parse.fastq import MinimalFastqParser

	threshold = 0.03 # random value between 0 and 1 to retain read, using 3% now
	input_dir = argv[1]
	output_dir = argv[2]

	create_dir(output_dir)

	fastq_files = []

	fastq_files = glob(input_dir + "/*.fastq.gz")



	for curr_file in fastq_files:

	if curr_file.endswith('.gz'):
	query_reads = gzip.open(curr_file, "rb")
	else:
	query_reads = open(curr_file, "U")

	curr_outfile = join(output_dir, basename(curr_file).replace(".fastq.gz", ".fastq"))


	out_f = open(curr_outfile, "w")
	for read_data in MinimalFastqParser(query_reads, strict=False):
	if random() < threshold:
	curr_read = "@%s\n" % read_data[0]
	curr_read += "%s\n" % read_data[1]
	curr_read += "+\n"
	curr_read += "%s\n" % read_data[2]
	out_f.write(curr_read)

	query_reads.close()
	out_f.close()