Last active
December 17, 2023 22:43
-
-
Save fedarko/0af5d41193abfbec168ce5e643fdc9d5 to your computer and use it in GitHub Desktop.
Compute simple statistics (number of reads, total read length, average read length) for a set of (maybe gzipped) FASTA / FASTQ files
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python3
#
# Computes the total number of reads, total read length, and average read
# length of a set of (maybe gzipped) FASTA / FASTQ files. Requires the pyfastx
# library (https://github.com/lmdu/pyfastx). I designed this in the context of
# computing read statistics, but if you have a set of other sequences (e.g.
# contigs) then I guess this would still work for that.
#
# USAGE:
# ./read_stats.py file1.fa [file2.fa ...]
#
# Each input file's suffix should match one of the suffixes in FASTA_SUFFIXES
# or FASTQ_SUFFIXES defined below, optionally with a ".gz" added to the end
# (e.g. "file1.fastq.gz" is fine). Also, you can use globbing (e.g. "*.fa") and
# that should work also.
import sys
import time
import glob
import pyfastx
FASTA_SUFFIXES = ["fa", "fasta", "fna"] | |
FASTQ_SUFFIXES = ["fq", "fastq"] | |
t0 = time.time() | |
def _log(msg): | |
print(f"{time.time() - t0:,.2f}s. {msg}") | |
_log("Starting...") | |
if len(sys.argv) > 1: | |
ir_paths = sys.argv[1:] | |
else: | |
raise ValueError("We need at least 1 arg (FASTA/FASTQ filepath)") | |
num_files_seen = 0 | |
total_num_reads = 0 | |
total_read_length = 0 | |
# Go through each general path -- this could be a single file, or a reference | |
# to multiple files (e.g. "myfiles/*.fastq.gz", which could correspond to | |
# arbitrarily many files) | |
for irp in ir_paths: | |
# Use glob.glob() to convert these to lists of real file paths. | |
# This is of course vulnerable to a race condition if some antagonist | |
# deletes a file in between calling glob.glob() here and when we attempt to | |
# parse it below. But that should trigger an error, and ... look, this is a | |
# script for counting read lengths for a single paper. It should be fine. | |
for irf in glob.glob(irp): | |
tf0 = time.time() | |
num_files_seen += 1 | |
print("-" * 79) | |
_log(f"On file #{num_files_seen:,} ({irf})...") | |
read_kwargs = {} | |
irf_lower = irf.lower() | |
fconst = None | |
for s in FASTA_SUFFIXES: | |
if irf_lower.endswith(s) or irf_lower.endswith(s + ".gz"): | |
fconst = pyfastx.Fasta | |
break | |
if fconst is None: | |
for s in FASTQ_SUFFIXES: | |
if irf_lower.endswith(s) or irf_lower.endswith(s + ".gz"): | |
fconst = pyfastx.Fastq | |
break | |
if fconst is None: | |
raise ValueError(f"Not a FASTA/FASTQ file?: {irf}") | |
f = fconst(irf) | |
file_num_reads = len(f) | |
file_read_length = f.size | |
tf1 = time.time() | |
_log( | |
f"Done processing file #{num_files_seen}; took {tf1 - tf0:,.2f}s." | |
) | |
_log( | |
f"File had {file_num_reads:,} reads of total length " | |
f"{file_read_length:,}." | |
) | |
total_num_reads += file_num_reads | |
total_read_length += file_read_length | |
avg_read_length = total_read_length / total_num_reads | |
t1 = time.time() | |
print("=" * 79) | |
_log(f"Done: processed {num_files_seen:,} files.") | |
_log(f"Total time taken: {t1 - t0:,.2f} sec.") | |
_log(f"Total number of reads: {total_num_reads:,}") | |
_log(f"Total read length: {total_read_length:,}") | |
_log(f"Average read length: {avg_read_length:,}") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment