Last active
December 17, 2023 22:43
-
-
Save fedarko/0af5d41193abfbec168ce5e643fdc9d5 to your computer and use it in GitHub Desktop.
Compute simple statistics (number of reads, total read length, average read length) for a set of (maybe gzipped) FASTA / FASTQ files
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python3
#
# Computes the total number of reads, total read length, and average read
# length of a set of (maybe gzipped) FASTA / FASTQ files. Requires the pyfastx
# library (https://github.com/lmdu/pyfastx). I designed this in the context of
# computing read statistics, but if you have a set of other sequences (e.g.
# contigs) then I guess this would still work for that.
#
# USAGE:
# ./read_stats.py file1.fa [file2.fa ...]
#
# Each input file's suffix should match one of the suffixes in FASTA_SUFFIXES
# or FASTQ_SUFFIXES defined below, optionally with a ".gz" added to the end
# (e.g. "file1.fastq.gz" is fine). Also, you can use globbing (e.g. "*.fa") and
# that should work also.
import sys
import time
import glob
import pyfastx
FASTA_SUFFIXES = ["fa", "fasta", "fna"] | |
FASTQ_SUFFIXES = ["fq", "fastq"] | |
t0 = time.time() | |
def _log(msg): | |
print(f"{time.time() - t0:,.2f}s. {msg}") | |
_log("Starting...") | |
if len(sys.argv) > 1: | |
ir_paths = sys.argv[1:] | |
else: | |
raise ValueError("We need at least 1 arg (FASTA/FASTQ filepath)") | |
num_files_seen = 0 | |
total_num_reads = 0 | |
total_read_length = 0 | |
# Go through each general path -- this could be a single file, or a reference | |
# to multiple files (e.g. "myfiles/*.fastq.gz", which could correspond to | |
# arbitrarily many files) | |
for irp in ir_paths: | |
# Use glob.glob() to convert these to lists of real file paths. | |
# This is of course vulnerable to a race condition if some antagonist | |
# deletes a file in between calling glob.glob() here and when we attempt to | |
# parse it below. But that should trigger an error, and ... look, this is a | |
# script for counting read lengths for a single paper. It should be fine. | |
for irf in glob.glob(irp): | |
tf0 = time.time() | |
num_files_seen += 1 | |
print("-" * 79) | |
_log(f"On file #{num_files_seen:,} ({irf})...") | |
read_kwargs = {} | |
irf_lower = irf.lower() | |
fconst = None | |
for s in FASTA_SUFFIXES: | |
if irf_lower.endswith(s) or irf_lower.endswith(s + ".gz"): | |
fconst = pyfastx.Fasta | |
break | |
if fconst is None: | |
for s in FASTQ_SUFFIXES: | |
if irf_lower.endswith(s) or irf_lower.endswith(s + ".gz"): | |
fconst = pyfastx.Fastq | |
break | |
if fconst is None: | |
raise ValueError(f"Not a FASTA/FASTQ file?: {irf}") | |
f = fconst(irf) | |
file_num_reads = len(f) | |
file_read_length = f.size | |
tf1 = time.time() | |
_log( | |
f"Done processing file #{num_files_seen}; took {tf1 - tf0:,.2f}s." | |
) | |
_log( | |
f"File had {file_num_reads:,} reads of total length " | |
f"{file_read_length:,}." | |
) | |
total_num_reads += file_num_reads | |
total_read_length += file_read_length | |
avg_read_length = total_read_length / total_num_reads | |
t1 = time.time() | |
print("=" * 79) | |
_log(f"Done: processed {num_files_seen:,} files.") | |
_log(f"Total time taken: {t1 - t0:,.2f} sec.") | |
_log(f"Total number of reads: {total_num_reads:,}") | |
_log(f"Total read length: {total_read_length:,}") | |
_log(f"Average read length: {avg_read_length:,}") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment