fedarko · April 1, 2025 22:20
diff --git a/rmdegen.py b/rmdegen.py
 #! /usr/bin/env python3
 #
 # You can run this script as follows:
 # $ ./rmdegen [input FASTA] [output FASTA]

 import sys
 import time
 import random
 import pyfastx
 from collections import Counter

 random.seed(333)

 assert len(sys.argv) == 3, "Run this as rmdegen [input FASTA] [output FASTA]"

 INFILE = sys.argv[1]
 OUTFILE = sys.argv[2]


 def get_random_nt(degen):
    # based on https://www.bioinformatics.org/sms/iupac.html, as of April 2025
    if degen == "N":
        # we check N first because it is the most common of these in practice
        return random.choice("ACGT")
    elif degen == "R":
        return random.choice("AG")
    elif degen == "Y":
        return random.choice("CT")
    elif degen == "S":
        return random.choice("GC")
    elif degen == "W":
        return random.choice("AT")
    elif degen == "K":
        return random.choice("GT")
    elif degen == "M":
        return random.choice("AC")
    elif degen == "B":
        return random.choice("CGT")
    elif degen == "D":
        return random.choice("AGT")
    elif degen == "H":
        return random.choice("ACT")
    elif degen == "V":
        return random.choice("ACG")
    else:
        raise ValueError(f"Character {degen} not in IUPAC code?")


 def pct_str(n, d):
    pct = 100 * (n / d)
    return f"{n:,} / {d:,} ({pct:.2f}%)"


 prefix = "-" * 79
 # Based on https://stackoverflow.com/a/34778558 -- we want to append each
 # randomized sequence to the file
 t0 = time.time()
 with open(OUTFILE, "w") as out_fh:
    print(f"Loading and (if needed) indexing {INFILE}...")
    indexed_fasta = pyfastx.Fasta(INFILE)
    num_seqs = len(indexed_fasta.keys())
    print(f"Contains {num_seqs:,} sequence(s).")
    for si, seq_name in enumerate(indexed_fasta.keys(), 1):
        seq_len = len(indexed_fasta[seq_name])
        desc = indexed_fasta[seq_name].description
        print(
            f'{prefix}\nOn seq {pct_str(si, num_seqs)}: "{desc}" '
            f"({seq_len:,} nt)..."
        )
        st0 = time.time()
        degen_ctr = Counter()
        nd = 0
        out_fh.write(f">{desc}_n2r\n")
        in_seq = str(indexed_fasta[seq_name]).upper()
        out_seq = ""
        for ci, c in enumerate(in_seq, 1):
            if ci % 1000000 == 0:
                # ci is 1-indexed
                print(f"  On nt {pct_str(ci, seq_len)}...")
            if c in "ACGT":
                out_seq += c
            else:
                degen_ctr[c] += 1
                nd += 1
                out_seq += get_random_nt(c)
        out_fh.write(f"{out_seq}\n")
        if len(degen_ctr) > 0:
            suffix = f"Stats: {degen_ctr}"
            adj = "new"
        else:
            suffix = "No replacements done."
            adj = "old"
        print(
            f"{pct_str(nd, seq_len)} of the nts in this seq were degenerate. "
            f"{suffix}"
        )
        print(f"Wrote out the {adj} sequence to {OUTFILE}.")
        st1 = time.time()
        print(f'Processing "{desc}" took {st1 - st0:,.2f} sec.')
 t1 = time.time()
 print(f"Took {t1 - t0:,.2f} sec. to process {si:,} seq(s) in {INFILE}.")
	#! /usr/bin/env python3
	#
	# You can run this script as follows:
	# $ ./rmdegen [input FASTA] [output FASTA]

	import sys
	import time
	import random
	import pyfastx
	from collections import Counter

	random.seed(333)

	assert len(sys.argv) == 3, "Run this as rmdegen [input FASTA] [output FASTA]"

	INFILE = sys.argv[1]
	OUTFILE = sys.argv[2]


	def get_random_nt(degen):
	# based on https://www.bioinformatics.org/sms/iupac.html, as of April 2025
	if degen == "N":
	# we check N first because it is the most common of these in practice
	return random.choice("ACGT")
	elif degen == "R":
	return random.choice("AG")
	elif degen == "Y":
	return random.choice("CT")
	elif degen == "S":
	return random.choice("GC")
	elif degen == "W":
	return random.choice("AT")
	elif degen == "K":
	return random.choice("GT")
	elif degen == "M":
	return random.choice("AC")
	elif degen == "B":
	return random.choice("CGT")
	elif degen == "D":
	return random.choice("AGT")
	elif degen == "H":
	return random.choice("ACT")
	elif degen == "V":
	return random.choice("ACG")
	else:
	raise ValueError(f"Character {degen} not in IUPAC code?")


	def pct_str(n, d):
	pct = 100 * (n / d)
	return f"{n:,} / {d:,} ({pct:.2f}%)"


	prefix = "-" * 79
	# Based on https://stackoverflow.com/a/34778558 -- we want to append each
	# randomized sequence to the file
	t0 = time.time()
	with open(OUTFILE, "w") as out_fh:
	print(f"Loading and (if needed) indexing {INFILE}...")
	indexed_fasta = pyfastx.Fasta(INFILE)
	num_seqs = len(indexed_fasta.keys())
	print(f"Contains {num_seqs:,} sequence(s).")
	for si, seq_name in enumerate(indexed_fasta.keys(), 1):
	seq_len = len(indexed_fasta[seq_name])
	desc = indexed_fasta[seq_name].description
	print(
	f'{prefix}\nOn seq {pct_str(si, num_seqs)}: "{desc}" '
	f"({seq_len:,} nt)..."
	)
	st0 = time.time()
	degen_ctr = Counter()
	nd = 0
	out_fh.write(f">{desc}_n2r\n")
	in_seq = str(indexed_fasta[seq_name]).upper()
	out_seq = ""
	for ci, c in enumerate(in_seq, 1):
	if ci % 1000000 == 0:
	# ci is 1-indexed
	print(f" On nt {pct_str(ci, seq_len)}...")
	if c in "ACGT":
	out_seq += c
	else:
	degen_ctr[c] += 1
	nd += 1
	out_seq += get_random_nt(c)
	out_fh.write(f"{out_seq}\n")
	if len(degen_ctr) > 0:
	suffix = f"Stats: {degen_ctr}"
	adj = "new"
	else:
	suffix = "No replacements done."
	adj = "old"
	print(
	f"{pct_str(nd, seq_len)} of the nts in this seq were degenerate. "
	f"{suffix}"
	)
	print(f"Wrote out the {adj} sequence to {OUTFILE}.")
	st1 = time.time()
	print(f'Processing "{desc}" took {st1 - st0:,.2f} sec.')
	t1 = time.time()
	print(f"Took {t1 - t0:,.2f} sec. to process {si:,} seq(s) in {INFILE}.")