Skip to content

Instantly share code, notes, and snippets.

@fedarko
Last active April 1, 2025 22:20
Show Gist options
  • Save fedarko/efd6ccb83fbe261add58899b13225018 to your computer and use it in GitHub Desktop.
Save fedarko/efd6ccb83fbe261add58899b13225018 to your computer and use it in GitHub Desktop.
Replace degenerate nucleotides in a FASTA file with random nucleotides (respecting IUPAC codes)
#! /usr/bin/env python3
#
# You can run this script as follows:
# $ ./rmdegen [input FASTA] [output FASTA]
import sys
import time
import random
import pyfastx
from collections import Counter
random.seed(333)
assert len(sys.argv) == 3, "Run this as rmdegen [input FASTA] [output FASTA]"
INFILE = sys.argv[1]
OUTFILE = sys.argv[2]
def get_random_nt(degen):
# based on https://www.bioinformatics.org/sms/iupac.html, as of April 2025
if degen == "N":
# we check N first because it is the most common of these in practice
return random.choice("ACGT")
elif degen == "R":
return random.choice("AG")
elif degen == "Y":
return random.choice("CT")
elif degen == "S":
return random.choice("GC")
elif degen == "W":
return random.choice("AT")
elif degen == "K":
return random.choice("GT")
elif degen == "M":
return random.choice("AC")
elif degen == "B":
return random.choice("CGT")
elif degen == "D":
return random.choice("AGT")
elif degen == "H":
return random.choice("ACT")
elif degen == "V":
return random.choice("ACG")
else:
raise ValueError(f"Character {degen} not in IUPAC code?")
def pct_str(n, d):
pct = 100 * (n / d)
return f"{n:,} / {d:,} ({pct:.2f}%)"
prefix = "-" * 79
# Based on https://stackoverflow.com/a/34778558 -- we want to append each
# randomized sequence to the file
t0 = time.time()
with open(OUTFILE, "w") as out_fh:
print(f"Loading and (if needed) indexing {INFILE}...")
indexed_fasta = pyfastx.Fasta(INFILE)
num_seqs = len(indexed_fasta.keys())
print(f"Contains {num_seqs:,} sequence(s).")
for si, seq_name in enumerate(indexed_fasta.keys(), 1):
seq_len = len(indexed_fasta[seq_name])
desc = indexed_fasta[seq_name].description
print(
f'{prefix}\nOn seq {pct_str(si, num_seqs)}: "{desc}" '
f"({seq_len:,} nt)..."
)
st0 = time.time()
degen_ctr = Counter()
nd = 0
out_fh.write(f">{desc}_n2r\n")
in_seq = str(indexed_fasta[seq_name]).upper()
out_seq = ""
for ci, c in enumerate(in_seq, 1):
if ci % 1000000 == 0:
# ci is 1-indexed
print(f" On nt {pct_str(ci, seq_len)}...")
if c in "ACGT":
out_seq += c
else:
degen_ctr[c] += 1
nd += 1
out_seq += get_random_nt(c)
out_fh.write(f"{out_seq}\n")
if len(degen_ctr) > 0:
suffix = f"Stats: {degen_ctr}"
adj = "new"
else:
suffix = "No replacements done."
adj = "old"
print(
f"{pct_str(nd, seq_len)} of the nts in this seq were degenerate. "
f"{suffix}"
)
print(f"Wrote out the {adj} sequence to {OUTFILE}.")
st1 = time.time()
print(f'Processing "{desc}" took {st1 - st0:,.2f} sec.')
t1 = time.time()
print(f"Took {t1 - t0:,.2f} sec. to process {si:,} seq(s) in {INFILE}.")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment