Last active
April 1, 2025 22:20
-
-
Save fedarko/efd6ccb83fbe261add58899b13225018 to your computer and use it in GitHub Desktop.
Replace degenerate nucleotides in a FASTA file with random nucleotides (respecting IUPAC codes)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python3 | |
# | |
# You can run this script as follows: | |
# $ ./rmdegen [input FASTA] [output FASTA] | |
import sys | |
import time | |
import random | |
import pyfastx | |
from collections import Counter | |
random.seed(333) | |
assert len(sys.argv) == 3, "Run this as rmdegen [input FASTA] [output FASTA]" | |
INFILE = sys.argv[1] | |
OUTFILE = sys.argv[2] | |
def get_random_nt(degen): | |
# based on https://www.bioinformatics.org/sms/iupac.html, as of April 2025 | |
if degen == "N": | |
# we check N first because it is the most common of these in practice | |
return random.choice("ACGT") | |
elif degen == "R": | |
return random.choice("AG") | |
elif degen == "Y": | |
return random.choice("CT") | |
elif degen == "S": | |
return random.choice("GC") | |
elif degen == "W": | |
return random.choice("AT") | |
elif degen == "K": | |
return random.choice("GT") | |
elif degen == "M": | |
return random.choice("AC") | |
elif degen == "B": | |
return random.choice("CGT") | |
elif degen == "D": | |
return random.choice("AGT") | |
elif degen == "H": | |
return random.choice("ACT") | |
elif degen == "V": | |
return random.choice("ACG") | |
else: | |
raise ValueError(f"Character {degen} not in IUPAC code?") | |
def pct_str(n, d): | |
pct = 100 * (n / d) | |
return f"{n:,} / {d:,} ({pct:.2f}%)" | |
prefix = "-" * 79 | |
# Based on https://stackoverflow.com/a/34778558 -- we want to append each | |
# randomized sequence to the file | |
t0 = time.time() | |
with open(OUTFILE, "w") as out_fh: | |
print(f"Loading and (if needed) indexing {INFILE}...") | |
indexed_fasta = pyfastx.Fasta(INFILE) | |
num_seqs = len(indexed_fasta.keys()) | |
print(f"Contains {num_seqs:,} sequence(s).") | |
for si, seq_name in enumerate(indexed_fasta.keys(), 1): | |
seq_len = len(indexed_fasta[seq_name]) | |
desc = indexed_fasta[seq_name].description | |
print( | |
f'{prefix}\nOn seq {pct_str(si, num_seqs)}: "{desc}" ' | |
f"({seq_len:,} nt)..." | |
) | |
st0 = time.time() | |
degen_ctr = Counter() | |
nd = 0 | |
out_fh.write(f">{desc}_n2r\n") | |
in_seq = str(indexed_fasta[seq_name]).upper() | |
out_seq = "" | |
for ci, c in enumerate(in_seq, 1): | |
if ci % 1000000 == 0: | |
# ci is 1-indexed | |
print(f" On nt {pct_str(ci, seq_len)}...") | |
if c in "ACGT": | |
out_seq += c | |
else: | |
degen_ctr[c] += 1 | |
nd += 1 | |
out_seq += get_random_nt(c) | |
out_fh.write(f"{out_seq}\n") | |
if len(degen_ctr) > 0: | |
suffix = f"Stats: {degen_ctr}" | |
adj = "new" | |
else: | |
suffix = "No replacements done." | |
adj = "old" | |
print( | |
f"{pct_str(nd, seq_len)} of the nts in this seq were degenerate. " | |
f"{suffix}" | |
) | |
print(f"Wrote out the {adj} sequence to {OUTFILE}.") | |
st1 = time.time() | |
print(f'Processing "{desc}" took {st1 - st0:,.2f} sec.') | |
t1 = time.time() | |
print(f"Took {t1 - t0:,.2f} sec. to process {si:,} seq(s) in {INFILE}.") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment