Skip to content

Instantly share code, notes, and snippets.

@apcamargo
Created November 17, 2024 04:59
Show Gist options
  • Save apcamargo/e39dfd15ff673ae27b5a94aa6437d8c6 to your computer and use it in GitHub Desktop.
Save apcamargo/e39dfd15ff673ae27b5a94aa6437d8c6 to your computer and use it in GitHub Desktop.
Calculate the number of effective sequences (Neff) of a A3M multiple sequence alignment
#!/usr/bin/env python
import math
import re
import click
from scipy.cluster.hierarchy import fcluster, linkage
from skbio import DistanceMatrix, Protein, TabularMSA, io
from skbio.sequence.distance import hamming
def parse_msa_file(infile):
"""
Read sequences from a multiple sequence alignment (MSA) file.
Parameters
----------
infile : str
file path to input MSA file in A3M format (like FASTA format, but
lowercase letters will be dropped)
Returns
-------
skbio TabularMSA
"""
seqs = []
for seq in io.read(click.open_file(infile), format="fasta"):
seqs.append(Protein(re.sub("[a-z]", "", str(seq)), metadata=seq.metadata))
return TabularMSA(seqs)
def distance_matrix(msa, ignore_sequence_ids=False):
"""
Compute Hamming distance matrix of an MSA.
Parameters
----------
msa : skbio TabularMSA
Aligned sequences for calculating pairwise Hamming distances
ignore_sequence_ids : bool
Default is False. If true, ignore sequence identifier of alignment.
Useful if identifier got truncated by alignment producing program such
that different sequences collapse to the same identifier.
Returns
-------
skbio DistanceMatrix
"""
key = "id"
if ignore_sequence_ids:
key = None
return DistanceMatrix.from_iterable(msa, hamming, key=key, validate=False)
def cluster_sequences(dm, cutoff):
"""
Perform hierarchical clustering based on a distance matrix.
Parameters
----------
dm : skbio DistanceMatrix
aligned sequences for calculating pairwise Hamming distances
cutoff : float
sequence identity cutoff for defining clusters
Returns
-------
list of int
flat cluster numbers to which sequences are assigned
Notes
-----
returns an empty list if the distance matrix is empty (e.g., there is only
one input sequence)
"""
t = 1.0 - cutoff
return (
list(fcluster(linkage(dm.condensed_form()), t, criterion="distance"))
if dm.size > 1
else []
)
def effective_number_sequences(clusters, length, n_clu=False):
"""
Calculate effective number of sequences based on a clustering scheme.
Parameters
----------
clusters : list of tuple
clustering scheme (list of (sequence ID, flat cluster number assigned
to the sequence))
length : int
length (number of columns) of the multiple sequence alignment
n_clu : bool
return n_cluster instead of neff (default: False)
Returns
-------
float or int
effective number of sequences size: neff = n_cluster / sqrt(length)
or number of clusters (n_cluster)
"""
nc = len(set(clusters))
return nc if n_clu else nc / math.sqrt(length)
@click.command(
context_settings={"show_default": True, "help_option_names": ["-h", "--help"]}
)
@click.argument(
"input",
type=click.Path(resolve_path=True, readable=True, allow_dash=True, exists=True),
)
@click.option(
"--cutoff",
"-c",
required=False,
type=click.FloatRange(min=0.0, max=1.0),
default=0.8,
help=("Sequence identity cutoff for clustering sequences"),
)
@click.option("--name", "-n", required=False, type=str, help=("Name of the MSA."))
def _calculate_neff(input, cutoff, name):
"""
Calculate the number of effective sequences (Neff) of a A3M multiple sequence
alignment.
"""
msa = parse_msa_file(input)
hdm = distance_matrix(msa)
clu = cluster_sequences(hdm, cutoff)
neff = effective_number_sequences(clu, msa.shape[1])
if name:
click.echo(f"{name}\t{neff:.4f}")
else:
click.echo(f"{neff:.4f}")
if __name__ == "__main__":
_calculate_neff()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment