Created
November 17, 2024 04:59
-
-
Save apcamargo/e39dfd15ff673ae27b5a94aa6437d8c6 to your computer and use it in GitHub Desktop.
Calculate the number of effective sequences (Neff) of a A3M multiple sequence alignment
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import math | |
import re | |
import click | |
from scipy.cluster.hierarchy import fcluster, linkage | |
from skbio import DistanceMatrix, Protein, TabularMSA, io | |
from skbio.sequence.distance import hamming | |
def parse_msa_file(infile): | |
""" | |
Read sequences from a multiple sequence alignment (MSA) file. | |
Parameters | |
---------- | |
infile : str | |
file path to input MSA file in A3M format (like FASTA format, but | |
lowercase letters will be dropped) | |
Returns | |
------- | |
skbio TabularMSA | |
""" | |
seqs = [] | |
for seq in io.read(click.open_file(infile), format="fasta"): | |
seqs.append(Protein(re.sub("[a-z]", "", str(seq)), metadata=seq.metadata)) | |
return TabularMSA(seqs) | |
def distance_matrix(msa, ignore_sequence_ids=False): | |
""" | |
Compute Hamming distance matrix of an MSA. | |
Parameters | |
---------- | |
msa : skbio TabularMSA | |
Aligned sequences for calculating pairwise Hamming distances | |
ignore_sequence_ids : bool | |
Default is False. If true, ignore sequence identifier of alignment. | |
Useful if identifier got truncated by alignment producing program such | |
that different sequences collapse to the same identifier. | |
Returns | |
------- | |
skbio DistanceMatrix | |
""" | |
key = "id" | |
if ignore_sequence_ids: | |
key = None | |
return DistanceMatrix.from_iterable(msa, hamming, key=key, validate=False) | |
def cluster_sequences(dm, cutoff): | |
""" | |
Perform hierarchical clustering based on a distance matrix. | |
Parameters | |
---------- | |
dm : skbio DistanceMatrix | |
aligned sequences for calculating pairwise Hamming distances | |
cutoff : float | |
sequence identity cutoff for defining clusters | |
Returns | |
------- | |
list of int | |
flat cluster numbers to which sequences are assigned | |
Notes | |
----- | |
returns an empty list if the distance matrix is empty (e.g., there is only | |
one input sequence) | |
""" | |
t = 1.0 - cutoff | |
return ( | |
list(fcluster(linkage(dm.condensed_form()), t, criterion="distance")) | |
if dm.size > 1 | |
else [] | |
) | |
def effective_number_sequences(clusters, length, n_clu=False): | |
""" | |
Calculate effective number of sequences based on a clustering scheme. | |
Parameters | |
---------- | |
clusters : list of tuple | |
clustering scheme (list of (sequence ID, flat cluster number assigned | |
to the sequence)) | |
length : int | |
length (number of columns) of the multiple sequence alignment | |
n_clu : bool | |
return n_cluster instead of neff (default: False) | |
Returns | |
------- | |
float or int | |
effective number of sequences size: neff = n_cluster / sqrt(length) | |
or number of clusters (n_cluster) | |
""" | |
nc = len(set(clusters)) | |
return nc if n_clu else nc / math.sqrt(length) | |
@click.command( | |
context_settings={"show_default": True, "help_option_names": ["-h", "--help"]} | |
) | |
@click.argument( | |
"input", | |
type=click.Path(resolve_path=True, readable=True, allow_dash=True, exists=True), | |
) | |
@click.option( | |
"--cutoff", | |
"-c", | |
required=False, | |
type=click.FloatRange(min=0.0, max=1.0), | |
default=0.8, | |
help=("Sequence identity cutoff for clustering sequences"), | |
) | |
@click.option("--name", "-n", required=False, type=str, help=("Name of the MSA.")) | |
def _calculate_neff(input, cutoff, name): | |
""" | |
Calculate the number of effective sequences (Neff) of a A3M multiple sequence | |
alignment. | |
""" | |
msa = parse_msa_file(input) | |
hdm = distance_matrix(msa) | |
clu = cluster_sequences(hdm, cutoff) | |
neff = effective_number_sequences(clu, msa.shape[1]) | |
if name: | |
click.echo(f"{name}\t{neff:.4f}") | |
else: | |
click.echo(f"{neff:.4f}") | |
if __name__ == "__main__": | |
_calculate_neff() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment