Testing out ILR transformation and compositional data testing
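
The script below simulates compositional data (rows summing to 1), applies the isometric log-ratio (ILR) transform, and compares analyses in the raw simplex versus ILR space: PCA, logistic regression, and dedicated compositional tests.

As a minimal standalone sketch of the core transform (using `closure` and `ilr` from scikit-bio, the same library the script uses):

    import numpy as np
    from skbio.stats.composition import closure, ilr

    comp = closure(np.array([2.0, 6.0, 2.0]))  # project onto the simplex: parts sum to 1
    coords = ilr(comp)  # D parts -> D - 1 unconstrained real coordinates
    print(comp.shape, coords.shape)  # (3,) -> (2,)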
# /// script
# dependencies = [
# "numpy",
# "pandas",
# "statsmodels",
# "scikit-learn",
# "scikit-bio",
# "matplotlib",
# ]
# ///
# Run with: uv run --script modeling_compositional_data.py
import numpy as np
import pandas as pd
import statsmodels.api as sm
from skbio.stats.composition import ilr, dirmult_ttest, ancom
from sklearn import set_config
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
np.random.seed(24)
set_config(transform_output="pandas")
figkws = dict(bbox_inches="tight", dpi=200)
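
# The pipeline: simulate compositions whose binary labels depend on the ILR
# coordinates, then contrast raw-space and ILR-space analyses side by side.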
def main():
    n_obs = 5000
    # len == (parts - 1)
    beta_ilr = [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 5, 5, 10, 10, -1, -1, -5, -5, -10, -10]
    # len == parts
    shapes = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3]
    scales = None
    intercept = 0
    X, Y = generate_synthetic_data(n_obs, beta_ilr, intercept, shapes, scales)
    Xt = pd.DataFrame(
        ilr(X), index=X.index, columns=[f"ilr_{i}" for i in range(1, X.shape[1])]
    )
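    # ILR maps the D-part simplex to (D - 1) unconstrained real coordinates,
    # so Xt has one fewer column than X and standard Euclidean methods apply.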
    # Observe the 'real' difference between groups in the two spaces
    mean = X.mean()
    print(X.groupby(Y).mean().T)
    print(X.groupby(Y).mean().T.apply(lambda x: np.log(x[1] / x[0]), axis=1))
    meant = Xt.mean()
    print(Xt.groupby(Y).mean().T)
    print(Xt.groupby(Y).mean().T.apply(lambda x: x[1] - x[0], axis=1))
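    # The ILR group mean differences should roughly track the injected beta_ilr
    # coefficients, while raw-space differences are entangled by the closure constraint.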
    # Try to relate the two spaces
    corr = X.join(Xt * -1).corr("spearman").loc[X.columns, Xt.columns]
    g = sns.clustermap(
        corr,
        center=0,
        cmap="vlag",
        dendrogram_ratio=0.05,
        row_cluster=False,
        col_cluster=False,
        figsize=(5, 5),
        cbar_kws={"label": "Spearman"},
    )
    g.figure.savefig("modeling_proportional_data.feature_corr.svg", **figkws)
    # PCA
    model1 = PCA()
    pca1 = model1.fit_transform((X - X.mean()) / X.std())
    model2 = PCA()
    pca2 = model2.fit_transform((Xt - Xt.mean()) / Xt.std())
    fig, axes = plt.subplots(1, 3, figsize=(3 * 4, 1 * 4))
    axes[0].scatter(
        pca1.iloc[:, 0], pca1.iloc[:, 1], label="raw", s=1, alpha=0.2, rasterized=True
    )
    axes[1].scatter(
        pca2.iloc[:, 0], pca2.iloc[:, 1], label="ilr", s=1, alpha=0.2, rasterized=True
    )
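    # Closure makes X rank-deficient (rows sum to 1), so its last PC carries
    # ~zero variance; drop it to compare variance ratios pairwise with the ILR PCs.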
    axes[2].scatter(
        model1.explained_variance_ratio_[:-1], model2.explained_variance_ratio_
    )
    vmin, vmax = axes[2].get_xlim()
    axes[2].set(xlim=(vmin, vmax), ylim=(vmin, vmax))
    axes[2].plot([vmin, vmax], [vmin, vmax], color="grey", ls="--")
    axes[0].set(xlabel="PC1", ylabel="PC2", title="raw")
    axes[1].set(xlabel="PC1", ylabel="PC2", title="ilr")
    axes[2].set(xlabel="raw", ylabel="ilr", title="variance ratio")
    fig.savefig("modeling_proportional_data.pca.svg", **figkws)
    # Logistic regression
    llf1, res1 = fit_logistic_regression(X, Y)
    llf2, res2 = fit_logistic_regression(Xt, Y)
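    # Note: raw proportions sum to 1, so together with the intercept the raw
    # design matrix is perfectly collinear; the raw-space fit can still be
    # computed via a pseudoinverse, but its coefficients are not uniquely identified.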
    for d in [res1, res2]:
        d["-log10(P)"] = (
            (-np.log10(d["P>|z|"])).replace([np.inf], 300).replace([-np.inf], -300)
        )
    fig, axes = plt.subplots(2, 2, figsize=(2 * 4, 2 * 4))
    axes[0, 0].scatter(mean, res1.drop("const").loc[:, "Coef."], label="raw")
    axes[0, 1].scatter(meant, res2.drop("const").loc[:, "Coef."], label="ilr")
    for llf, ax in zip([llf1, llf2], axes[0]):
        ax.set(xlabel="mean", ylabel="coefficient", title=f"log-likelihood: {llf:.2f}")
    axes[1, 0].scatter(
        res1.drop("const").loc[:, "Coef."],
        res1.drop("const").loc[:, "-log10(P)"],
        label="raw",
    )
    axes[1, 1].scatter(
        res2.drop("const").loc[:, "Coef."],
        res2.drop("const").loc[:, "-log10(P)"],
        label="ilr",
    )
    for ax in axes[1]:
        ax.set(xlabel="coefficient", ylabel="-log10(P)")
    fig.savefig("modeling_proportional_data.svg", **figkws)
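    # Dedicated compositional tests operating directly on the composition:
    # a Dirichlet-multinomial t-test and ANCOM (both from scikit-bio).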
    t1 = dirmult_ttest(X, Y, 1, 0)
    t2 = ancom(X, Y)
    print(t1)
    print(t2)

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def generate_synthetic_data(n_obs, beta_ilr, intercept=0.0, shapes=None, scales=None):
    n_parts = len(beta_ilr) + 1
    cols = [f"feature_{i}" for i in range(n_parts)]
    # Use default parameters if none provided
    if shapes is None:
        shapes = [2.0] * n_parts
    if scales is None:
        scales = [1.0] * n_parts
    # Generate raw predictors with per-feature gamma parameters
    data = {
        col: np.random.gamma(shape=shapes[i], scale=scales[i], size=n_obs)
        for i, col in enumerate(cols)
    }
    X_raw = pd.DataFrame(data, columns=cols)
    # Normalize rows to obtain compositional data (each row sums to 1)
    X = X_raw.div(X_raw.sum(axis=1), axis=0)
    # Apply the ILR transform; coordinates are labeled ilr_1, ilr_2, ...
    X_ilr = pd.DataFrame(
        ilr(X), index=X.index, columns=[f"ilr_{i}" for i in range(1, n_parts)]
    )
    # Compute the linear predictor in ILR space and simulate a binary target
    lin_pred = intercept + X_ilr.dot(pd.Series(beta_ilr, index=X_ilr.columns))
    p = sigmoid(lin_pred)
    Y = pd.Series((np.random.rand(n_obs) < p).astype(int), name="target")
    return X, Y

def fit_logistic_regression(X, Y):
    # Binomial GLM with a logit link, i.e. logistic regression
    model = sm.GLM(Y, sm.add_constant(X), family=sm.families.Binomial())
    result = model.fit()
    # Return the model log-likelihood and the coefficient table
    return result.llf, result.summary2().tables[1]

if __name__ == "__main__":
    main()