Testing out the ILR transformation and statistical testing of compositional data
# /// script
# dependencies = [
#     "numpy",
#     "pandas",
#     "statsmodels",
#     "scikit-learn",
#     "scikit-bio",
#     "matplotlib",
#     "seaborn",
# ]
# ///
# Run with: uv run --script modeling_compositional_data.py
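"""
Compare modeling of compositional data in raw proportion space vs.
ILR-transformed space: group differences, feature correlations, PCA
structure, logistic regression, and dedicated compositional tests
(Dirichlet-multinomial t-test and ANCOM).
"""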
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
from skbio.stats.composition import ancom, dirmult_ttest, ilr
from sklearn import set_config
from sklearn.decomposition import PCA

np.random.seed(24)
# Have sklearn transformers (e.g. PCA) return DataFrames instead of arrays
set_config(transform_output="pandas")
figkws = dict(bbox_inches="tight", dpi=200)
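
# Background note (added, not in the original gist): the isometric log-ratio
# (ILR) transform maps a D-part composition from the simplex to unconstrained
# (D-1)-dimensional real space via orthonormal log-contrasts,
#   ilr(x) = V^T clr(x),  with clr(x) = log(x / g(x)),
# where g(x) is the geometric mean of the parts and V is an orthonormal basis.
# This makes Euclidean tools (PCA, linear models) geometrically valid.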


def main():
    n_obs = 5000
    # len == (parts - 1)
    beta_ilr = [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 5, 5, 10, 10, -1, -1, -5, -5, -10, -10]
    # len == parts
    shapes = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3]
    scales = None
    intercept = 0
    X, Y = generate_synthetic_data(n_obs, beta_ilr, intercept, shapes, scales)
    Xt = pd.DataFrame(
        ilr(X), index=X.index, columns=[f"ilr_{i}" for i in range(1, X.shape[1])]
    )
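    # Sanity checks (added for illustration): rows are closed compositions
    # and the ILR transform drops exactly one dimension.
    assert np.allclose(X.sum(axis=1), 1.0)
    assert Xt.shape[1] == X.shape[1] - 1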
    # Observe the 'real' difference between groups in the two spaces
    mean = X.mean()
    print(X.groupby(Y).mean().T)
    print(X.groupby(Y).mean().T.apply(lambda x: np.log(x[1] / x[0]), axis=1))
    meant = Xt.mean()
    print(Xt.groupby(Y).mean().T)
    print(Xt.groupby(Y).mean().T.apply(lambda x: x[1] - x[0], axis=1))
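    # Note (added): between-group differences that are log fold-changes in
    # raw proportion space appear as plain additive differences in ILR space,
    # since ILR coordinates are log-contrasts of the parts.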
    # Try to relate the two spaces (ILR coordinates are sign-flipped here,
    # presumably to orient the correlations for display)
    corr = X.join(Xt * -1).corr("spearman").loc[X.columns, Xt.columns]
    g = sns.clustermap(
        corr,
        center=0,
        cmap="vlag",
        dendrogram_ratio=0.05,
        row_cluster=False,
        col_cluster=False,
        figsize=(5, 5),
        cbar_kws={"label": "Spearman"},
    )
    g.figure.savefig("modeling_proportional_data.feature_corr.svg", **figkws)
    # PCA
    model1 = PCA()
    pca1 = model1.fit_transform((X - X.mean()) / X.std())
    model2 = PCA()
    pca2 = model2.fit_transform((Xt - Xt.mean()) / Xt.std())
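    # Note (added): PCA directly on proportions is affected by the unit-sum
    # (closure) constraint, which induces spurious negative correlations
    # between parts; the ILR coordinates are unconstrained, so PCA there
    # reflects the data's actual covariance structure.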
    fig, axes = plt.subplots(1, 3, figsize=(3 * 4, 1 * 4))
    axes[0].scatter(
        pca1.iloc[:, 0], pca1.iloc[:, 1], label="raw", s=1, alpha=0.2, rasterized=True
    )
    axes[1].scatter(
        pca2.iloc[:, 0], pca2.iloc[:, 1], label="ilr", s=1, alpha=0.2, rasterized=True
    )
    # Compare explained variance ratios (raw space has one extra component)
    axes[2].scatter(
        model1.explained_variance_ratio_[:-1], model2.explained_variance_ratio_
    )
    vmin, vmax = axes[2].get_xlim()
    axes[2].set(xlim=(vmin, vmax), ylim=(vmin, vmax))
    axes[2].plot([vmin, vmax], [vmin, vmax], color="grey", ls="--")
    axes[0].set(xlabel="PC1", ylabel="PC2", title="raw")
    axes[1].set(xlabel="PC1", ylabel="PC2", title="ilr")
    axes[2].set(xlabel="raw", ylabel="ilr", title="variance ratio")
    fig.savefig("modeling_proportional_data.pca.svg", **figkws)
    # Logistic regression
    llf1, res1 = fit_logistic_regression(X, Y)
    llf2, res2 = fit_logistic_regression(Xt, Y)
    for d in [res1, res2]:
        d["-log10(P)"] = (
            (-np.log10(d["P>|z|"])).replace([np.inf], 300).replace([-np.inf], -300)
        )
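    # Note (added): P-values that underflow to 0 give -log10(P) == inf;
    # they are capped at 300, near the float64 limit (~1e-308).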
    fig, axes = plt.subplots(2, 2, figsize=(2 * 4, 2 * 4))
    axes[0, 0].scatter(mean, res1.drop("const").loc[:, "Coef."], label="raw")
    axes[0, 1].scatter(meant, res2.drop("const").loc[:, "Coef."], label="ilr")
    for llf, ax in zip([llf1, llf2], axes[0]):
        ax.set(xlabel="mean", ylabel="coefficient", title=f"log-likelihood: {llf:.2f}")
    axes[1, 0].scatter(
        res1.drop("const").loc[:, "Coef."],
        res1.drop("const").loc[:, "-log10(P)"],
        label="raw",
    )
    axes[1, 1].scatter(
        res2.drop("const").loc[:, "Coef."],
        res2.drop("const").loc[:, "-log10(P)"],
        label="ilr",
    )
    for ax in axes[1]:
        ax.set(xlabel="coefficient", ylabel="-log10(P)")
    fig.savefig("modeling_proportional_data.svg", **figkws)
    t1 = dirmult_ttest(X, Y, 1, 0)
    t2 = ancom(X, Y)
    print(t1)
    print(t2)


def sigmoid(x):
    return 1 / (1 + np.exp(-x))


def generate_synthetic_data(n_obs, beta_ilr, intercept=0.0, shapes=None, scales=None):
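    """
    Simulate compositional predictors with a known ILR-space effect.

    Gamma-distributed positive features are closed to proportions, ILR
    transformed, combined linearly with `beta_ilr` plus `intercept`, and
    passed through a sigmoid to draw a binary target.
    """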
    n_parts = len(beta_ilr) + 1
    cols = [f"feature_{i}" for i in range(n_parts)]
    # Use default parameters if none provided
    if shapes is None:
        shapes = [2.0] * n_parts
    if scales is None:
        scales = [1.0] * n_parts
    # Generate raw positive predictors with per-feature gamma parameters
    data = {
        col: np.random.gamma(shape=shapes[i], scale=scales[i], size=n_obs)
        for i, col in enumerate(cols)
    }
    X_raw = pd.DataFrame(data, columns=cols)
    # Normalize rows to obtain compositional data
    X = X_raw.div(X_raw.sum(axis=1), axis=0)
    # Apply the ILR transform; label coordinates ilr_1, ..., ilr_{n_parts - 1}
    X_ilr = pd.DataFrame(
        ilr(X), index=X.index, columns=[f"ilr_{i}" for i in range(1, n_parts)]
    )
    # Compute the linear predictor and simulate the binary target
    lin_pred = intercept + X_ilr.dot(pd.Series(beta_ilr, index=X_ilr.columns))
    p = sigmoid(lin_pred)
    Y = pd.Series((np.random.rand(n_obs) < p).astype(int), name="target")
    return X, Y


def fit_logistic_regression(X, Y):
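    """Fit a binomial GLM of Y on X (with intercept); return the model
    log-likelihood and the coefficient table."""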
    model = sm.GLM(Y, sm.add_constant(X), family=sm.families.Binomial())
    result = model.fit()
    return result.llf, result.summary2().tables[1]


if __name__ == "__main__":
    main()