Last active
September 25, 2025 15:25
-
-
Save ogrisel/936d5f455969fef19629d89df4e1db54 to your computer and use it in GitHub Desktop.
Non unique Markov blanket example.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# %% | |
import numpy as np | |
from sklearn.ensemble import RandomForestRegressor | |
from sklearn.model_selection import cross_validate | |
# Number of synthetic samples, shared by the data generation and the shape
# check below so the two cannot drift apart.
n_samples = 100_000

# Seeded generator so that the synthetic dataset is reproducible.
rng = np.random.default_rng(0)

# Two latent variables per sample.
Z = rng.random((n_samples, 2))

# The target is the sum of the two latent variables.
y = Z.sum(axis=1)

# The observed features are quadratic polynomials of the latent variables:
# column 0 is Z0**2, column 1 is Z1**2 and column 2 is the product Z0 * Z1.
X = np.hstack([Z**2, Z[:, [0]] * Z[:, [1]]])
assert X.shape == (n_samples, 3)
# %% [markdown]
#
# Using all three features leads to a near-perfect $R^2$ because all of the
# target variance can be explained by the information contained in the
# features (the RF model is expressive enough and the sample size is large
# enough).
# %%
# Cross-validate a random forest on all three observed features.
# return_estimator=True keeps the fitted forest of each fold so that its
# feature importances can be printed afterwards.
cv_results = cross_validate(
    RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=0),
    X,
    y,
    scoring="r2",
    return_estimator=True,
    cv=2,
)
print(f"R^2 using all 3 features: {cv_results['test_score'].mean():.3f}")

# One importance vector per cross-validation fold.
for rf in cv_results["estimator"]:
    print(rf.feature_importances_.round(3))
# %% [markdown]
#
# Using any pair of features out of the 3 also leads to a near-perfect $R^2$:
# since the latent variables are non-negative, both of them (and hence the
# target) can be recovered from any two features, e.g. $Z_0 = \sqrt{X_0}$ and
# $Z_1 = X_2 / \sqrt{X_0}$. Again, the RF model is expressive enough and the
# sample size is large enough.
# %%
# Repeat the experiment for every pair of observed features (column indices
# into X), reporting the mean test R^2 and the per-fold importances of each
# pair.
for feature_set in ([0, 1], [1, 2], [0, 2]):
    cv_results = cross_validate(
        RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=0),
        X[:, feature_set],
        y,
        scoring="r2",
        cv=2,
        return_estimator=True,
    )
    print(f"R^2 using features {feature_set}: {cv_results['test_score'].mean():.3f}")

    # One importance vector per fold; entries follow the order of feature_set.
    for rf in cv_results["estimator"]:
        print(rf.feature_importances_.round(3))
# %% [markdown]
#
# Any individual feature on its own leads to a poor $R^2$ because the target
# cannot be written as a function of any single feature.
# %%
# Repeat the experiment with each observed feature taken alone.
for i in range(3):
    cv_results = cross_validate(
        RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=0),
        # Index with a list so the input stays 2D with a single column.
        X[:, [i]],
        y,
        scoring="r2",
        cv=2,
        return_estimator=True,
    )
    print(f"R^2 using feature {i}: {cv_results['test_score'].mean():.3f}")

    # One importance vector (of length 1 here) per cross-validation fold.
    for rf in cv_results["estimator"]:
        print(rf.feature_importances_.round(3))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment