Skip to content

Instantly share code, notes, and snippets.

@ogrisel
Last active September 25, 2025 15:25
Show Gist options
  • Save ogrisel/936d5f455969fef19629d89df4e1db54 to your computer and use it in GitHub Desktop.
Non-unique Markov blanket example.
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
# %%
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_validate
rng = np.random.default_rng(0)
Z = rng.random((100_000, 2))
# The target is the sum of the two latent variables.
y = Z.sum(axis=1)
# The observed features are quadratic polynomials of the latent variables.
X = np.hstack([Z**2, Z[:, [0]] * Z[:, [1]]])
assert X.shape == (100_000, 3)
# %% [markdown]
#
# Using all features leads to a near perfect $R^2$ because all
# the target variance can be explained by the information contained in the
# features (and the RF model is expressive enough, and the sample size is
# large enough).
# %%
# Cross-validate a random forest trained on all 3 observed features.
# cv=2 keeps the run cheap given 100k samples; n_jobs=-1 parallelizes
# tree fitting across cores.
cv_results = cross_validate(
    RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=0),
    X,
    y,
    scoring="r2",
    return_estimator=True,
    cv=2,
)
print(f"R^2 using all 3 features: {cv_results['test_score'].mean():.3f}")
# return_estimator=True keeps the fitted forests so we can inspect how the
# importance mass is spread across the (mutually redundant) features.
for rf in cv_results['estimator']:
    print(rf.feature_importances_.round(3))
# %% [markdown]
#
# Using any pair of 2 features out of 3 leads to a near perfect $R^2$ because
# all the target variance can be explained by the information contained in any pair of
# features (and the RF model is expressive enough, and the sample size is large enough).
# %%
for feature_set in ([0, 1], [1, 2], [0, 2]):
cv_results = cross_validate(
RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=0),
X[:, feature_set],
y,
scoring="r2",
cv=2,
return_estimator=True,
)
print(f"R^2 using features {feature_set}: {cv_results['test_score'].mean():.3f}")
for rf in cv_results['estimator']:
print(rf.feature_importances_.round(3))
# %% [markdown]
#
# Any choice of individual feature leads to a poor $R^2$ because
# the target cannot be written as a function of any individual feature.
# %%
for i in range(3):
cv_results = cross_validate(
RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=0),
X[:, [i]],
y,
scoring="r2",
cv=2,
return_estimator=True,
)
print(f"R^2 using feature {i}: {cv_results['test_score'].mean():.3f}")
for rf in cv_results['estimator']:
print(rf.feature_importances_.round(3))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment