Skip to content

Instantly share code, notes, and snippets.

@ogrisel
Last active September 25, 2025 15:25
Show Gist options
  • Save ogrisel/936d5f455969fef19629d89df4e1db54 to your computer and use it in GitHub Desktop.
Non-unique Markov blanket example.
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
# %%
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_validate
rng = np.random.default_rng(0)
Z = rng.random((100_000, 2))
# The target is the sum of the two latent variables.
y = Z.sum(axis=1)
# The observed features are quadratic polynomials of the latent variables.
X = np.hstack([Z**2, Z[:, [0]] * Z[:, [1]]])
assert X.shape == (100_000, 3)
# %% [markdown]
#
# Using all features leads to a near perfect $R^2$ because all
# the target variance can be explained by the information contained in the
# features (and the RF model is expressive enough, and the sample size is
# large enough).
# %%
# Cross-validate a random forest trained on all 3 observed features.
# cv=2 keeps the run cheap given 100k samples; n_jobs=-1 parallelizes
# tree fitting across cores.
cv_results = cross_validate(
    RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=0),
    X,
    y,
    scoring="r2",
    return_estimator=True,
    cv=2,
)
print(f"R^2 using all 3 features: {cv_results['test_score'].mean():.3f}")
# return_estimator=True keeps the fitted forests so we can inspect how the
# importance mass is spread across the (mutually redundant) features.
for rf in cv_results['estimator']:
    print(rf.feature_importances_.round(3))
# %% [markdown]
#
# Using any pair of 2 features out of 3 leads to a near perfect $R^2$ because
# all the target variance can be explained by the information contained in any pair of
# features (and the RF model is expressive enough, and the sample size is large enough).
# %%
for feature_set in ([0, 1], [1, 2], [0, 2]):
cv_results = cross_validate(
RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=0),
X[:, feature_set],
y,
scoring="r2",
cv=2,
return_estimator=True,
)
print(f"R^2 using features {feature_set}: {cv_results['test_score'].mean():.3f}")
for rf in cv_results['estimator']:
print(rf.feature_importances_.round(3))
# %% [markdown]
#
# Any choice of individual feature leads to a poor $R^2$ because
# the target cannot be written as a function of any individual feature.
# %%
for i in range(3):
cv_results = cross_validate(
RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=0),
X[:, [i]],
y,
scoring="r2",
cv=2,
return_estimator=True,
)
print(f"R^2 using feature {i}: {cv_results['test_score'].mean():.3f}")
for rf in cv_results['estimator']:
print(rf.feature_importances_.round(3))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment