Skip to content

Instantly share code, notes, and snippets.

@sergeyf
Last active December 18, 2017 21:43
Show Gist options
  • Save sergeyf/4bb4b3141fc0ac583ddfa929a9da19f8 to your computer and use it in GitHub Desktop.
Save sergeyf/4bb4b3141fc0ac583ddfa929a9da19f8 to your computer and use it in GitHub Desktop.
'''
References:
https://medium.com/teconomics-blog/using-ml-to-resolve-experiments-faster-bd8053ff602e
https://insightr.wordpress.com/2017/06/28/cross-fitting-double-machine-learning-estimator/
https://arxiv.org/pdf/1608.00060.pdf
'''
import numpy as np
from sklearn.linear_model import LassoCV, LinearRegression, BayesianRidge, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_predict
# make some data
# ref: https://arxiv.org/pdf/1712.04912.pdf
def friedman_function(n=500, d=6, sigma=0.1):
assert d >= 5
X = np.random.random(size=(n, d))
noise = np.random.randn(n)
b = 10*np.sin(np.pi * X[:, 0] * X[:, 1]) + 20*(X[:, 2] - 0.5)**2 + 10*X[:, 3] + 5*X[:, 4]
e = np.sin(np.pi * X[:, 0] * X[:, 1])
tau = (X[:, 0] + X[:, 1]) / 2.0
T = np.random.binomial(n=1, p=e)
y = b + (T - 0.5)*tau + sigma*noise
return X, y, T, tau
X, y, T, tau = friedman_function(n=500, d=6, sigma=0.1)
'''
Approach 1: Double Selection with LassoCV
'''
lasso_y = LassoCV(normalize=True, cv=10).fit(X, y)
H = lasso_y.coef_.nonzero()[0]
lasso_T = LassoCV(normalize=True, cv=10).fit(X, T)
K = lasso_T.coef_.nonzero()[0]
# get union of H and K
H_union_K = np.sort(list(set(H).union(set(K))))
X_sub = np.hstack((X[:, H_union_K], T[:, np.newaxis]))
theta_est_ds = LinearRegression().fit(X_sub, y).coef_[-1]
'''
Approach 2: Double Machine Learning with Random Forest
'''
def get_theta_est(X_train, y_train, T_train, X_test, y_test, T_test, regressor, classifier):
# ref: https://arxiv.org/pdf/1608.00060.pdf
y_test_pred = regressor.fit(X_train, y_train).predict(X_test)
T_test_pred = classifier.fit(X_train, T_train).predict_proba(X_test)[:, 1] # w - p(w=1|x)
# eq (1.5)
V_hat = T_test - T_test_pred
theta_est = np.mean(V_hat * (y_test - y_test_pred)) / np.mean(V_hat * T_test)
return theta_est
regressor = RandomForestRegressor(n_estimators=100)
classifier = RandomForestClassifier(n_estimators=100)
splitter = KFold(n_splits=10, shuffle=True)
theta_ests = []
for tr, ts in splitter.split(X):
theta_est = get_theta_est(X[tr, :], y[tr], T[tr], X[ts, :], y[ts], T[ts], regressor, classifier)
theta_ests.append(theta_est)
theta_est_dml = np.mean(theta_ests)
print('Average ITE', np.mean(tau))
print('Estimate of treatment effect using double selection with LassoCV:', theta_est_ds)
print('Estimate of treatment effect using double machine learning with Random Forest:', theta_est_dml)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment