Matteo Courthoud (matteocourthoud)
import numpy as np

# Placebo test: fit a synthetic control for every city and compare its fit
# before and after the treatment year
lambdas = {}
for city in cities:
    mse_pre = synth_predict(df, SyntheticControl(), city, treatment_year).mse   # pre-treatment error
    mse_tot = np.mean((df[f'Synthetic {city}'] - df[city])**2)                  # error over the full sample
    lambdas[city] = (mse_tot - mse_pre) / mse_pre                               # relative deterioration of fit

# p-value: share of cities whose fit deteriorates more than the treated city's
print(f"p-value: {np.mean(np.fromiter(lambdas.values(), dtype='float') > lambdas[treated_city]):.4}")
import seaborn as sns
import matplotlib.pyplot as plt

# Plot the gap between the observed series and its synthetic control
def plot_difference(df, city, year, vline=True, hline=True, **kwargs):
    sns.lineplot(x=df['year'], y=df[city] - df[f'Synthetic {city}'], **kwargs)
    if vline:
        plt.axvline(x=year, ls=":", color='C2', label='Self-driving cars', lw=3, zorder=100)
        plt.legend()
    if hline: sns.lineplot(x=df['year'], y=0, lw=3, color='k', zorder=1)
    plt.title("Estimated effect of self-driving cars");
from toolz import partial
from scipy.optimize import fmin_slsqp

class SyntheticControl():

    # Loss function: RMSE of the weighted combination of control units
    def loss(self, W, X, y) -> float:
        return np.sqrt(np.mean((y - X.dot(W))**2))
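    # NOTE: the preview above stops at the loss function, but the rest of this page calls
    # model.fit(X, y).predict(...) and reads a .mse attribute, so the class needs at least the
    # two methods below. They are a hedged reconstruction (weights constrained to be
    # non-negative and sum to one, optimized with fmin_slsqp), not necessarily the author's code.
    def fit(self, X, y):
        w0 = np.full(X.shape[1], 1 / X.shape[1])                   # start from equal weights
        self.W = fmin_slsqp(partial(self.loss, X=X, y=y), w0,
                            f_eqcons=lambda w: np.sum(w) - 1,      # weights sum to one
                            bounds=[(0.0, 1.0)] * X.shape[1],      # weights in [0, 1]
                            disp=False)
        self.mse = np.mean((y - X.dot(self.W))**2)                 # pre-treatment MSE, used by the placebo test
        return self

    def predict(self, X):
        return X.dot(self.W)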
# Fit model on pre-treatment years and generate the synthetic (counterfactual) series
def synth_predict(df, model, city, year):
    other_cities = [c for c in cities if c not in ['year', city]]
    y = df.loc[df['year'] <= year, city]               # treated unit, pre-treatment
    X = df.loc[df['year'] <= year, other_cities]       # donor pool, pre-treatment
    df[f'Synthetic {city}'] = model.fit(X, y).predict(df[other_cities])
    return model
# Plot the observed and synthetic series around the treatment year
def plot_lines(df, line1, line2, year, hline=True):
    sns.lineplot(x=df['year'], y=df[line1].values, label=line1)
    sns.lineplot(x=df['year'], y=df[line2].values, label=line2)
    plt.axvline(x=year, ls=":", color='C2', label='Self-Driving Cars', zorder=1)
    plt.legend();
    plt.title("Average revenue per day (in M$)");
import pandas as pd
from joblib import Parallel, delayed

# Simulate the three estimators over 100 draws of the data generating process
def simulate_estimators(X_e, X_mu, D, y):
    r = Parallel(n_jobs=8)(delayed(compare_estimators)(X_e, X_mu, D, y, i) for i in range(100))
    df_tau = pd.DataFrame(r, columns=['S-learn', 'IPW', 'AIPW'])
    plot = sns.boxplot(data=pd.melt(df_tau), x='variable', y='value', linewidth=2);
    plot.set(title=r"Distribution of $\hat{\tau}$ and its components", xlabel='', ylabel='')
    plot.axhline(2, c='r', ls=':');   # reference line at the simulated true effect
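A call like the one below runs the simulation; the covariate, treatment, and outcome names are hypothetical placeholders for whatever dgp_darkmode() actually generates, not names taken from the gists.

# Hypothetical column names
simulate_estimators(X_e=['male', 'age', 'hours'], X_mu=['male', 'age', 'hours'],
                    D='dark_mode', y='read_time')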
from sklearn.linear_model import LinearRegression, LogisticRegression

# One simulation draw: S-learner, IPW and AIPW estimates of the average treatment effect
def compare_estimators(X_e, X_mu, D, y, seed):
    df = dgp_darkmode().generate_data(seed=seed)
    e = estimate_e(df, X_e, D, LogisticRegression())               # propensity scores
    mu0, mu1 = estimate_mu(df, X_mu, D, y, LinearRegression())     # predicted potential outcomes
    slearn = mu1 - mu0
    ipw = (df[D] / e - (1-df[D]) / (1-e)) * df[y]
    aipw = slearn + df[D] / e * (df[y] - mu1) - (1-df[D]) / (1-e) * (df[y] - mu0)
    return np.mean((slearn, ipw, aipw), axis=1)
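estimate_e and estimate_mu are not shown on this page. The sketch below is a reconstruction consistent with how they are called above: propensity scores from predict_proba, and potential-outcome predictions from a single outcome model with the treatment as a regressor. It is an assumption, not the author's code.

def estimate_e(df, X, D, model):
    # Propensity score P(D = 1 | X)
    return model.fit(df[X], df[D]).predict_proba(df[X])[:, 1]

def estimate_mu(df, X, D, y, model):
    # Single outcome model with the treatment as a feature, evaluated at D = 0 and D = 1
    mu = model.fit(df[X + [D]], df[y])
    mu0 = mu.predict(df[X + [D]].assign(**{D: 0}))
    mu1 = mu.predict(df[X + [D]].assign(**{D: 1}))
    return mu0, mu1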
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LogisticRegressionCV

# X-learner: first-stage outcome models, one per treatment arm
def X_learner(dgp, model, y, D, X):
    temp = dgp.generate_data(true_te=True).sort_values(X)
    # Mu
    mu0 = model.fit(temp.loc[temp[D]==0, X], temp.loc[temp[D]==0, y])
    temp['mu0_hat_'] = mu0.predict(temp[X])
    mu1 = model.fit(temp.loc[temp[D]==1, X], temp.loc[temp[D]==1, y])
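    # NOTE: the preview stops above; the continuation below is a hedged sketch following the
    # standard X-learner recipe (second-stage models on imputed treatment effects, then
    # propensity weighting via the LogisticRegressionCV imported earlier). The 'tau_hat'
    # column and the final plot_TE call are assumptions, not the author's code.
    temp['mu1_hat_'] = mu1.predict(temp[X])
    # Tau: second stage, fit on imputed individual treatment effects
    # (predictions are stored right after each fit because the same `model` object is reused)
    tau0 = model.fit(temp.loc[temp[D]==0, X], temp.loc[temp[D]==0, 'mu1_hat_'] - temp.loc[temp[D]==0, y])
    temp['tau0_hat'] = tau0.predict(temp[X])
    tau1 = model.fit(temp.loc[temp[D]==1, X], temp.loc[temp[D]==1, y] - temp.loc[temp[D]==1, 'mu0_hat_'])
    temp['tau1_hat'] = tau1.predict(temp[X])
    # Combine the two CATE estimates, weighted by the propensity score
    e = LogisticRegressionCV().fit(temp[X], temp[D]).predict_proba(temp[X])[:, 1]
    temp['tau_hat'] = e * temp['tau0_hat'] + (1 - e) * temp['tau1_hat']
    plot_TE(temp, true_te=True)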
# T-learner: two outcome models, one fit on the control group and one on the treated group
def T_learner(dgp, model, y, D, X):
    temp = dgp.generate_data(true_te=True).sort_values(X)
    mu0 = model.fit(temp.loc[temp[D]==0, X], temp.loc[temp[D]==0, y])
    temp['mu0_hat'] = mu0.predict(temp[X])
    mu1 = model.fit(temp.loc[temp[D]==1, X], temp.loc[temp[D]==1, y])
    temp['mu1_hat'] = mu1.predict(temp[X])
    plot_TE(temp, true_te=True)
# S-learner: a single outcome model with the treatment indicator as a regressor
def S_learner(dgp, model, y, D, X):
    temp = dgp.generate_data(true_te=True).sort_values(X)
    mu = model.fit(temp[X + [D]], temp[y])
    temp['mu0_hat'] = mu.predict(temp[X + [D]].assign(premium=0))   # assumes the treatment column D is 'premium'
    temp['mu1_hat'] = mu.predict(temp[X + [D]].assign(premium=1))
    plot_TE(temp, true_te=True)
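plot_TE is called throughout but not shown on this page. The sketch below is one plausible version, assuming the data is sorted by a single covariate column named 'x' and that generate_data(true_te=True) stores the true effect in a column named 'true_te'; both names are hypothetical, not taken from the gists.

# Hedged sketch; 'x' and 'true_te' are hypothetical column names
def plot_TE(temp, true_te=False):
    est = temp['tau_hat'] if 'tau_hat' in temp else temp['mu1_hat'] - temp['mu0_hat']
    sns.lineplot(x=temp['x'], y=est, label=r'$\hat{\tau}(x)$')
    if true_te:
        sns.lineplot(x=temp['x'], y=temp['true_te'], label=r'$\tau(x)$', color='k', ls='--')
    plt.legend()
    plt.title("Estimated vs true treatment effect");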