Very basic propensity score matching and IPTW analysis with balance diagnostics
# *-----------------------------------------------------------------
# | PROGRAM NAME: ex matching.py
# | DATE: 6/25/21
# | CREATED BY: MATT BOGARD
# | PROJECT FILE:
# *----------------------------------------------------------------
# | PURPOSE: very basic matching and IPTW analysis with balance diagnostics
# *----------------------------------------------------------------
# see: https://stats.stackexchange.com/questions/206832/matched-pairs-in-python-propensity-score-matching
# see also: https://gist.github.com/BioSciEconomist/826556977841cbf966f03542b2be0c55
# https://gist.github.com/BioSciEconomist/d97a720c8b58fded158b9cb51acb5a70
# see also: https://medium.com/@bmiroglio/introducing-the-pymatch-package-6a8c020e2009

# !!!!!!! WARNING - THIS IMPLEMENTATION OF MATCHING HAS A LOT OF PROBLEMS:
# IT ESSENTIALLY MATCHES WITH REPLACEMENT AND HAS NO WAY OF SPECIFYING CALIPERS.
# That said, results are not so different from R MatchIt when matching with replacement.
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
def get_matching_pairs(treated_df, non_treated_df, scaler=True):
    treated_x = treated_df.values
    non_treated_x = non_treated_df.values
    if scaler:
        # standardize both groups on the treated group's scale
        scaler = StandardScaler().fit(treated_x)
        treated_x = scaler.transform(treated_x)
        non_treated_x = scaler.transform(non_treated_x)
    # 1-nearest-neighbor match for each treated unit (implicitly with replacement)
    nbrs = NearestNeighbors(n_neighbors=1, algorithm='ball_tree').fit(non_treated_x)
    distances, indices = nbrs.kneighbors(treated_x)
    indices = indices.reshape(indices.shape[0])
    matched = non_treated_df.iloc[indices]
    return matched
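# note: the matched rows keep their original index labels from non_treated_df,
# which is what lets us recover control IDs after matching (see below)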
# read lalonde data
df = pd.read_csv("/Users/mattbogard/Google Drive/Data/lalonde.csv")
df['ID'] = range(len(df))  # create ID (614 rows in this version of lalonde)
# create propensity score
import statsmodels.api as sm  # import statsmodels
import statsmodels.formula.api as smf  # this allows us to use an explicit formula

model = smf.glm('treat ~ age + educ + race + married + nodegree + re74 + re75',
                data=df, family=sm.families.Binomial())  # logit is the default link
result = model.fit()
result.summary()

# add propensity scores
df['ps'] = result.fittedvalues
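# note: for a GLM, result.fittedvalues is the predicted mean of the response,
# i.e. the predicted probability of treatment here, so 'ps' is already on the 0-1 scale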
# explore
# df.ps.plot(kind='hist', density=True, bins=30)  # ('normed' is deprecated in matplotlib)
df.ps.describe()

# assess common support
df.boxplot(column='ps', by='treat', rot=90)
df.boxplot(column='ps', by='married', rot=90)
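# an alternative look at overlap (a sketch, not in the original): overlaid
# histograms of the propensity score by treatment group
# df.groupby('treat')['ps'].plot(kind='hist', bins=30, alpha=0.5, density=True, legend=True)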
# data prep for matching
mask = df['treat'] == 1
trt = df[mask]
trt = trt[['ps']]

mask = df['treat'] == 0
ctrl = df[mask]
ctrl = ctrl[['ps']]

matched_df = get_matching_pairs(trt, ctrl)
#
# TO DO: modify algo to allow caliper based matching
#

# note: the draft below passes radius=caliper, but kneighbors() ignores the
# radius argument (radius only applies to radius_neighbors queries), so this
# does NOT actually enforce a caliper as written

# def get_matching_pairs(treated_df, non_treated_df, caliper, scaler=True):
#     treated_x = treated_df.values
#     non_treated_x = non_treated_df.values
#     if scaler:
#         scaler = StandardScaler().fit(treated_x)
#         treated_x = scaler.transform(treated_x)
#         non_treated_x = scaler.transform(non_treated_x)
#     nbrs = NearestNeighbors(n_neighbors=1, radius=caliper, algorithm='ball_tree').fit(non_treated_x)
#     distances, indices = nbrs.kneighbors(treated_x)
#     indices = indices.reshape(indices.shape[0])
#     matched = non_treated_df.iloc[indices]
#     return matched

# df.ps.std()
# matched_df = get_matching_pairs(trt, ctrl, 1)
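# a minimal working sketch of caliper matching (my addition, not the original
# author's): do 1-NN matching as above, then drop treated units whose nearest
# control is farther than the caliper (in SD units after standardization)
def get_matching_pairs_caliper(treated_df, non_treated_df, caliper, scaler=True):
    treated_x = treated_df.values
    non_treated_x = non_treated_df.values
    if scaler:
        scaler = StandardScaler().fit(treated_x)
        treated_x = scaler.transform(treated_x)
        non_treated_x = scaler.transform(non_treated_x)
    nbrs = NearestNeighbors(n_neighbors=1, algorithm='ball_tree').fit(non_treated_x)
    distances, indices = nbrs.kneighbors(treated_x)
    keep = distances.reshape(-1) <= caliper  # enforce the caliper here
    matched = non_treated_df.iloc[indices.reshape(-1)[keep]]
    return matched, keep  # 'keep' flags which treated rows retained a match

# example: a caliper of 0.25 SD on the standardized propensity score
# matched_df2, keep = get_matching_pairs_caliper(trt, ctrl, 0.25)
# trt_matched = trt[keep]  # treated rows that found a match within the caliper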
#
# stack data
#

df_match = pd.concat([trt, matched_df])

# convert index back to ID
df_match['ID'] = df_match.index

# add outcomes and matching variables
xvars = ['ID', 'treat', 'age', 'educ', 'race', 'married', 'nodegree', 're74',
         're75', 're78']
df_match = df_match.merge(df[xvars], on=['ID'], how='left')

# check
df_match.groupby(['treat']).size().reset_index(name='count')

# double check
chk = df_match.merge(df[['ID', 'ps', 'treat']], on=['ID'], how='left')

# check duplicates
tmp = df_match.groupby(['ID']).size().reset_index(name='count')  # count duplicates
tmp = tmp.sort_values(['count'], ascending=[False])  # sort
print(tmp)  # check

# the duplicates here verify this amounts to matching with replacement
#--------------------------------
# balance diagnostics
#-------------------------------

#
# before matching
#

# create copy of analysis file with covariates of interest
tmp = df[['treat', 'age', 'educ', 'race', 'married', 'nodegree', 're74', 're75']]
tmp = pd.get_dummies(tmp, columns=['race'])
# test group means
mask = tmp['treat'] == 1
tmp1A = tmp[mask]
tmp2A = pd.DataFrame(tmp1A.mean())
tmp2A.reset_index(inplace=True)
tmp2A.columns = ['varname', 'Mean A']

# control group means
mask = tmp['treat'] == 0
tmp1B = tmp[mask]
tmp2B = pd.DataFrame(tmp1B.mean())
tmp2B.reset_index(inplace=True)
tmp2B.columns = ['varname', 'Mean B']

# bring data together in a wide file
tmp3 = pd.merge(tmp2A, tmp2B[['varname', 'Mean B']], on='varname', how='left')
tmp3['diff'] = tmp3['Mean B'] - tmp3['Mean A']
# note: according to Ho et al.'s article (http://imai.princeton.edu/research/files/matchit.pdf)
# "the standardize = TRUE option will print out standardized versions of the balance measures,
# where the mean difference is standardized (divided) by the standard deviation in the original
# treated group."
# test group sd
mask = tmp['treat'] == 1
tmp1A = tmp[mask]
tmp2A = pd.DataFrame(tmp1A.std())
tmp2A.reset_index(inplace=True)
tmp2A.columns = ['varname', 'StdDev A']

# add sd
tmp4 = pd.merge(tmp3, tmp2A[['varname', 'StdDev A']], on='varname', how='left')

# calculate absolute standardized mean difference
tmp4['ASMD'] = abs(tmp4['diff'] / tmp4['StdDev A'])

balance_unmatched = tmp4
#
# after matching
#

# create copy of analysis file with covariates of interest
tmp = df_match[['treat', 'age', 'educ', 'race', 'married', 'nodegree', 're74', 're75']]
tmp = pd.get_dummies(tmp, columns=['race'])  # dummy code race as above (needed for .mean())
# test group means
mask = tmp['treat'] == 1
tmp1A = tmp[mask]
tmp2A = pd.DataFrame(tmp1A.mean())
tmp2A.reset_index(inplace=True)
tmp2A.columns = ['varname', 'Mean A']

# control group means
mask = tmp['treat'] == 0
tmp1B = tmp[mask]
tmp2B = pd.DataFrame(tmp1B.mean())
tmp2B.reset_index(inplace=True)
tmp2B.columns = ['varname', 'Mean B']

tmp3 = pd.merge(tmp2A, tmp2B[['varname', 'Mean B']], on='varname', how='left')
tmp3['diff'] = tmp3['Mean B'] - tmp3['Mean A']
# note: according to Ho et al.'s article (http://imai.princeton.edu/research/files/matchit.pdf)
# "the standardize = TRUE option will print out standardized versions of the balance measures,
# where the mean difference is standardized (divided) by the standard deviation in the original
# treated group."
# test group sd (this is the original unmatched treatment group data)
# create copy of original analysis file with covariates of interest
tmp = df[['treat', 'age', 'educ', 'race', 'married', 'nodegree', 're74', 're75']]
tmp = pd.get_dummies(tmp, columns=['race'])  # dummy code race so varnames line up

mask = tmp['treat'] == 1
tmp1A = tmp[mask]
tmp2A = pd.DataFrame(tmp1A.std())
tmp2A.reset_index(inplace=True)
tmp2A.columns = ['varname', 'StdDev A']

tmp4 = pd.merge(tmp3, tmp2A[['varname', 'StdDev A']], on='varname', how='left')
tmp4['ASMD'] = abs(tmp4['diff'] / tmp4['StdDev A'])

balance_matched = tmp4
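# the before/after blocks above repeat the same steps; a compact equivalent
# helper (my sketch, not part of the original gist) that standardizes by the
# SD of the original treated group per the Ho et al. note:
def asmd_table(data, covars, sd_source):
    d = pd.get_dummies(data[['treat'] + covars], columns=['race'])
    s = pd.get_dummies(sd_source[['treat'] + covars], columns=['race'])
    out = pd.DataFrame({'Mean A': d[d.treat == 1].mean(),
                        'Mean B': d[d.treat == 0].mean()})
    out['diff'] = out['Mean B'] - out['Mean A']
    out['ASMD'] = (out['diff'] / s[s.treat == 1].std()).abs()
    return out.drop(index='treat')

# covars = ['age', 'educ', 'race', 'married', 'nodegree', 're74', 're75']
# balance_unmatched = asmd_table(df, covars, sd_source=df)
# balance_matched = asmd_table(df_match, covars, sd_source=df)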
#-------------------------------
# estimation of treatment effects
#--------------------------------

# regression using smf
results = smf.ols('re78 ~ treat', data=df).fit()
results.summary()  # b_est = -635

# matched results
results = smf.ols('re78 ~ treat', data=df_match).fit()
results.summary()  # b_est = 1788 - this is much higher than the ~1200 we get from R MatchIt
#-------------------------------
# compare to regression with controls
#--------------------------------

# regression using smf
results = smf.ols('re78 ~ treat + age + educ + race + married + nodegree + re74 + re75', data=df).fit()
results.summary()  # b_est = 1548
#-----------------------------------
# compare to IPTW regression
#----------------------------------

# trim extreme propensity scores before weighting
perc = [.10, .90]
df.ps.describe(percentiles=perc)

tmp = df.copy()  # use a copy so the trimming below doesn't overwrite df['ps'] in place
tmp['ps'] = np.where(tmp['ps'] < .025, .025, tmp.ps)
tmp['ps'] = np.where(tmp['ps'] > .72, .72, tmp.ps)

# check
tmp.groupby(['treat']).size().reset_index(name='count')
tmp.ps.describe()

# calculate propensity score weights for ATT (Austin, 2011):
# treated units get weight 1; controls get ps/(1-ps)
tmp['wt'] = np.where(tmp['treat'] == 1, 1, tmp.ps / (1 - tmp.ps))
tmp.wt.describe()  # check
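# for reference (my aside, not in the original): the corresponding ATE weights
# would be 1/ps for treated and 1/(1-ps) for controls
# tmp['wt_ate'] = np.where(tmp['treat'] == 1, 1/tmp.ps, 1/(1 - tmp.ps))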
tmp = pd.get_dummies(tmp, columns=['race'])
# create weighted data frame (controls are weighted up by 'wt' to look like treatments)
# from statsmodels.stats.weightstats import DescrStatsW
# wdf = DescrStatsW(tmp, weights=tmp.wt, ddof=1)

# descriptives/balance with weighted data
trt = tmp[tmp.treat == 1]
ctrl = tmp[tmp.treat == 0]

np.average(trt.race_black, weights=trt.wt)
np.average(ctrl.race_black, weights=ctrl.wt)

np.average(trt.re74, weights=trt.wt)
np.average(ctrl.re74, weights=ctrl.wt)

np.average(trt.age, weights=trt.wt)
np.average(ctrl.age, weights=ctrl.wt)
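# a more compact check across covariates (sketch):
# for v in ['race_black', 're74', 'age', 'married', 'nodegree']:
#     print(v, np.average(trt[v], weights=trt.wt), np.average(ctrl[v], weights=ctrl.wt))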
# iptw regression
iptwResults = smf.wls('re78 ~ treat', data=tmp, weights=tmp.wt).fit()
iptwResults.summary()  # b_est = 1193.77

# this result is closer to the R MatchIt matched estimate than
# the matched analysis above is
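# note (my aside, not in the original): these WLS standard errors treat the
# weights as known; a common fix is a robust/sandwich covariance, e.g.
# iptwResults = smf.wls('re78 ~ treat', data=tmp, weights=tmp.wt).fit(cov_type='HC3')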
#----------------------------------
# try pymatch
#----------------------------------

# https://medium.com/@bmiroglio/introducing-the-pymatch-package-6a8c020e2009

from pymatch.Matcher import Matcher

trt = df[df.treat == 1]
ctrl = df[df.treat == 0]

m = Matcher(trt, ctrl, yvar="re78", exclude=['ID'])

# this package starts to get buggy very fast
m.predict_scores()
m.tune_threshold(method='random')
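# the remaining pymatch steps would look like the following, per the Medium
# article linked above (left untested here given the bugginess noted):
# m.match(method="min", nmatches=1, threshold=0.0001)
# m.record_frequency()       # how often each control is re-used
# m.assign_weight_vector()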
#----------------------------------
# try causalinference package
#----------------------------------

# https://matheusfacure.github.io/python-causality-handbook/11-Propensity-Score.html

from causalinference import CausalModel
import graphviz as gr

g = gr.Digraph()
g.edge("T", "Y")
g.edge("X", "Y")
g.edge("X", "P(x)")
g.edge("P(x)", "T")
g
# fit propensity score model
model = smf.glm('treat ~ age + educ + race + married + nodegree + re74 + re75',
                data=df, family=sm.families.Binomial())  # logit is the default link
ps_model = model.fit()
ps_model.summary()

# add propensity scores
df['ps'] = ps_model.fittedvalues
df[["treat", "educ", "race", "ps"]].head()  # check
# ps matching
cm = CausalModel(
    Y=df["re78"].values,
    D=df["treat"].values,
    X=df[["ps"]].values
)

cm.est_via_matching(matches=1, bias_adj=True)
print(cm.estimates)  # the ATT is very similar to the poorly implemented nearest neighbors matching above
# documentation:
# est_via_matching(self, weights='inv', matches=1, bias_adj=False)
# Estimates average treatment effects using nearest-neighborhood matching.
# Matching is done with replacement. Method supports multiple matching.
# Correcting bias that arises due to imperfect matches is also supported.
# For details on methodology, see [1].
#
# Parameters
# weights: str or positive definite square matrix
#   Specifies weighting matrix used in computing distance measures. Defaults
#   to string 'inv', which does inverse variance weighting. String 'maha'
#   gives the weighting matrix used in the Mahalanobis metric.
# matches: int
#   Number of matches to use for each subject.
# bias_adj: bool
#   Specifies whether bias adjustments should be attempted.
#
# References
# [1] Imbens, G. & Rubin, D. (2015). Causal Inference in Statistics, Social,
#     and Biomedical Sciences: An Introduction.