BioSciEconomist · March 7, 2019 01:24
diff --git a/MatchIt custom distance.R b/MatchIt custom distance.R
 # *-----------------------------------------------------------------
 # | PROGRAM NAME: MatchIt custom distance.R
 # | DATE: 3/6/19 
 # | CREATED BY: MATT BOGARD 
 # | PROJECT FILE:         
 # *----------------------------------------------------------------
 # | PURPOSE: demo of R matchit based on: Matching as Nonparametric Preprocessing for
 # | Reducing Model Dependence in Parametric Causal Inference by Ho,Imai,King,and Stuart
 # | Political Analysis (2007) 15:199-236 See also: http://www.biostat.jhsph.edu/~estuart/
 # *----------------------------------------------------------------


 # rm(list=ls()) # get rid of any existing data 


 # Data: lalonde - used by Dehejia and Wahba (1999) to evaluate propensity score matching.

 # Ref: Dehejia, Rajeev and Sadek Wahba. 1999.`Causal Effects in Non-Experimental Studies: Re-Evaluating 
 # the Evaluation of Training Programs.'' Journal of the American Statistical Association 94 (448):
 # 1053-1062

 # Descriptions: 

 # treat - an indicator variable for treatment status.
 # age - age in years.
 # educ - years of schooling.
 # black - indicator variable for blacks.
 # hisp- indicator variable for Hispanics.
 # married - indicator variable for martial status.
 # nodegr - indicator variable for high school diploma.
 # re74 - real earnings in 1974.
 # re75 - real earnings in 1975.
 # re78  - real earnings in 1978.

 # settings and packages

 options("scipen" =100, "digits" = 4) # override R's tendency to use scientific notation

 library(MatchIt)
 library(dplyr)

 #-----------------------------
 # data loading and customization
 #-----------------------------

 data("lalonde") # load data
 head(lalonde) # check out first few rows

 df <- lalonde # resave data frame to play with

 ### lets do a little customization to this demo data set 

 # convert index to column - because I like my unique identifiers to actually be a column
 mbrid <- rownames(df) # extract identifiers from the rowname index
 df <- cbind(mbrid,df) # add to dataframe as a column
 df$mbrid <- as.character(df$mbrid) # character conversion
 head(df) # check


 #--------------------------------------------
 # assess unmatched data
 #-------------------------------------------

 # what is the raw difference in outcome re78 between treatment and controls
 df%>% 
  group_by(treat) %>%     # group by treatment
  summarize(AVGre78 = mean(re78)) 
 # treatment group appears to have lower income in 1978 (our outcome variable)

 # is this difference significant
 summary(lm(re78 ~  treat, data = df))

 # difference with controls
 summary(lm(re78 ~  treat + age + educ + black + hispan + married + nodegree + re74 + re75, data = df))



 #----------------------------------------------
 # match on propensity scores - default
 #---------------------------------------------


 # reference model  - we really don't need this this is just for reference
 summary(glm(treat ~ age + educ + black + hispan + married + nodegree + re74 + re75, family = "binomial", data = df))


 # match data using nearest neighbor 1:1 greedy matching using logistic regression and caliper = .10 sd(ps)
 set.seed(1234)
 m.out <- matchit(treat ~ age + educ + black + hispan + married + nodegree + re74 + re75, data=df,method= "nearest", ratio=1, 
                 distance= "logit", caliper=0.10, replace= F)

 summary(m.out) # check balance for each iteration above to determine which approach gives the most balance
 # (notice the improvement of matches with caliper = .10 vs .25)

 summary(m.out, standardize = TRUE)


 #----------------------------------------------
 # match on propensity scores - customized
 #---------------------------------------------

 # estimate ps model
 psmodel <- glm(treat ~ age + educ + black + hispan + married + nodegree + re74 + re75, family = "binomial", data = df)

 pscores <- predict(psmodel, newdata = df, type = "response") # get probabilities (ps) from model

 hist(pscores) # view distribution


 # according to some of the literature we 'should' match on the logit of the ps using a caliper = .2*sd of the logit of ps
 # Austin, P. C. (2011), Optimal caliper widths for propensity‐score matching when estimating differences in means and 
 # differences in proportions in observational studies. Pharmaceut. Statist., 10: 150-161. doi:10.1002/pst.433

 logitps <-  -log(1/pscores ─ 1) # calculate the logit of the propensity score
 hist(logitps) # view distribution


 # match using these scores
 set.seed(1234)
 m.out <- matchit(treat ~ age + educ + black + hispan + married + nodegree + re74 + re75, data=df,method= "nearest", ratio=1, 
                 distance= logitps, caliper=0.05, replace= F)

 summary(m.out, standardize = TRUE) # note the traditional caliper of .2SD of the logit of ps gave terrible results compared
 # to .05 in terms of balance
	# *-----------------------------------------------------------------
	# \| PROGRAM NAME: MatchIt custom distance.R
	# \| DATE: 3/6/19
	# \| CREATED BY: MATT BOGARD
	# \| PROJECT FILE:
	# *----------------------------------------------------------------
	# \| PURPOSE: demo of R matchit based on: Matching as Nonparametric Preprocessing for
	# \| Reducing Model Dependence in Parametric Causal Inference by Ho,Imai,King,and Stuart
	# \| Political Analysis (2007) 15:199-236 See also: http://www.biostat.jhsph.edu/~estuart/
	# *----------------------------------------------------------------


	# rm(list=ls()) # get rid of any existing data


	# Data: lalonde - used by Dehejia and Wahba (1999) to evaluate propensity score matching.

	# Ref: Dehejia, Rajeev and Sadek Wahba. 1999.`Causal Effects in Non-Experimental Studies: Re-Evaluating
	# the Evaluation of Training Programs.'' Journal of the American Statistical Association 94 (448):
	# 1053-1062

	# Descriptions:

	# treat - an indicator variable for treatment status.
	# age - age in years.
	# educ - years of schooling.
	# black - indicator variable for blacks.
	# hisp- indicator variable for Hispanics.
	# married - indicator variable for martial status.
	# nodegr - indicator variable for high school diploma.
	# re74 - real earnings in 1974.
	# re75 - real earnings in 1975.
	# re78 - real earnings in 1978.

	# settings and packages

	options("scipen" =100, "digits" = 4) # override R's tendency to use scientific notation

	library(MatchIt)
	library(dplyr)

	#-----------------------------
	# data loading and customization
	#-----------------------------

	data("lalonde") # load data
	head(lalonde) # check out first few rows

	df <- lalonde # resave data frame to play with

	### lets do a little customization to this demo data set

	# convert index to column - because I like my unique identifiers to actually be a column
	mbrid <- rownames(df) # extract identifiers from the rowname index
	df <- cbind(mbrid,df) # add to dataframe as a column
	df$mbrid <- as.character(df$mbrid) # character conversion
	head(df) # check


	#--------------------------------------------
	# assess unmatched data
	#-------------------------------------------

	# what is the raw difference in outcome re78 between treatment and controls
	df%>%
	group_by(treat) %>% # group by treatment
	summarize(AVGre78 = mean(re78))
	# treatment group appears to have lower income in 1978 (our outcome variable)

	# is this difference significant
	summary(lm(re78 ~ treat, data = df))

	# difference with controls
	summary(lm(re78 ~ treat + age + educ + black + hispan + married + nodegree + re74 + re75, data = df))



	#----------------------------------------------
	# match on propensity scores - default
	#---------------------------------------------


	# reference model - we really don't need this this is just for reference
	summary(glm(treat ~ age + educ + black + hispan + married + nodegree + re74 + re75, family = "binomial", data = df))


	# match data using nearest neighbor 1:1 greedy matching using logistic regression and caliper = .10 sd(ps)
	set.seed(1234)
	m.out <- matchit(treat ~ age + educ + black + hispan + married + nodegree + re74 + re75, data=df,method= "nearest", ratio=1,
	distance= "logit", caliper=0.10, replace= F)

	summary(m.out) # check balance for each iteration above to determine which approach gives the most balance
	# (notice the improvement of matches with caliper = .10 vs .25)

	summary(m.out, standardize = TRUE)


	#----------------------------------------------
	# match on propensity scores - customized
	#---------------------------------------------

	# estimate ps model
	psmodel <- glm(treat ~ age + educ + black + hispan + married + nodegree + re74 + re75, family = "binomial", data = df)

	pscores <- predict(psmodel, newdata = df, type = "response") # get probabilities (ps) from model

	hist(pscores) # view distribution


	# according to some of the literature we 'should' match on the logit of the ps using a caliper = .2*sd of the logit of ps
	# Austin, P. C. (2011), Optimal caliper widths for propensity‐score matching when estimating differences in means and
	# differences in proportions in observational studies. Pharmaceut. Statist., 10: 150-161. doi:10.1002/pst.433

	logitps <- -log(1/pscores ─ 1) # calculate the logit of the propensity score
	hist(logitps) # view distribution


	# match using these scores
	set.seed(1234)
	m.out <- matchit(treat ~ age + educ + black + hispan + married + nodegree + re74 + re75, data=df,method= "nearest", ratio=1,
	distance= logitps, caliper=0.05, replace= F)

	summary(m.out, standardize = TRUE) # note the traditional caliper of .2SD of the logit of ps gave terrible results compared
	# to .05 in terms of balance
No results found