Created
March 7, 2019 01:24
-
-
Save BioSciEconomist/6b18297700db104e1b26e0cb67967012 to your computer and use it in GitHub Desktop.
Example using Matchit with a custom propensity score metric
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # *----------------------------------------------------------------- | |
| # | PROGRAM NAME: MatchIt custom distance.R | |
| # | DATE: 3/6/19 | |
| # | CREATED BY: MATT BOGARD | |
| # | PROJECT FILE: | |
| # *---------------------------------------------------------------- | |
| # | PURPOSE: demo of R matchit based on: Matching as Nonparametric Preprocessing for | |
| # | Reducing Model Dependence in Parametric Causal Inference by Ho,Imai,King,and Stuart | |
| # | Political Analysis (2007) 15:199-236 See also: http://www.biostat.jhsph.edu/~estuart/ | |
| # *---------------------------------------------------------------- | |
| # rm(list=ls()) # get rid of any existing data | |
| # Data: lalonde - used by Dehejia and Wahba (1999) to evaluate propensity score matching. | |
| # Ref: Dehejia, Rajeev and Sadek Wahba. 1999.`Causal Effects in Non-Experimental Studies: Re-Evaluating | |
| # the Evaluation of Training Programs.'' Journal of the American Statistical Association 94 (448): | |
| # 1053-1062 | |
| # Descriptions: | |
| # treat - an indicator variable for treatment status. | |
| # age - age in years. | |
| # educ - years of schooling. | |
| # black - indicator variable for blacks. | |
| # hisp- indicator variable for Hispanics. | |
| # married - indicator variable for martial status. | |
| # nodegr - indicator variable for high school diploma. | |
| # re74 - real earnings in 1974. | |
| # re75 - real earnings in 1975. | |
| # re78 - real earnings in 1978. | |
| # settings and packages | |
| options("scipen" =100, "digits" = 4) # override R's tendency to use scientific notation | |
| library(MatchIt) | |
| library(dplyr) | |
| #----------------------------- | |
| # data loading and customization | |
| #----------------------------- | |
| data("lalonde") # load data | |
| head(lalonde) # check out first few rows | |
| df <- lalonde # resave data frame to play with | |
| ### lets do a little customization to this demo data set | |
| # convert index to column - because I like my unique identifiers to actually be a column | |
| mbrid <- rownames(df) # extract identifiers from the rowname index | |
| df <- cbind(mbrid,df) # add to dataframe as a column | |
| df$mbrid <- as.character(df$mbrid) # character conversion | |
| head(df) # check | |
| #-------------------------------------------- | |
| # assess unmatched data | |
| #------------------------------------------- | |
| # what is the raw difference in outcome re78 between treatment and controls | |
| df%>% | |
| group_by(treat) %>% # group by treatment | |
| summarize(AVGre78 = mean(re78)) | |
| # treatment group appears to have lower income in 1978 (our outcome variable) | |
| # is this difference significant | |
| summary(lm(re78 ~ treat, data = df)) | |
| # difference with controls | |
| summary(lm(re78 ~ treat + age + educ + black + hispan + married + nodegree + re74 + re75, data = df)) | |
| #---------------------------------------------- | |
| # match on propensity scores - default | |
| #--------------------------------------------- | |
| # reference model - we really don't need this this is just for reference | |
| summary(glm(treat ~ age + educ + black + hispan + married + nodegree + re74 + re75, family = "binomial", data = df)) | |
| # match data using nearest neighbor 1:1 greedy matching using logistic regression and caliper = .10 sd(ps) | |
| set.seed(1234) | |
| m.out <- matchit(treat ~ age + educ + black + hispan + married + nodegree + re74 + re75, data=df,method= "nearest", ratio=1, | |
| distance= "logit", caliper=0.10, replace= F) | |
| summary(m.out) # check balance for each iteration above to determine which approach gives the most balance | |
| # (notice the improvement of matches with caliper = .10 vs .25) | |
| summary(m.out, standardize = TRUE) | |
| #---------------------------------------------- | |
| # match on propensity scores - customized | |
| #--------------------------------------------- | |
| # estimate ps model | |
| psmodel <- glm(treat ~ age + educ + black + hispan + married + nodegree + re74 + re75, family = "binomial", data = df) | |
| pscores <- predict(psmodel, newdata = df, type = "response") # get probabilities (ps) from model | |
| hist(pscores) # view distribution | |
| # according to some of the literature we 'should' match on the logit of the ps using a caliper = .2*sd of the logit of ps | |
| # Austin, P. C. (2011), Optimal caliper widths for propensity‐score matching when estimating differences in means and | |
| # differences in proportions in observational studies. Pharmaceut. Statist., 10: 150-161. doi:10.1002/pst.433 | |
| logitps <- -log(1/pscores ─ 1) # calculate the logit of the propensity score | |
| hist(logitps) # view distribution | |
| # match using these scores | |
| set.seed(1234) | |
| m.out <- matchit(treat ~ age + educ + black + hispan + married + nodegree + re74 + re75, data=df,method= "nearest", ratio=1, | |
| distance= logitps, caliper=0.05, replace= F) | |
| summary(m.out, standardize = TRUE) # note the traditional caliper of .2SD of the logit of ps gave terrible results compared | |
| # to .05 in terms of balance | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment