Created
March 5, 2018 19:35
-
-
Save conormm/5adce356ca98b94ad381cc8118f27f22 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Inspired by these tweets:
# The idea is very simple. You want to compare two groups' outcomes on some metric y.
# But the two groups are quite different in their covariates X.
# One approach is to predict y given X using a random forest,
# saving the proximity matrix. The (i, j)th entry in this matrix is the
# proportion of terminal nodes shared by observations i and j.
# That is, "how similar" they are in some random forest space.
# Now, for each person in group B, select the corresponding person in group A that is nearest in this space,
# and compute your differences on this subset.
# There's your synthetic control group. The neat thing about this 'distance',
# unlike both propensity score/Mahalanobis distance matching, is that it's
# a) the distance in the X space, and
# b) _supervised_--it's the similarity in terms of the Xs that matter to the prediction of y.
library(randomForest) | |
library(dplyr) | |
library(janitor) | |
library(purrr) | |
# Fix the RNG seed so the random group assignment below is reproducible.
set.seed(123)

# Keep the first four iris columns (the numeric measurements) with snake_case
# names. `as_tibble()` replaces the deprecated `as_data_frame()`.
df <- clean_names(as_tibble(iris))[, 1:4]

# Randomly assign every observation to group "A" or "B". Sampling has to be
# with replacement because more labels are drawn than there are distinct
# values; `replace` is now named instead of being passed positionally as `1`.
group <- sample(c("A", "B"), size = nrow(df), replace = TRUE)

# Logical masks marking the members of each group.
group_ix_a <- group == "A"
group_ix_b <- group == "B"
# Fit a random forest that predicts sepal length from the remaining columns,
# asking randomForest to also return the observation-by-observation proximity
# matrix.
fit <- randomForest(sepal_length ~ ., data = df, proximity = TRUE)
prox_mat <- fit$proximity

# An observation must never be selected as its own nearest neighbour.
diag(prox_mat) <- 0

# Zero the proximity of every pair of observations that belong to the same
# group, so that only cross-group pairs can be picked as matches. The mask is
# TRUE wherever row i and column j carry the same group label, which is
# exactly what the former row-by-row loop zeroed out.
same_group <- outer(group, group, "==")
prox_mat[same_group] <- 0
# For every observation (row of prox_mat), find the column index of the
# candidate with the highest proximity.
#
# `which.max()` returns exactly one index (the first on ties), so the result
# is always a plain integer vector. The previous `which(x == max(x))` returned
# several indices whenever proximities were tied, which turned `closest` into
# a ragged list and broke the indexing that uses it afterwards.
#
# Rows without any strictly positive proximity have no usable match and are
# reported as NA instead of an arbitrary index.
closest <- vapply(
  seq_len(nrow(prox_mat)),
  function(i) {
    row_prox <- prox_mat[i, ]
    best_j <- which.max(row_prox)
    if (length(best_j) == 0L || row_prox[[best_j]] <= 0) {
      return(NA_integer_)
    }
    unname(best_j)
  },
  integer(1)
)
# Assemble the matched pairs: one row per observation i, holding the index of
# its selected match j together with the outcome (sepal length) of both.
closest_ob <- mutate(
  data.frame(ix_i = row.names(df)),
  closest_ix_j = closest,
  y_i = df[["sepal_length"]],
  y_j = df[["sepal_length"]][closest]
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment