conormm · March 5, 2018 19:35
diff --git a/random_forest_proximity_matrix.r b/random_forest_proximity_matrix.r
 # inspired by these tweets:

 #The idea is very simple. You want to compare two groups' outcomes on some metric y. 
 #But the two groups are quite different in their covariates X. 
 #One approach is to predict y given X using a random forest, 
 #saving the proximity matrix. The (i, j)th entry in this matrix is the
 #proportion of trees in which observations i and j share a terminal node.
 #That is, "how similar" they are in some random forest space. 
 #Now, for each person in group B, select the corresponding person in group A that is nearest in this space.
 #And compute your differences on this subset
 #There's your synthetic control group. The neat thing about this 'distance', 
 #unlike both propensity score/Mahalanobis distance matching, is that it's 
 #a) the distance in the X space, and 
 #b) _supervised_--it's the similarity in terms of the Xs that matter to the prediction of y.

 library(randomForest)
 library(dplyr)
 library(janitor)
 library(purrr)

 set.seed(123)

 # Example data: the first four iris columns, renamed to snake_case by
 # clean_names(). sepal_length is the outcome y; the other three columns are X.
 # as_tibble() is the supported replacement for the deprecated as_data_frame().
 df <- clean_names(as_tibble(iris))[, 1:4]
 # Randomly label every row "A" or "B". Sampling must be with replacement
 # because more labels are drawn than there are distinct values; naming the
 # arguments makes that explicit instead of relying on positional matching.
 group <- sample(c("A", "B"), size = nrow(df), replace = TRUE)
 group_ix_a <- group == "A"
 group_ix_b <- group == "B"
 # Regress y on X and keep the proximity matrix, which drives the matching.
 fit <- randomForest(sepal_length ~ ., data = df, proximity = TRUE)

 # Start from the proximity matrix produced by the forest.
 prox_mat <- fit$proximity
 # An observation must never be matched to itself.
 diag(prox_mat) <- 0
 #prox_mat <- cbind(prox_mat, group)

 # Zero out same-group pairs so that only cross-group proximities remain:
 # rows labelled "A" lose their group-A columns, every other row loses its
 # group-B columns.
 prox_mat[group == "A", group_ix_a] <- 0L
 prox_mat[group != "A", group_ix_b] <- 0L

 # For every observation, find the index of the most similar observation in
 # the *other* group. Same-group columns (which include the observation
 # itself) are set to NA so they can never win, even when every cross-group
 # proximity in the row is 0. which.max() returns exactly one index (the first
 # on ties), so `closest` is always a plain integer vector; comparing against
 # max() with `==` returns several indices on ties, which turns the result
 # into a list that cannot be used to subset the outcome.
 closest <- local({
   candidates <- prox_mat
   candidates[outer(group, group, "==")] <- NA
   vapply(
     seq_len(nrow(candidates)),
     function(i) {
       best <- which.max(candidates[i, ])
       # No observation in the other group at all: report a missing match.
       if (length(best) == 0L) NA_integer_ else unname(best)
     },
     integer(1)
   )
 })

 # One row per observation: its own index and outcome (ix_i, y_i) next to the
 # index and outcome of the observation selected in `closest`
 # (closest_ix_j, y_j).
 closest_ob <- mutate(
   data.frame(ix_i = row.names(df)),
   closest_ix_j = closest,
   y_i = df[["sepal_length"]],
   y_j = df[["sepal_length"]][closest]
 )
	# inspired by these tweets:

	#The idea is very simple. You want to compare two groups' outcomes on some metric y.
	#But the two groups are quite different in their covariates X.
	#One approach is to predict y given X using a random forest,
	#saving the proximity matrix. The (i, j)th entry in this matrix is the
	#proportion of trees in which observations i and j share a terminal node.
	#That is, "how similar" they are in some random forest space.
	#Now, for each person in group B, select the corresponding person in group A that is nearest in this space.
	#And compute your differences on this subset
	#There's your synthetic control group. The neat thing about this 'distance',
	#unlike both propensity score/Mahalanobis distance matching, is that it's
	#a) the distance in the X space, and
	#b) _supervised_--it's the similarity in terms of the Xs that matter to the prediction of y.

	library(randomForest)
	library(dplyr)
	library(janitor)
	library(purrr)

	set.seed(123)

	# Example data: the first four iris columns, renamed to snake_case by
	# clean_names(). sepal_length is the outcome y; the other three columns are X.
	# as_tibble() is the supported replacement for the deprecated as_data_frame().
	df <- clean_names(as_tibble(iris))[, 1:4]
	# Randomly label every row "A" or "B". Sampling must be with replacement
	# because more labels are drawn than there are distinct values; naming the
	# arguments makes that explicit instead of relying on positional matching.
	group <- sample(c("A", "B"), size = nrow(df), replace = TRUE)
	group_ix_a <- group == "A"
	group_ix_b <- group == "B"
	# Regress y on X and keep the proximity matrix, which drives the matching.
	fit <- randomForest(sepal_length ~ ., data = df, proximity = TRUE)

	# Start from the proximity matrix produced by the forest.
	prox_mat <- fit$proximity
	# An observation must never be matched to itself.
	diag(prox_mat) <- 0
	#prox_mat <- cbind(prox_mat, group)

	# Zero out same-group pairs so that only cross-group proximities remain:
	# rows labelled "A" lose their group-A columns, every other row loses its
	# group-B columns.
	prox_mat[group == "A", group_ix_a] <- 0L
	prox_mat[group != "A", group_ix_b] <- 0L

	# For every observation, find the index of the most similar observation in
	# the *other* group. Same-group columns (which include the observation
	# itself) are set to NA so they can never win, even when every cross-group
	# proximity in the row is 0. which.max() returns exactly one index (the first
	# on ties), so `closest` is always a plain integer vector; comparing against
	# max() with `==` returns several indices on ties, which turns the result
	# into a list that cannot be used to subset the outcome.
	closest <- local({
	  candidates <- prox_mat
	  candidates[outer(group, group, "==")] <- NA
	  vapply(
	    seq_len(nrow(candidates)),
	    function(i) {
	      best <- which.max(candidates[i, ])
	      # No observation in the other group at all: report a missing match.
	      if (length(best) == 0L) NA_integer_ else unname(best)
	    },
	    integer(1)
	  )
	})

	# One row per observation: its own index and outcome (ix_i, y_i) next to the
	# index and outcome of the observation selected in `closest`
	# (closest_ix_j, y_j).
	closest_ob <- mutate(
	  data.frame(ix_i = row.names(df)),
	  closest_ix_j = closest,
	  y_i = df[["sepal_length"]],
	  y_j = df[["sepal_length"]][closest]
	)