conjugateprior · August 29, 2017 19:48
diff --git a/messy-merge.R b/messy-merge.R
 library(dplyr)
 library(stringdist) # for approximate matching with 'amatch'

 DB1 <- data_frame(name1 = c("John Q. Smith", "Sajeet Rau",
                            "Federico K. Giordino"),
                  judge_variable = c(1, 2, 3))

 DB2 <- data_frame(name2 = c("John Smith", "John Q. Smith",
                            "F. K. Giordino","Federico K. Giordino"),
                  case_variable = c(1, 1, 1, 2))

 # left join to get exact matches (there's probably an easier way)
 exacts <- left_join(DB2, DB1, by = c('name2' = 'name1'))

 miss_inds <- which(is.na(exacts$judge_variable))
 misses <- exacts$name2[miss_inds]
 targets <- exacts$name2[-miss_inds]

 mtchs <- amatch(misses, targets, maxDist = Inf) # <- tweak distance calc here

 key <- data_frame(old = misses,
                  new_name2 = targets[mtchs])

 # fill in new_name2 from the key and existing name2, then merge
 # DB1 info using new_name2
 disambig <- left_join(DB2, key, by = c('name2' = 'old')) %>%
  mutate(new_name2 = if_else(is.na(new_name2), name2, new_name2)) %>%
  left_join(DB1, by = c("new_name2" = "name1"))
	library(dplyr)
	library(stringdist) # for approximate matching with 'amatch'

	DB1 <- data_frame(name1 = c("John Q. Smith", "Sajeet Rau",
	"Federico K. Giordino"),
	judge_variable = c(1, 2, 3))

	DB2 <- data_frame(name2 = c("John Smith", "John Q. Smith",
	"F. K. Giordino","Federico K. Giordino"),
	case_variable = c(1, 1, 1, 2))

	# left join to get exact matches (there's probably an easier way)
	exacts <- left_join(DB2, DB1, by = c('name2' = 'name1'))

	miss_inds <- which(is.na(exacts$judge_variable))
	misses <- exacts$name2[miss_inds]
	targets <- exacts$name2[-miss_inds]

	mtchs <- amatch(misses, targets, maxDist = Inf) # <- tweak distance calc here

	key <- data_frame(old = misses,
	new_name2 = targets[mtchs])

	# fill in new_name2 from the key and existing name2, then merge
	# DB1 info using new_name2
	disambig <- left_join(DB2, key, by = c('name2' = 'old')) %>%
	mutate(new_name2 = if_else(is.na(new_name2), name2, new_name2)) %>%
	left_join(DB1, by = c("new_name2" = "name1"))