Created
August 29, 2017 19:48
-
-
Save conjugateprior/6a38f440bbd624ee04969730a9922e61 to your computer and use it in GitHub Desktop.
Messy merge
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(dplyr) | |
library(stringdist) # for approximate matching with 'amatch' | |
DB1 <- data_frame(name1 = c("John Q. Smith", "Sajeet Rau", | |
"Federico K. Giordino"), | |
judge_variable = c(1, 2, 3)) | |
DB2 <- data_frame(name2 = c("John Smith", "John Q. Smith", | |
"F. K. Giordino","Federico K. Giordino"), | |
case_variable = c(1, 1, 1, 2)) | |
# left join to get exact matches (there's probably an easier way) | |
exacts <- left_join(DB2, DB1, by = c('name2' = 'name1')) | |
miss_inds <- which(is.na(exacts$judge_variable)) | |
misses <- exacts$name2[miss_inds] | |
targets <- exacts$name2[-miss_inds] | |
mtchs <- amatch(misses, targets, maxDist = Inf) # <- tweak distance calc here | |
key <- data_frame(old = misses, | |
new_name2 = targets[mtchs]) | |
# fill in new_name2 from the key and existing name2, then merge | |
# DB1 info using new_name2 | |
disambig <- left_join(DB2, key, by = c('name2' = 'old')) %>% | |
mutate(new_name2 = if_else(is.na(new_name2), name2, new_name2)) %>% | |
left_join(DB1, by = c("new_name2" = "name1")) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment