Skip to content

Instantly share code, notes, and snippets.

@schaunwheeler
Last active December 20, 2015 12:39
Show Gist options
  • Save schaunwheeler/6133217 to your computer and use it in GitHub Desktop.
Save schaunwheeler/6133217 to your computer and use it in GitHub Desktop.
A wrapper for the `RLBigDataLinkage` function in the `RecordLinkage` package.
# The MIT License (MIT)
#
# Copyright (c) 2012 Schaun Jacob Wheeler
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
linkRecords <- function(
df_1,
df_2,
index,
exclude,
compare,
block,
compare_first_part = TRUE,
block_trim = NULL,
matching_threshold = 0.8,
remove_duplicates = TRUE,
sanity_checks = NULL,
failsafe = NULL,
similarity_measure = "levenshtein") {
col_reference <- data.frame(
"name" = c("index", "exclude", "compare", "block"),
"order" = 1:4,
stringsAsFactors = FALSE)
# Create reference tables for first data frame
df1_cols <- list(index[[1]], exclude[[1]], compare[[1]], block[[1]])
df1_cols <- do.call("rbind", lapply(seq_along(df1_cols), function(i) {
if(!is.null(df1_cols[[i]])) {
out <- cbind(
df1_cols[[i]],
paste0(col_reference$name[i], 1:length(df1_cols[[i]])))
} else {
out <- NULL
}
out
}))
colnames(df1_cols) <- c("original", "reference")
df1_cols <- as.data.frame(df1_cols, stringsAsFactors = FALSE)
if(compare_first_part) {
df1_insert <- df1_cols[grep("compare", df1_cols$reference),, drop = FALSE]
df1_insert$reference <- paste(df1_insert$reference, "part", sep = "_")
df1_cols <- rbind(df1_cols, df1_insert)
}
# Create reference tables for second data frame
df2_cols <- list(index[[2]], exclude[[2]], compare[[2]], block[[2]])
df2_cols <- do.call("rbind", lapply(seq_along(df2_cols), function(i) {
if(!is.null(df2_cols[[i]])) {
out <- cbind(
df2_cols[[i]],
paste0(col_reference$name[i], 1:length(df2_cols[[i]])))
} else {
out <- NULL
}
out
}))
colnames(df2_cols) <- c("original", "reference")
df2_cols <- as.data.frame(df2_cols, stringsAsFactors = FALSE)
if(compare_first_part) {
df2_insert <- df2_cols[grep("compare", df2_cols$reference),, drop = FALSE]
df2_insert$reference <- paste(df2_insert$reference, "part", sep = "_")
df2_cols <- rbind(df2_cols, df2_insert)
}
# Create data frames for record linkage
df1 <- df_1[, df1_cols$original]
df1[] <- lapply(df1, tolower)
df1[, grepl("_part$", df1_cols$reference)] <- lapply(
df1[, grepl("_part$", df1_cols$reference), drop = FALSE], function(x) {
gsub("^(\\w+)\\b.*", "\\1", x)
})
df2 <- df_2[, df2_cols$original]
df2[] <- lapply(df2, tolower)
df2[, grepl("_part$", df2_cols$reference)] <- lapply(
df2[, grepl("_part$", df2_cols$reference), drop = FALSE], function(x) {
gsub("^(\\w+)\\b.*", "\\1", x)
})
if(!is.null(block_trim)) {
df1[, grepl("^block", df1_cols$reference)] <- lapply(
df1[, grepl("^block", df1_cols$reference), drop = FALSE], function(x) {
substring(x, block_trim[1], block_trim[2])
})
df2[, grepl("^block", df2_cols$reference)] <- lapply(
df2[, grepl("^block", df2_cols$reference), drop = FALSE], function(x) {
substring(x, block_trim[1], block_trim[2])
})
}
df_matching_index <- match(
gsub("^(index|exclude|block|compare).*", "\\1", df1_cols$reference),
gsub("^(index|exclude|block|compare).*", "\\1", col_reference$name)
)
df_matching_index <- order(df_matching_index, df1_cols$reference)
df1 <- df1[,df_matching_index]
df2 <- df2[,df_matching_index]
colnames(df1) <- df1_cols$reference[df_matching_index]
colnames(df2) <- df2_cols$reference[df_matching_index]
exclude_cols <- grep("^(index|exclude)", colnames(df1))
compare_cols <- grep("^compare", colnames(df1))
block_cols <- grep("^block", colnames(df1))
if(length(exclude_cols) == 0) {
exclude_cols <- numeric(0)
}
if(length(compare_cols) == 0) {
compare_cols <- numeric(0)
}
if(length(block_cols) == 0) {
block_cols <- list()
}
results <- RLBigDataLinkage(
dataset1 = df1,
dataset2 = df2,
exclude = exclude_cols,
strcmp = compare_cols,
strcmpfun = similarity_measure,
blockfld = block_cols)
results <- epiWeights(results)
results <- epiClassify(results, matching_threshold)
results <- getPairs(results, filter.link="link")
results[] <- lapply(results, as.character)
colnames(results) <- tolower(colnames(results))
results$groupings <- rep(1:(sum(1:nrow(results) %% 3 == 0)), each = 3)
results$groupings[1:nrow(results) %% 3 == 0] <- ""
results <- results[results$groupings != "", ]
results[results == ""] <- NA
results$df1_rownum <- rep(NA, nrow(results))
results$df1_rownum[1:nrow(results)%% 2 != 0] <-
results$id[1:nrow(results)%% 2 != 0]
results$df2_rownum <- rep(NA, nrow(results))
results$df2_rownum[1:nrow(results)%% 2 != 0] <-
results$id[1:nrow(results)%% 2 == 0]
results$weight[1:nrow(results)%% 2 != 0] <-
results$weight[1:nrow(results)%% 2 == 0]
results$df1_index1 <- rep(NA, nrow(results))
results$df1_index1[1:nrow(results)%% 2 != 0] <-
results$index1[1:nrow(results)%% 2 != 0]
results$df2_index1 <- rep(NA, nrow(results))
results$df2_index1[1:nrow(results)%% 2 != 0] <-
results$index1[1:nrow(results)%% 2 == 0]
results$df1_compare1 <- rep(NA, nrow(results))
results$df1_compare1[1:nrow(results)%% 2 != 0] <-
results$compare1[1:nrow(results)%% 2 != 0]
results$df2_compare1 <- rep(NA, nrow(results))
results$df2_compare1[1:nrow(results)%% 2 != 0] <-
results$compare1[1:nrow(results)%% 2 == 0]
if(!is.null(sanity_checks)) {
for(x in sanity_checks) {
results$df2_compare1[
(grepl(x, results$df1_compare1, perl = TRUE) &
!grepl(x, results$df2_compare1, perl = TRUE)) |
(!grepl(x, results$df1_compare1) &
grepl(x, results$df2_compare1))] <- NA
}
}
results <- results[!is.na(results$df2_compare1),]
results <- results[order(as.numeric(results$weight), decreasing = TRUE),]
results <- results[order(as.numeric(results$id), decreasing = FALSE),]
if(remove_duplicates) {
results <- results[!duplicated(results$id),]
}
results <- results[,
c(grep("^df\\d_", colnames(results), value = TRUE), "weight")]
results$df1_rownum <- as.numeric(results$df1_rownum)
results$df2_rownum <- as.numeric(results$df2_rownum)
results$weight <- as.numeric(results$weight)
if(!is.null(failsafe)) {
df1_failsafe <- df_1[, failsafe[[1]]]
df2_failsafe <- df_2[, failsafe[[2]]]
failsafe_match <- data.frame(
lapply(df2_failsafe, function(y) {
match(df1_failsafe, y, incomparables = NA)
}),
stringsAsFactors = FALSE)
if(ncol(failsafe_match) > 1) {
for(i in ncol(failsafe_match):2) {
failsafe_match[!is.na(failsafe_match[,i]), i - 1] <-
failsafe_match[!is.na(failsafe_match[,i]), i]
}
}
failsafe_output <- data.frame(
"df1_rownum" = 1:nrow(failsafe_match),
"df2_rownum" = failsafe_match[,1],
stringsAsFactors = FALSE)
failsafe_output <- failsafe_output[!is.na(failsafe_output$df2_rownum),]
failsafe_replace <- data.frame(
"df1_rownum" = failsafe_output$df1_rownum,
"df2_rownum" = failsafe_output$df2_rownum,
"df1_index1" = df1$index1[failsafe_output$df1_rownum],
"df2_index1" = df2$index1[failsafe_output$df2_rownum],
"df1_compare1" = df1$compare1[failsafe_output$df1_rownum],
"df2_compare1" = df2$compare1[failsafe_output$df2_rownum],
"weight" = rep(NA, nrow(failsafe_output)),
stringsAsFactors = FALSE)
replace_ind <- match(failsafe_replace$df1_rownum, results$df1_rownum)
results[na.omit(replace_ind),] <- failsafe_replace[!is.na(replace_ind),]
results <- rbind(results, failsafe_replace[is.na(replace_ind),])
results <- results[order(results$df1_rownum, decreasing = FALSE),]
}
rownames(results) <- NULL
results
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment