schaunwheeler · December 20, 2015 12:39
diff --git a/linkRecords.r b/linkRecords.r
 # The MIT License (MIT)
 #
 # Copyright (c) 2012 Schaun Jacob Wheeler
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
 # in the Software without restriction, including without limitation the rights
 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 # copies of the Software, and to permit persons to whom the Software is
 # furnished to do so, subject to the following conditions:
 #
 # The above copyright notice and this permission notice shall be included in all
 # copies or substantial portions of the Software.
 #
 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.

 linkRecords <- function(
  df_1,
  df_2,
  index,
  exclude,
  compare,
  block,
  compare_first_part = TRUE,
  block_trim = NULL,
  matching_threshold = 0.8,
  remove_duplicates = TRUE,
  sanity_checks = NULL,
  failsafe = NULL,
  similarity_measure = "levenshtein") {
  
  col_reference <- data.frame(
    "name" = c("index", "exclude", "compare", "block"),
    "order" = 1:4,
    stringsAsFactors = FALSE)
  
  # Create reference tables for first data frame
  df1_cols <- list(index[[1]], exclude[[1]], compare[[1]], block[[1]])
  df1_cols <- do.call("rbind", lapply(seq_along(df1_cols), function(i) {
    if(!is.null(df1_cols[[i]])) {
      out <- cbind(
        df1_cols[[i]], 
        paste0(col_reference$name[i], 1:length(df1_cols[[i]])))
    } else {
      out <- NULL
    }
    out
  }))
  
  colnames(df1_cols) <-  c("original", "reference")
  df1_cols <- as.data.frame(df1_cols, stringsAsFactors = FALSE)
  
  if(compare_first_part) {
    df1_insert <- df1_cols[grep("compare", df1_cols$reference),, drop = FALSE]
    df1_insert$reference <- paste(df1_insert$reference, "part", sep = "_")
    
    df1_cols <- rbind(df1_cols, df1_insert) 
  }
  
  # Create reference tables for second data frame
  df2_cols <- list(index[[2]], exclude[[2]], compare[[2]], block[[2]])
  df2_cols <- do.call("rbind", lapply(seq_along(df2_cols), function(i) {
    if(!is.null(df2_cols[[i]])) {
      out <- cbind(
        df2_cols[[i]], 
        paste0(col_reference$name[i], 1:length(df2_cols[[i]])))
    } else {
      out <- NULL
    }
    out
  }))
  
  colnames(df2_cols) <-  c("original", "reference")
  df2_cols <- as.data.frame(df2_cols, stringsAsFactors = FALSE)
  
  if(compare_first_part) {
    df2_insert <- df2_cols[grep("compare", df2_cols$reference),, drop = FALSE]
    df2_insert$reference <- paste(df2_insert$reference, "part", sep = "_")
    
    df2_cols <- rbind(df2_cols, df2_insert) 
  }
  
  # Create data frames for record linkage
  df1 <- df_1[, df1_cols$original]
  df1[] <- lapply(df1, tolower)
  df1[, grepl("_part$", df1_cols$reference)] <- lapply(
    df1[, grepl("_part$", df1_cols$reference), drop = FALSE], function(x) {
      gsub("^(\\w+)\\b.*", "\\1", x)
    })
  
  df2 <- df_2[, df2_cols$original]
  df2[] <- lapply(df2, tolower)
  df2[, grepl("_part$", df2_cols$reference)] <- lapply(
    df2[, grepl("_part$", df2_cols$reference), drop = FALSE], function(x) {
      gsub("^(\\w+)\\b.*", "\\1", x)
    })
  
  if(!is.null(block_trim)) {
    df1[, grepl("^block", df1_cols$reference)] <- lapply(
      df1[, grepl("^block", df1_cols$reference), drop = FALSE], function(x) {
        substring(x, block_trim[1], block_trim[2])
      })
    df2[, grepl("^block", df2_cols$reference)] <- lapply(
      df2[, grepl("^block", df2_cols$reference), drop = FALSE], function(x) {
        substring(x, block_trim[1], block_trim[2])
      })
  }
  
  df_matching_index <- match(
    gsub("^(index|exclude|block|compare).*", "\\1", df1_cols$reference), 
    gsub("^(index|exclude|block|compare).*", "\\1", col_reference$name)
  )
  df_matching_index <- order(df_matching_index, df1_cols$reference)
  df1 <- df1[,df_matching_index]
  df2 <- df2[,df_matching_index]
  
  colnames(df1) <- df1_cols$reference[df_matching_index]
  colnames(df2) <- df2_cols$reference[df_matching_index]
  
  exclude_cols <- grep("^(index|exclude)", colnames(df1))
  compare_cols <- grep("^compare", colnames(df1))
  block_cols <- grep("^block", colnames(df1))
  
  if(length(exclude_cols) == 0) {
    exclude_cols <- numeric(0)
  }

  if(length(compare_cols) == 0) {
    compare_cols <- numeric(0)
  }
  
  if(length(block_cols) == 0) {
    block_cols <- list()
  }
  
  results <- RLBigDataLinkage(
    dataset1 = df1,
    dataset2 = df2,
    exclude = exclude_cols,
    strcmp = compare_cols,
    strcmpfun = similarity_measure,
    blockfld = block_cols)
  
  results <- epiWeights(results)
  results <- epiClassify(results, matching_threshold)
  results <- getPairs(results, filter.link="link")
  results[] <- lapply(results, as.character)
  colnames(results) <- tolower(colnames(results))
  
  results$groupings <- rep(1:(sum(1:nrow(results) %% 3 == 0)), each = 3)
  results$groupings[1:nrow(results) %% 3 == 0] <- ""
  results <- results[results$groupings != "", ]
  results[results == ""] <- NA
  
  results$df1_rownum <- rep(NA, nrow(results))
  results$df1_rownum[1:nrow(results)%% 2 != 0] <- 
    results$id[1:nrow(results)%% 2 != 0]
  
  results$df2_rownum <- rep(NA, nrow(results))
  results$df2_rownum[1:nrow(results)%% 2 != 0] <- 
    results$id[1:nrow(results)%% 2 == 0]
  
  results$weight[1:nrow(results)%% 2 != 0] <- 
    results$weight[1:nrow(results)%% 2 == 0]
  
  results$df1_index1 <- rep(NA, nrow(results))
  results$df1_index1[1:nrow(results)%% 2 != 0] <- 
    results$index1[1:nrow(results)%% 2 != 0]
  
  results$df2_index1 <- rep(NA, nrow(results))
  results$df2_index1[1:nrow(results)%% 2 != 0] <- 
    results$index1[1:nrow(results)%% 2 == 0]
  
  results$df1_compare1 <- rep(NA, nrow(results))
  results$df1_compare1[1:nrow(results)%% 2 != 0] <- 
    results$compare1[1:nrow(results)%% 2 != 0]
  
  results$df2_compare1 <- rep(NA, nrow(results))
  results$df2_compare1[1:nrow(results)%% 2 != 0] <- 
    results$compare1[1:nrow(results)%% 2 == 0]
  
  if(!is.null(sanity_checks)) {
    
    for(x in sanity_checks) {
      
      results$df2_compare1[
        (grepl(x, results$df1_compare1, perl = TRUE) & 
            !grepl(x, results$df2_compare1, perl = TRUE)) |
          (!grepl(x, results$df1_compare1) & 
              grepl(x, results$df2_compare1))] <- NA
    }
  }
  
  results <- results[!is.na(results$df2_compare1),]
  results <- results[order(as.numeric(results$weight), decreasing = TRUE),]
  results <- results[order(as.numeric(results$id), decreasing = FALSE),]
  
  if(remove_duplicates) {
    results <- results[!duplicated(results$id),]
  }
  
  results <- results[, 
    c(grep("^df\\d_", colnames(results), value = TRUE), "weight")]
  results$df1_rownum <- as.numeric(results$df1_rownum)
  results$df2_rownum <- as.numeric(results$df2_rownum)
  results$weight <- as.numeric(results$weight)
  
  
  if(!is.null(failsafe)) {
    
    df1_failsafe <- df_1[, failsafe[[1]]]
    df2_failsafe <- df_2[, failsafe[[2]]]
    
    failsafe_match <- data.frame(
      lapply(df2_failsafe, function(y) {
      match(df1_failsafe, y, incomparables = NA)
    }),
      stringsAsFactors = FALSE)
   
    if(ncol(failsafe_match) > 1) {      
      for(i in ncol(failsafe_match):2) {
        
        failsafe_match[!is.na(failsafe_match[,i]), i - 1] <- 
          failsafe_match[!is.na(failsafe_match[,i]), i]
      }
    }
    
    failsafe_output <- data.frame(
      "df1_rownum" = 1:nrow(failsafe_match),
      "df2_rownum" = failsafe_match[,1],
      stringsAsFactors = FALSE)
    
    failsafe_output <- failsafe_output[!is.na(failsafe_output$df2_rownum),]
    
    failsafe_replace <- data.frame(
      "df1_rownum" = failsafe_output$df1_rownum,
      "df2_rownum" = failsafe_output$df2_rownum,
      "df1_index1" = df1$index1[failsafe_output$df1_rownum],
      "df2_index1" = df2$index1[failsafe_output$df2_rownum],
      "df1_compare1" = df1$compare1[failsafe_output$df1_rownum],
      "df2_compare1" = df2$compare1[failsafe_output$df2_rownum],
      "weight" = rep(NA, nrow(failsafe_output)),
      stringsAsFactors = FALSE)
      
    replace_ind <- match(failsafe_replace$df1_rownum, results$df1_rownum)
    
    results[na.omit(replace_ind),] <- failsafe_replace[!is.na(replace_ind),]
    
    results <- rbind(results, failsafe_replace[is.na(replace_ind),])
    
    results <- results[order(results$df1_rownum, decreasing = FALSE),]
  }
       
  rownames(results) <- NULL
  
  results
 }
	# The MIT License (MIT)
	#
	# Copyright (c) 2012 Schaun Jacob Wheeler
	#
	# Permission is hereby granted, free of charge, to any person obtaining a copy
	# of this software and associated documentation files (the "Software"), to deal
	# in the Software without restriction, including without limitation the rights
	# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
	# copies of the Software, and to permit persons to whom the Software is
	# furnished to do so, subject to the following conditions:
	#
	# The above copyright notice and this permission notice shall be included in all
	# copies or substantial portions of the Software.
	#
	# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
	# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
	# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
	# SOFTWARE.

	linkRecords <- function(
	df_1,
	df_2,
	index,
	exclude,
	compare,
	block,
	compare_first_part = TRUE,
	block_trim = NULL,
	matching_threshold = 0.8,
	remove_duplicates = TRUE,
	sanity_checks = NULL,
	failsafe = NULL,
	similarity_measure = "levenshtein") {

	col_reference <- data.frame(
	"name" = c("index", "exclude", "compare", "block"),
	"order" = 1:4,
	stringsAsFactors = FALSE)

	# Create reference tables for first data frame
	df1_cols <- list(index[[1]], exclude[[1]], compare[[1]], block[[1]])
	df1_cols <- do.call("rbind", lapply(seq_along(df1_cols), function(i) {
	if(!is.null(df1_cols[[i]])) {
	out <- cbind(
	df1_cols[[i]],
	paste0(col_reference$name[i], 1:length(df1_cols[[i]])))
	} else {
	out <- NULL
	}
	out
	}))

	colnames(df1_cols) <- c("original", "reference")
	df1_cols <- as.data.frame(df1_cols, stringsAsFactors = FALSE)

	if(compare_first_part) {
	df1_insert <- df1_cols[grep("compare", df1_cols$reference),, drop = FALSE]
	df1_insert$reference <- paste(df1_insert$reference, "part", sep = "_")

	df1_cols <- rbind(df1_cols, df1_insert)
	}

	# Create reference tables for second data frame
	df2_cols <- list(index[[2]], exclude[[2]], compare[[2]], block[[2]])
	df2_cols <- do.call("rbind", lapply(seq_along(df2_cols), function(i) {
	if(!is.null(df2_cols[[i]])) {
	out <- cbind(
	df2_cols[[i]],
	paste0(col_reference$name[i], 1:length(df2_cols[[i]])))
	} else {
	out <- NULL
	}
	out
	}))

	colnames(df2_cols) <- c("original", "reference")
	df2_cols <- as.data.frame(df2_cols, stringsAsFactors = FALSE)

	if(compare_first_part) {
	df2_insert <- df2_cols[grep("compare", df2_cols$reference),, drop = FALSE]
	df2_insert$reference <- paste(df2_insert$reference, "part", sep = "_")

	df2_cols <- rbind(df2_cols, df2_insert)
	}

	# Create data frames for record linkage
	df1 <- df_1[, df1_cols$original]
	df1[] <- lapply(df1, tolower)
	df1[, grepl("_part$", df1_cols$reference)] <- lapply(
	df1[, grepl("_part$", df1_cols$reference), drop = FALSE], function(x) {
	gsub("^(\\w+)\\b.*", "\\1", x)
	})

	df2 <- df_2[, df2_cols$original]
	df2[] <- lapply(df2, tolower)
	df2[, grepl("_part$", df2_cols$reference)] <- lapply(
	df2[, grepl("_part$", df2_cols$reference), drop = FALSE], function(x) {
	gsub("^(\\w+)\\b.*", "\\1", x)
	})

	if(!is.null(block_trim)) {
	df1[, grepl("^block", df1_cols$reference)] <- lapply(
	df1[, grepl("^block", df1_cols$reference), drop = FALSE], function(x) {
	substring(x, block_trim[1], block_trim[2])
	})
	df2[, grepl("^block", df2_cols$reference)] <- lapply(
	df2[, grepl("^block", df2_cols$reference), drop = FALSE], function(x) {
	substring(x, block_trim[1], block_trim[2])
	})
	}

	df_matching_index <- match(
	gsub("^(index\|exclude\|block\|compare).*", "\\1", df1_cols$reference),
	gsub("^(index\|exclude\|block\|compare).*", "\\1", col_reference$name)
	)
	df_matching_index <- order(df_matching_index, df1_cols$reference)
	df1 <- df1[,df_matching_index]
	df2 <- df2[,df_matching_index]

	colnames(df1) <- df1_cols$reference[df_matching_index]
	colnames(df2) <- df2_cols$reference[df_matching_index]

	exclude_cols <- grep("^(index\|exclude)", colnames(df1))
	compare_cols <- grep("^compare", colnames(df1))
	block_cols <- grep("^block", colnames(df1))

	if(length(exclude_cols) == 0) {
	exclude_cols <- numeric(0)
	}

	if(length(compare_cols) == 0) {
	compare_cols <- numeric(0)
	}

	if(length(block_cols) == 0) {
	block_cols <- list()
	}

	results <- RLBigDataLinkage(
	dataset1 = df1,
	dataset2 = df2,
	exclude = exclude_cols,
	strcmp = compare_cols,
	strcmpfun = similarity_measure,
	blockfld = block_cols)

	results <- epiWeights(results)
	results <- epiClassify(results, matching_threshold)
	results <- getPairs(results, filter.link="link")
	results[] <- lapply(results, as.character)
	colnames(results) <- tolower(colnames(results))

	results$groupings <- rep(1:(sum(1:nrow(results) %% 3 == 0)), each = 3)
	results$groupings[1:nrow(results) %% 3 == 0] <- ""
	results <- results[results$groupings != "", ]
	results[results == ""] <- NA

	results$df1_rownum <- rep(NA, nrow(results))
	results$df1_rownum[1:nrow(results)%% 2 != 0] <-
	results$id[1:nrow(results)%% 2 != 0]

	results$df2_rownum <- rep(NA, nrow(results))
	results$df2_rownum[1:nrow(results)%% 2 != 0] <-
	results$id[1:nrow(results)%% 2 == 0]

	results$weight[1:nrow(results)%% 2 != 0] <-
	results$weight[1:nrow(results)%% 2 == 0]

	results$df1_index1 <- rep(NA, nrow(results))
	results$df1_index1[1:nrow(results)%% 2 != 0] <-
	results$index1[1:nrow(results)%% 2 != 0]

	results$df2_index1 <- rep(NA, nrow(results))
	results$df2_index1[1:nrow(results)%% 2 != 0] <-
	results$index1[1:nrow(results)%% 2 == 0]

	results$df1_compare1 <- rep(NA, nrow(results))
	results$df1_compare1[1:nrow(results)%% 2 != 0] <-
	results$compare1[1:nrow(results)%% 2 != 0]

	results$df2_compare1 <- rep(NA, nrow(results))
	results$df2_compare1[1:nrow(results)%% 2 != 0] <-
	results$compare1[1:nrow(results)%% 2 == 0]

	if(!is.null(sanity_checks)) {

	for(x in sanity_checks) {

	results$df2_compare1[
	(grepl(x, results$df1_compare1, perl = TRUE) &
	!grepl(x, results$df2_compare1, perl = TRUE)) \|
	(!grepl(x, results$df1_compare1) &
	grepl(x, results$df2_compare1))] <- NA
	}
	}

	results <- results[!is.na(results$df2_compare1),]
	results <- results[order(as.numeric(results$weight), decreasing = TRUE),]
	results <- results[order(as.numeric(results$id), decreasing = FALSE),]

	if(remove_duplicates) {
	results <- results[!duplicated(results$id),]
	}

	results <- results[,
	c(grep("^df\\d_", colnames(results), value = TRUE), "weight")]
	results$df1_rownum <- as.numeric(results$df1_rownum)
	results$df2_rownum <- as.numeric(results$df2_rownum)
	results$weight <- as.numeric(results$weight)


	if(!is.null(failsafe)) {

	df1_failsafe <- df_1[, failsafe[[1]]]
	df2_failsafe <- df_2[, failsafe[[2]]]

	failsafe_match <- data.frame(
	lapply(df2_failsafe, function(y) {
	match(df1_failsafe, y, incomparables = NA)
	}),
	stringsAsFactors = FALSE)

	if(ncol(failsafe_match) > 1) {
	for(i in ncol(failsafe_match):2) {

	failsafe_match[!is.na(failsafe_match[,i]), i - 1] <-
	failsafe_match[!is.na(failsafe_match[,i]), i]
	}
	}

	failsafe_output <- data.frame(
	"df1_rownum" = 1:nrow(failsafe_match),
	"df2_rownum" = failsafe_match[,1],
	stringsAsFactors = FALSE)

	failsafe_output <- failsafe_output[!is.na(failsafe_output$df2_rownum),]

	failsafe_replace <- data.frame(
	"df1_rownum" = failsafe_output$df1_rownum,
	"df2_rownum" = failsafe_output$df2_rownum,
	"df1_index1" = df1$index1[failsafe_output$df1_rownum],
	"df2_index1" = df2$index1[failsafe_output$df2_rownum],
	"df1_compare1" = df1$compare1[failsafe_output$df1_rownum],
	"df2_compare1" = df2$compare1[failsafe_output$df2_rownum],
	"weight" = rep(NA, nrow(failsafe_output)),
	stringsAsFactors = FALSE)

	replace_ind <- match(failsafe_replace$df1_rownum, results$df1_rownum)

	results[na.omit(replace_ind),] <- failsafe_replace[!is.na(replace_ind),]

	results <- rbind(results, failsafe_replace[is.na(replace_ind),])

	results <- results[order(results$df1_rownum, decreasing = FALSE),]
	}

	rownames(results) <- NULL

	results
	}