sfirke · October 19, 2016 03:10 · sfirke · Oct 19, 2016
diff --git a/gistfile1.txt b/gistfile1.txt

 library(pacman)
 p_load(fuzzyjoin, dplyr)
 # Returns clusters of records that almost (but not exactly) match.
 #
 # x:        a one-column data frame of strings; the join result is read
 #           positionally (column 1 = left value, column 2 = right value).
 # max_dist: maximum string distance for two values to count as a near match.
 #
 # Value: a tibble with one row per unordered near-matching pair, with columns
 # instance1, instance2, distance (kept numeric) and the cluster label added
 # by assign_clusters().
 get_fuzzy_dupes <- function(x, max_dist = 2) {
   joined <- stringdist_inner_join(
     x, x,
     max_dist = max_dist, distance_col = "distance"
   )
   left <- as.character(joined[[1]])
   right <- as.character(joined[[2]])

   # Drop exact (100%) matches; which() also discards NA comparisons.
   keep <- which(left != right)

   # Treat (A, B) and (B, A) as the same pair by ordering the two names within
   # each row. Only the names are ordered, so the distance can never be
   # mistaken for a name and is not coerced to character.
   pairs <- data.frame(
     instance1 = pmin(left, right)[keep],
     instance2 = pmax(left, right)[keep],
     distance = joined[["distance"]][keep],
     stringsAsFactors = FALSE
   )
   pairs <- pairs[!duplicated(pairs), , drop = FALSE]

   # Nothing to cluster: return an empty result with the usual columns.
   if (nrow(pairs) == 0L) {
     pairs$cluster <- character(0)
     return(as_tibble(pairs))
   }

   pairs %>%
     as_tibble() %>%
     arrange(instance1) %>%
     assign_clusters()
 }


 # Assigns near-match duplicate pairs to clusters, for easier cleaning.
 # Helper function called by get_fuzzy_dupes.
 #
 # dat: a data frame with columns instance1 and instance2, one pair per row.
 #
 # Value: dat with an added `cluster` column. A cluster is labelled with the
 # instance1 value of the row that started it. Rows are scanned top to bottom:
 # a row that shares a value with any earlier row joins that row's cluster,
 # otherwise it starts a new one. When a row links two clusters that were
 # separate so far, they are merged under the label of the earliest match.
 # Inputs with zero rows or one row are handled.
 assign_clusters <- function(dat) {
   first <- dat[["instance1"]]
   second <- dat[["instance2"]]

   # Every row starts as its own cluster; matched rows are overwritten below.
   cluster <- first

   # seq_len(n)[-1] is empty for n < 2, unlike 2:n which would count down.
   for (i in seq_len(nrow(dat))[-1]) {
     earlier <- seq_len(i - 1L)
     pair <- c(first[i], second[i])
     hits <- which(first[earlier] %in% pair | second[earlier] %in% pair)

     if (length(hits) > 0L) {
       labels <- unique(cluster[hits])
       cluster[i] <- labels[1]
       if (length(labels) > 1L) {
         # This pair bridges several clusters: fold them into one.
         merged <- earlier[cluster[earlier] %in% labels]
         cluster[merged] <- labels[1]
       }
     }
   }

   dat$cluster <- cluster
   dat
 }

 # Build a one-column data frame (the mtcars row names) to experiment with
 dat <- transmute(mtcars, cars = row.names(mtcars))

 # Example calls at two distance thresholds
 get_fuzzy_dupes(dat, max_dist = 2)
 get_fuzzy_dupes(dat, max_dist = 1)

	library(pacman)
	p_load(fuzzyjoin, dplyr)
	# Returns clusters of records that almost (but not exactly) match.
	#
	# x:        a one-column data frame of strings; the join result is read
	#           positionally (column 1 = left value, column 2 = right value).
	# max_dist: maximum string distance for two values to count as a near match.
	#
	# Value: a tibble with one row per unordered near-matching pair, with columns
	# instance1, instance2, distance (kept numeric) and the cluster label added
	# by assign_clusters().
	get_fuzzy_dupes <- function(x, max_dist = 2) {
	  joined <- stringdist_inner_join(
	    x, x,
	    max_dist = max_dist, distance_col = "distance"
	  )
	  left <- as.character(joined[[1]])
	  right <- as.character(joined[[2]])

	  # Drop exact (100%) matches; which() also discards NA comparisons.
	  keep <- which(left != right)

	  # Treat (A, B) and (B, A) as the same pair by ordering the two names within
	  # each row. Only the names are ordered, so the distance can never be
	  # mistaken for a name and is not coerced to character.
	  pairs <- data.frame(
	    instance1 = pmin(left, right)[keep],
	    instance2 = pmax(left, right)[keep],
	    distance = joined[["distance"]][keep],
	    stringsAsFactors = FALSE
	  )
	  pairs <- pairs[!duplicated(pairs), , drop = FALSE]

	  # Nothing to cluster: return an empty result with the usual columns.
	  if (nrow(pairs) == 0L) {
	    pairs$cluster <- character(0)
	    return(as_tibble(pairs))
	  }

	  pairs %>%
	    as_tibble() %>%
	    arrange(instance1) %>%
	    assign_clusters()
	}


	# Assigns near-match duplicate pairs to clusters, for easier cleaning.
	# Helper function called by get_fuzzy_dupes.
	#
	# dat: a data frame with columns instance1 and instance2, one pair per row.
	#
	# Value: dat with an added `cluster` column. A cluster is labelled with the
	# instance1 value of the row that started it. Rows are scanned top to bottom:
	# a row that shares a value with any earlier row joins that row's cluster,
	# otherwise it starts a new one. When a row links two clusters that were
	# separate so far, they are merged under the label of the earliest match.
	# Inputs with zero rows or one row are handled.
	assign_clusters <- function(dat) {
	  first <- dat[["instance1"]]
	  second <- dat[["instance2"]]

	  # Every row starts as its own cluster; matched rows are overwritten below.
	  cluster <- first

	  # seq_len(n)[-1] is empty for n < 2, unlike 2:n which would count down.
	  for (i in seq_len(nrow(dat))[-1]) {
	    earlier <- seq_len(i - 1L)
	    pair <- c(first[i], second[i])
	    hits <- which(first[earlier] %in% pair | second[earlier] %in% pair)

	    if (length(hits) > 0L) {
	      labels <- unique(cluster[hits])
	      cluster[i] <- labels[1]
	      if (length(labels) > 1L) {
	        # This pair bridges several clusters: fold them into one.
	        merged <- earlier[cluster[earlier] %in% labels]
	        cluster[merged] <- labels[1]
	      }
	    }
	  }

	  dat$cluster <- cluster
	  dat
	}

	# Build a one-column data frame (the mtcars row names) to experiment with
	dat <- transmute(mtcars, cars = row.names(mtcars))

	# Example calls at two distance thresholds
	get_fuzzy_dupes(dat, max_dist = 2)
	get_fuzzy_dupes(dat, max_dist = 1)