Last active
August 29, 2015 14:28
-
-
Save trcook/d3d32dee3619ee40e839 to your computer and use it in GitHub Desktop.
This is a quick way to use Levenshtein distance to match two sets of entries in a 'fuzzy' way. The key is to match each entry in one set with its nearest neighbor in the other set, where distance is measured as 'edit distance'.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#' Fuzzy-match entries of `m` to their nearest neighbours in `sp`
#'
#' For each value in column `m_match` of `m`, the Levenshtein (edit) distance
#' to every value in column `sp_match` of `sp` is computed with
#' `RecordLinkage::levenshteinDist()`, and the row(s) of `sp` at the smallest
#' distance are returned.
#'
#' @param m Data frame with the entries to look up.
#' @param sp Data frame with the candidate entries to match against.
#' @param sp_match Name of the column of `sp` holding the candidate strings.
#' @param m_match Name of the column of `m` holding the strings to look up.
#' @param robust If `TRUE`, return every candidate tied at the minimum
#'   distance; otherwise return only the first one.
#' @return A data frame assembled by `plyr::ldply()` with, per matched row:
#'   `m_match` (the looked-up value), the columns of `sp`, `fuzzy` (1 when
#'   more than one candidate tied for the minimum distance, else 0) and
#'   `dist` (the minimum distance).
my_match <- function(m, sp, sp_match = "name_spelling", m_match = "team",
                     robust = TRUE) {
  # Loop-invariant values: the candidate strings and the tie-handling mode.
  candidates <- sp[, c(sp_match)]
  keep_all <- robust == TRUE

  # Finds the closest row(s) of `sp` for a single looked-up value `x`.
  match_one <- function(x) {
    distances <- RecordLinkage::levenshteinDist(x, candidates)
    # na.rm guards against candidates for which no distance is available.
    best <- min(distances, na.rm = TRUE)
    # drop = FALSE keeps a data frame even when `sp` has a single column;
    # without it the subset collapses to a bare vector and the row count
    # below cannot be taken.
    hits <- sp[which(distances == best), , drop = FALSE]
    # The flag records whether there was a tie, even if only one row is kept.
    tie_flag <- if (nrow(hits) > 1) 1 else 0
    if (!keep_all) {
      hits <- hits[1, , drop = FALSE]
    }
    out <- data.frame(cbind(x, hits, tie_flag, best))
    n_col <- ncol(out)
    names(out)[c(1, n_col - 1, n_col)] <- c("m_match", "fuzzy", "dist")
    out
  }

  plyr::ldply(m[, m_match], .fun = match_one)
}
# Example usage:
# `m` holds the names to look up; `sp` holds candidate spellings plus an id.
m <- data.frame(team = c("apple", "Peach", "orange", "watermelon"))
sp <- data.frame(
  spelling = c("ap", "peac", "orang", "range", "water melon", "Water melon"),
  id = 1:6
)
# With robust = TRUE every tied match for "orange" is returned.
my_match(m, sp, sp_match = "spelling", m_match = "team", robust = TRUE)
# With robust = FALSE only the first match, "orang", is kept for "orange".
my_match(m, sp, sp_match = "spelling", m_match = "team", robust = FALSE)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment