# trcook · August 29, 2015 14:28
# diff --git a/distance match.r b/distance match.r


 my_match<-function(m,sp,sp_match='name_spelling',m_match='team',robust=T){
 require(RecordLinkage)
 require(plyr)
 matches<-ldply(m[,m_match],.fun=function(x){
  distances<-levenshteinDist(x,sp[,c(sp_match)])
  most_likely<-sp[which(distances==min(distances)),]
  if(robust==T){
    if(length(most_likely[,1])>1){
      most_likely<-cbind(x,most_likely[],1,min(distances))
    }else{
      most_likely<-cbind(x,most_likely[],0,min(distances))
    }
  }else{
      if(length(most_likely[,1])>1){
        most_likely<-cbind(x,most_likely[1,],1,min(distances))
      }else{
        most_likely<-cbind(x,most_likely[1,],0,min(distances))
      }
    }
  most_likely<-data.frame(most_likely)
  names(most_likely)[1]<-"m_match"
  names(most_likely)[length(most_likely[1,])-1]<-"fuzzy"
  names(most_likely)[length(most_likely[1,])]<-"dist"
  

  return(most_likely[])
 })
 return(matches)
 }

 # Example: fuzzy-match fruit names against a table of candidate spellings.

 m <- data.frame(team = c("apple", "Peach", "orange", "watermelon"))
 sp <- data.frame(
   spelling = c("ap", "peac", "orang", "range", "water melon", "Water melon"),
   id = 1:6
 )
 # robust = TRUE keeps every tied best match: "orange" gets "orang" and "range".
 my_match(m, sp, sp_match = "spelling", m_match = "team", robust = TRUE)
 # robust = FALSE keeps only the first tied match, "orang", for "orange".
 my_match(m, sp, sp_match = "spelling", m_match = "team", robust = FALSE)


	#' Fuzzy-match each value of one column against a table of candidate spellings
	#'
	#' For every value in `m[, m_match]` the Levenshtein distance (computed with
	#' `RecordLinkage::levenshteinDist`) to every value in `sp[, sp_match]` is
	#' taken, and the row(s) of `sp` with the smallest distance are returned.
	#'
	#' @param m Data frame (or matrix with column names) holding the values to
	#'   look up.
	#' @param sp Data frame (or matrix with column names) holding the candidate
	#'   spellings plus any other columns to carry along.
	#' @param sp_match Name of the column in `sp` containing candidate spellings.
	#' @param m_match Name of the column in `m` containing the values to look up.
	#' @param robust If `TRUE`, keep every candidate tied for the smallest
	#'   distance; otherwise keep only the first tied candidate.
	#' @return A data frame with one row per retained candidate: `m_match` (the
	#'   looked-up value), all columns of `sp`, `fuzzy` (1 if several candidates
	#'   tied for the smallest distance, else 0) and `dist` (that distance).
	#'   Values for which no distance could be computed are skipped with a
	#'   warning.
	my_match <- function(m, sp, sp_match = "name_spelling", m_match = "team",
	                     robust = TRUE) {
	  # library() stops with an error when a package is missing, whereas
	  # require() only returns FALSE and lets the code fail later on.
	  library(RecordLinkage)
	  library(plyr)

	  stopifnot(
	    sp_match %in% colnames(sp),
	    m_match %in% colnames(m)
	  )

	  # `[[` always yields a plain vector for data frames (including tibbles);
	  # matrices still need `[, col]`.
	  pull_col <- function(tbl, col) {
	    if (is.data.frame(tbl)) tbl[[col]] else tbl[, col]
	  }

	  keep_ties <- isTRUE(as.logical(robust))
	  candidates <- as.character(pull_col(sp, sp_match))

	  ldply(pull_col(m, m_match), .fun = function(x) {
	    distances <- levenshteinDist(as.character(x), candidates)

	    # Nothing to compare against (empty `sp`, or only NA distances).
	    if (all(is.na(distances))) {
	      warning("no candidate spelling could be compared with '", x, "'",
	              call. = FALSE)
	      return(NULL)
	    }

	    # Ignore NA distances so one missing spelling cannot poison the minimum.
	    best <- min(distances, na.rm = TRUE)
	    # drop = FALSE keeps a one-column `sp` as a table instead of a vector.
	    hits <- sp[which(distances == best), , drop = FALSE]

	    # The tie flag is computed before any truncation to the first hit.
	    fuzzy <- as.numeric(nrow(hits) > 1L)
	    if (!keep_ties) {
	      hits <- hits[1L, , drop = FALSE]
	    }

	    out <- data.frame(x, hits, fuzzy, best)
	    # Rename by position so columns of `sp` keep whatever names they had.
	    n_col <- ncol(out)
	    names(out)[c(1L, n_col - 1L, n_col)] <- c("m_match", "fuzzy", "dist")
	    out
	  })
	}

	# Example: fuzzy-match fruit names against a table of candidate spellings.

	m <- data.frame(team = c("apple", "Peach", "orange", "watermelon"))
	sp <- data.frame(
	  spelling = c("ap", "peac", "orang", "range", "water melon", "Water melon"),
	  id = 1:6
	)
	# robust = TRUE keeps every tied best match: "orange" gets "orang" and "range".
	my_match(m, sp, sp_match = "spelling", m_match = "team", robust = TRUE)
	# robust = FALSE keeps only the first tied match, "orang", for "orange".
	my_match(m, sp, sp_match = "spelling", m_match = "team", robust = FALSE)