Created
August 22, 2018 15:53
-
-
Save willtownes/81c92018b32d16a6bca9f651a00182ae to your computer and use it in GitHub Desktop.
Fuzzy Clustering Jaccard and Rand Indices
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#Fuzzy version of Jaccard and Rand Indices | |
#based on Suleman: "Assessing a Fuzzy Extension of Rand Index and Related Measures" | |
L<-4 #number of clusters | |
N<-50 #number of objects | |
X<-gtools::rdirichlet(N,rep(.01,L)) #NxL soft random clustering | |
y<-apply(X,1,which.max) #hard cluster version of X | |
table(y) | |
Y<-model.matrix(~factor(y)-1) #hard cluster version of X | |
Z<-matrix(1/L,nrow=N,ncol=L) #perfect uncertainty soft clustering | |
W<-t(rmultinom(N,1,rep(1/L,L))) #a different random hard clustering | |
w<-apply(W,1,which.max) | |
cluster_similarity<-function(P,Pstar,index=c("jaccard","rand")){ | |
#P,Pstar are fuzzy clusterings with objects in rows, clusters in cols (matrices) | |
#alternatively, P or Pstar can be vectors indicating hard clustering assignments | |
#if(is.vector(P) && is.vector(Pstar)){ | |
#both are hard clusterings | |
# return(clusteval::cluster_similarity(P,Pstar,similarity=index)) | |
#} | |
index<-match.arg(index) | |
if(is.vector(P)){ P<-model.matrix(~factor(P)-1) } | |
if(is.vector(Pstar)){ Pstar<-model.matrix(~factor(Pstar)-1) } | |
u<-1-dist(Pstar,method="manhattan")/2 | |
v<-1-dist(P,method="manhattan")/2 | |
a<-sum((1-abs(u-v))*u*v) | |
b<-sum(pmax(u-v,0)) | |
c<-sum(pmax(v-u,0)) | |
if(index=="jaccard"){ | |
if(a==0){ return(0) } #edge case where all clusters are singleton | |
return(a/(a+b+c)) | |
} else if(index=="rand"){ | |
d<-sum((1-abs(u-v))*(1-u*v)) | |
return((a+d)/(a+b+c+d)) | |
} | |
} | |
#check agreement with existing packages for hard clusterings | |
clusteval::cluster_similarity(w,y,"jaccard") | |
cluster_similarity(W,Y,"jaccard") | |
clusteval::cluster_similarity(w,y,"rand") | |
cluster_similarity(W,Y,"rand") | |
#check that similarity=1 when comparing something to itself | |
cluster_similarity(X,X)==1 | |
cluster_similarity(X,X,"rand")==1 | |
#check symmetry | |
cluster_similarity(X,Y)==cluster_similarity(Y,X) | |
#jaccard=0 when all singleton clusters | |
cluster_similarity(c(1,2,3),c(1,2,3))==0 | |
#check high value for hard clustering and very similar soft cl | |
cluster_similarity(X,Y) | |
cluster_similarity(X,Y,"rand") | |
#check low value for two random soft clusterings | |
cluster_similarity(X,Z) | |
cluster_similarity(X,Z,"rand") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment