#Fuzzy version of Jaccard and Rand Indices #based on Suleman: "Assessing a Fuzzy Extension of Rand Index and Related Measures" L<-4 #number of clusters N<-50 #number of objects X<-gtools::rdirichlet(N,rep(.01,L)) #NxL soft random clustering y<-apply(X,1,which.max) #hard cluster version of X table(y) Y<-model.matrix(~factor(y)-1) #hard cluster version of X Z<-matrix(1/L,nrow=N,ncol=L) #perfect uncertainty soft clustering W<-t(rmultinom(N,1,rep(1/L,L))) #a different random hard clustering w<-apply(W,1,which.max) cluster_similarity<-function(P,Pstar,index=c("jaccard","rand")){ #P,Pstar are fuzzy clusterings with objects in rows, clusters in cols (matrices) #alternatively, P or Pstar can be vectors indicating hard clustering assignments #if(is.vector(P) && is.vector(Pstar)){ #both are hard clusterings # return(clusteval::cluster_similarity(P,Pstar,similarity=index)) #} index<-match.arg(index) if(is.vector(P)){ P<-model.matrix(~factor(P)-1) } if(is.vector(Pstar)){ Pstar<-model.matrix(~factor(Pstar)-1) } u<-1-dist(Pstar,method="manhattan")/2 v<-1-dist(P,method="manhattan")/2 a<-sum((1-abs(u-v))*u*v) b<-sum(pmax(u-v,0)) c<-sum(pmax(v-u,0)) if(index=="jaccard"){ if(a==0){ return(0) } #edge case where all clusters are singleton return(a/(a+b+c)) } else if(index=="rand"){ d<-sum((1-abs(u-v))*(1-u*v)) return((a+d)/(a+b+c+d)) } } #check agreement with existing packages for hard clusterings clusteval::cluster_similarity(w,y,"jaccard") cluster_similarity(W,Y,"jaccard") clusteval::cluster_similarity(w,y,"rand") cluster_similarity(W,Y,"rand") #check that similarity=1 when comparing something to itself cluster_similarity(X,X)==1 cluster_similarity(X,X,"rand")==1 #check symmetry cluster_similarity(X,Y)==cluster_similarity(Y,X) #jaccard=0 when all singleton clusters cluster_similarity(c(1,2,3),c(1,2,3))==0 #check high value for hard clustering and very similar soft cl cluster_similarity(X,Y) cluster_similarity(X,Y,"rand") #check low value for two random soft clusterings cluster_similarity(X,Z) cluster_similarity(X,Z,"rand")