Created
April 3, 2012 05:56
-
-
Save enjoylife/2289625 to your computer and use it in GitHub Desktop.
Python implementation of SCAN: A Structural Clustering Algorithm for Networks
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
""" | |
SCAN: A Structural Clustering Algorithm for Networks | |
As described in http://ualr.edu/nxyuruk/publications/kdd07.pdf | |
""" | |
from collections import deque | |
import numpy as np | |
from scipy.sparse import csr_matrix | |
def struct_similarity(vcols, wcols): | |
""" Compute the similartiy normalized on geometric mean of vertices""" | |
# count the similar rows for unioning edges | |
count = [index for index in wcols if (index in vcols)] | |
# geomean | |
#need to account for vertex itself, add 2(1 for each vertex) | |
ans = (len(count) +2) / (((vcols.size+1)*(wcols.size+1)) ** .5) | |
return ans | |
def neighborhood(G, vertex_v, eps): | |
""" Returns the neighbors, as well as all the connected vertices """ | |
N = deque() | |
vcols = vertex_v.tocoo().col | |
#check the similarity for each connected vertex | |
for index in vcols: | |
wcols = G[index,:].tocoo().col | |
if struct_similarity(vcols, wcols)> eps: | |
N.append(index) | |
return N, vcols | |
def scan(G, eps =0.7, mu=2): | |
""" | |
Vertex Structure = sum of row + itself(1) | |
Structural Similarity is the geometric mean of the 2Vertex size of structure | |
""" | |
c = 0 | |
v = G.shape[0] | |
# All vertices are labeled as unclassified(-1) | |
vertex_labels = -np.ones(v) | |
# start with a neg core(every new core we incr by 1) | |
cluster_id = -1 | |
for vertex in xrange(v): | |
N ,vcols = neighborhood(G, G[vertex,:],eps) | |
# must include vertex itself | |
N.appendleft(vertex) | |
if len(N) >= mu: | |
#print "we have a cluster at: %d ,with length %d " % (vertex, len(N)) | |
# gen a new cluster id (0 indexed) | |
cluster_id +=1 | |
while N: | |
y = N.pop() | |
R , ycols = neighborhood(G, G[y,:], eps) | |
# include itself | |
R.appendleft(y) | |
# (struct reachable) check core and if y is connected to vertex | |
if len(R) >= mu and y in vcols: | |
#print "we have a structure Reachable at: %d ,with length %d " % (y, len(R)) | |
while R: | |
r = R.pop() | |
label = vertex_labels[r] | |
# if unclassified or non-member | |
if (label == -1) or (label==0): | |
vertex_labels[r] = cluster_id | |
# unclassified ?? | |
if label == -1: | |
N.appendleft(r) | |
else: | |
vertex_labels[vertex] = 0 | |
#classify non-members | |
for index in np.where(vertex_labels ==0)[0]: | |
ncols= G[index,:].tocoo().col | |
if len(ncols) >=2: | |
## mark as a hub | |
vertex_labels[index] = -2 | |
continue | |
else: | |
## mark as outlier | |
vertex_labels[index] = -3 | |
continue | |
return vertex_labels | |
if __name__=='__main__': | |
# Based on Example from paper | |
rows = [0,0,0,0,1,1,1,2,2,2,3,3,3,3,4,4,4,4, | |
5,5,5,5,5,6,6,6,6,6,6,7,7,7,7,8,8,8, | |
9,9,9,9,10,10,10,10,11,11,11,11, | |
12,12,12,12,12,13] | |
cols = [1,4,5,6,0,5,2,1,5,3,2,4,5,6,0,3,5,6, | |
0,1,2,3,4,4,0,3,7,11,10,6,11,12,8,7, | |
12,9,8,12,10,13,9,12,11,6,7,12,10,6, | |
7,8,9,10,11,9] | |
data = np.ones(len(rows)) | |
G =csr_matrix((data,(rows,cols)),shape=(14,14)) | |
#print G.todense() | |
#print neighborhood(G, G[0,:],.4 ) | |
print scan(G,.7, 2) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
what is the licence of this? would you be willing to release it on a bsd licence so I can use it in @scikit-multilearn?