cjdd3b · November 26, 2020 15:52 · giuliacassara · Nov 26, 2020
diff --git a/graph-cluster.py b/graph-cluster.py
 '''
 graph-cluster.py

 Some notes for doing graph clustering in a couple different ways: simple spectral
 partitioning based on the Fiedler vector, and a density-based clustering using DBSCAN.

 Why might this be useful? I'm using it to identify weakly connected (and therefore
 probably false) graph components in my campaign finance standardization workflow, the
 basic idea of which is here: https://github.com/cjdd3b/fec-standardizer/wiki

 But it is also one way of addressing the problem of community detection in graphs. If you
 want to, say, find subtle cliques among members of Congress based on their voting records,
 this would be one way to acheive that.

 Some useful links:
 http://en.wikipedia.org/wiki/Graph_partition
 http://en.wikipedia.org/wiki/Algebraic_connectivity
 http://snap.stanford.edu/class/cs246-2012/slides/11-graphs.pdf
 '''

 import networkx as nx
 import numpy as np
 import matplotlib.pyplot as plt
 from sklearn.cluster import DBSCAN

 if __name__ == '__main__':
    # Create a simple graph that looks like this:
    # http://en.wikipedia.org/wiki/Algebraic_connectivity#mediaviewer/File:6n-graf.svg
    G = nx.Graph()
    G.add_nodes_from(xrange(1,7))
    G.add_edges_from([(1,5), (1,2), (5,2), (4,5), (4,3), (3,2), (4,6)])

    # Export to graphml for verification
    nx.write_graphml(G, 'clustering-exercise.graphml')

    # Conveniently, networkx has a method for finding the Laplacian
    laplacian = nx.laplacian_matrix(G)

    # Use numpy to compute the Fiedler vector, which corresponds to the
    # second smallest eigenvalue of the Laplacian
    w, v = np.linalg.eig(laplacian.todense())
    algebraic_connectivity = w[1] # Neat measure of how tight the graph is
    fiedler_vector = v[:,1].T

    # NOTE: Apparently nx also does this now
    # fiedler_vector = [nx.fiedler_vector(G)]

    # If we make our basic spectral clustering cut, we split the graph
    # in two at the origin, like this.
    plt.plot(fiedler_vector, xrange(len(fiedler_vector)), 'ro')
    plt.plot([(0, 0), (-1, 1)])
    plt.axis([-1, 1, 0, 0])
    plt.show()

    # That results in tightly connected nodes 1,2,3 and 5 forming one cluster and more
    # sparsely connected nodes 4 and 6 forming the other.

    # But we can also try to cluster the nodes differently. We'll try to do that along
    # the single dimension Fiedler vector provides, except basing the clusters on a
    # density of 0.15 rather than a simple cut at the origin.
    db = DBSCAN(eps=0.15, min_samples=1).fit(fiedler_vector.T)

    # We won't plot this because it's a pain, but it results in four clusters:
    # points 1, 2 and 5 in one cluster, then points 4, 3 and 6 in separate clusters
    for k in set(db.labels_):
        class_members = [index[0] for index in np.argwhere(db.labels_ == k)]
        for index in class_members:
            print 'Cluster: %s, Point %s: %s' % (int(k), index + 1, fiedler_vector.T[index])
	'''
	graph-cluster.py

	Some notes for doing graph clustering in a couple different ways: simple spectral
	partitioning based on the Fiedler vector, and a density-based clustering using DBSCAN.

	Why might this be useful? I'm using it to identify weakly connected (and therefore
	probably false) graph components in my campaign finance standardization workflow, the
	basic idea of which is here: https://github.com/cjdd3b/fec-standardizer/wiki

	But it is also one way of addressing the problem of community detection in graphs. If you
	want to, say, find subtle cliques among members of Congress based on their voting records,
	this would be one way to acheive that.

	Some useful links:
	http://en.wikipedia.org/wiki/Graph_partition
	http://en.wikipedia.org/wiki/Algebraic_connectivity
	http://snap.stanford.edu/class/cs246-2012/slides/11-graphs.pdf
	'''

	import networkx as nx
	import numpy as np
	import matplotlib.pyplot as plt
	from sklearn.cluster import DBSCAN

	if __name__ == '__main__':
	# Create a simple graph that looks like this:
	# http://en.wikipedia.org/wiki/Algebraic_connectivity#mediaviewer/File:6n-graf.svg
	G = nx.Graph()
	G.add_nodes_from(xrange(1,7))
	G.add_edges_from([(1,5), (1,2), (5,2), (4,5), (4,3), (3,2), (4,6)])

	# Export to graphml for verification
	nx.write_graphml(G, 'clustering-exercise.graphml')

	# Conveniently, networkx has a method for finding the Laplacian
	laplacian = nx.laplacian_matrix(G)

	# Use numpy to compute the Fiedler vector, which corresponds to the
	# second smallest eigenvalue of the Laplacian
	w, v = np.linalg.eig(laplacian.todense())
	algebraic_connectivity = w[1] # Neat measure of how tight the graph is
	fiedler_vector = v[:,1].T

	# NOTE: Apparently nx also does this now
	# fiedler_vector = [nx.fiedler_vector(G)]

	# If we make our basic spectral clustering cut, we split the graph
	# in two at the origin, like this.
	plt.plot(fiedler_vector, xrange(len(fiedler_vector)), 'ro')
	plt.plot([(0, 0), (-1, 1)])
	plt.axis([-1, 1, 0, 0])
	plt.show()

	# That results in tightly connected nodes 1,2,3 and 5 forming one cluster and more
	# sparsely connected nodes 4 and 6 forming the other.

	# But we can also try to cluster the nodes differently. We'll try to do that along
	# the single dimension Fiedler vector provides, except basing the clusters on a
	# density of 0.15 rather than a simple cut at the origin.
	db = DBSCAN(eps=0.15, min_samples=1).fit(fiedler_vector.T)

	# We won't plot this because it's a pain, but it results in four clusters:
	# points 1, 2 and 5 in one cluster, then points 4, 3 and 6 in separate clusters
	for k in set(db.labels_):
	class_members = [index[0] for index in np.argwhere(db.labels_ == k)]
	for index in class_members:
	print 'Cluster: %s, Point %s: %s' % (int(k), index + 1, fiedler_vector.T[index])