|
#!/usr/bin/python |
|
|
|
# Load required modules |
|
import pandas as pd |
|
import scipy.spatial |
|
import scipy.cluster |
|
import numpy as np |
|
import json |
|
import matplotlib.pyplot as plt |
|
|
|
# Example data: gene expression.
# Each gene has one measurement per experiment (exp1, exp2).
geneExp = {
    'genes': ['a', 'b', 'c', 'd', 'e', 'f'],
    'exp1': [-2.2, 5.6, 0.9, -0.23, -3, 0.1],
    'exp2': [5.4, -0.5, 2.33, 3.1, 4.1, -3.2],
}
df = pd.DataFrame(geneExp)

# Determine pairwise distances between genes (pdist's default metric is
# Euclidean); only the two expression columns take part in the distance.
dataMatrix = np.array(df[['exp1', 'exp2']])
distMat = scipy.spatial.distance.pdist(dataMatrix)

# Cluster hierarchically using scipy (single linkage), then convert the
# linkage matrix into a tree; rd=False makes to_tree return only the root.
clusters = scipy.cluster.hierarchy.linkage(distMat, method='single')
T = scipy.cluster.hierarchy.to_tree(clusters, rd=False)

# Create dictionary for labeling nodes by their IDs: position i in the
# gene list maps to that gene's name.
labels = list(df.genes)
id2name = dict(enumerate(labels))
|
|
|
# Render the dendrogram with matplotlib (leaves labelled by gene name,
# drawn with orientation 'right') and save it as scipy-dendrogram.png.
scipy.cluster.hierarchy.dendrogram(
    clusters,
    orientation='right',
    labels=labels,
)
plt.savefig("scipy-dendrogram.png")
|
|
|
# Create a nested dictionary from the ClusterNode's returned by SciPy
def add_node(node, parent):
    """Copy the subtree rooted at ``node`` into ``parent["children"]``.

    Every visited node becomes a dict ``{"node_id": <node.id>,
    "children": [...]}`` appended to the ``"children"`` list of the dict
    created for its parent node; the dict for ``node`` itself is appended
    to ``parent["children"]``. Children are emitted left first, then right.

    Args:
        node: Root of the subtree to copy. Must expose ``id``, ``left`` and
            ``right`` attributes, with ``left``/``right`` set to ``None``
            when the child is absent.
        parent: Dict with a ``"children"`` list; mutated in place.

    Returns:
        None.
    """
    # An explicit stack replaces recursion so that very deep trees do not
    # hit Python's recursion limit ("maximum recursion depth exceeded").
    pending = [(node, parent)]
    while pending:
        current, target = pending.pop()

        # Create the new node and append it to its parent's children
        new_node = dict(node_id=current.id, children=[])
        target["children"].append(new_node)

        # Push right before left: the stack is LIFO, so the left child is
        # popped (and therefore appended) first, matching left-to-right order.
        if current.right is not None:
            pending.append((current.right, new_node))
        if current.left is not None:
            pending.append((current.left, new_node))
|
|
|
# Start the d3 structure as an empty container named "Root1", then copy
# the whole SciPy tree T underneath it.
d3Dendro = {"children": [], "name": "Root1"}
add_node(T, d3Dendro)
|
|
|
# Label each node with the names of each leaf in its subtree
def label_tree(n, names=None):
    """Name every node in the subtree of ``n`` after the leaves beneath it.

    Each node dict is mutated in place: its ``"node_id"`` key is deleted
    (it is no longer needed and makes for cleaner JSON) and a ``"name"``
    key is set to the "-"-separated, sorted, stringified names of all
    leaves in that node's subtree.

    Args:
        n: Node dict with a ``"node_id"`` key and a ``"children"`` list of
            node dicts of the same shape.
        names: Optional mapping from leaf ``node_id`` to leaf name. When
            omitted, the module-level ``id2name`` mapping is used.

    Returns:
        The leaf names under ``n`` in left-to-right order (unsorted).
    """
    if names is None:
        names = id2name

    # Collect nodes parent-first with an explicit stack (no recursion, so
    # deep trees cannot exhaust the interpreter's recursion limit).
    ordered = []
    pending = [n]
    while pending:
        node = pending.pop()
        ordered.append(node)
        pending.extend(node["children"])

    # Walking that list backwards guarantees every child is labelled before
    # its parent. Node dicts are unhashable, so results are keyed by id().
    leaves_by_node = {}
    for node in reversed(ordered):
        children = node["children"]
        if not children:
            # If the node is a leaf, then we have its name
            leaf_names = [names[node["node_id"]]]
        else:
            # If not, flatten all the leaves in the node's subtree
            leaf_names = []
            for child in children:
                leaf_names.extend(leaves_by_node.pop(id(child)))

        del node["node_id"]

        # Labeling convention: "-"-separated leaf names
        node["name"] = "-".join(sorted(map(str, leaf_names)))
        leaves_by_node[id(node)] = leaf_names

    return leaves_by_node[id(n)]
|
|
|
# Label the real tree root (the single child of the "Root1" container).
label_tree(d3Dendro["children"][0])

# Output to JSON; the context manager closes the file (flushing it to
# disk) even if serialization raises.
with open("d3-dendrogram.json", "w") as json_file:
    json.dump(d3Dendro, json_file, sort_keys=True, indent=4)
Thanks for this awesome example.
I tried it myself, but for a very large tree, `add_node` might throw `RuntimeError: maximum recursion depth exceeded`.