Created
May 14, 2012 18:32
-
-
Save foota/2695542 to your computer and use it in GitHub Desktop.
Hierarchical clustering
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
clusters_tbb.cpp | |
// hierarchical clustering using the TBB library | |
// by nox, 2009.07.02 | |
// | |
// Linux - Intel C++ | |
// icpc clusters_tbb.cpp -O3 -ltbb -o clusters_tbb | |
// | |
// Linux - GCC | |
// g++ clusters_tbb.cpp -O3 -ltbb -o clusters_tbb | |
// | |
// Windows - Microsoft Visual Studio 2008 | |
// cl clusters_tbb.cpp /EHsc /Ox /MD | |
#include <iostream> | |
#include <fstream> | |
#include <sstream> | |
#include <vector> | |
#include <algorithm> | |
#include <utility> | |
#include <cmath> | |
#include "tbb/task_scheduler_init.h" | |
#include "tbb/blocked_range.h" | |
#include "tbb/parallel_for.h" | |
#include "tbb/parallel_reduce.h" | |
#include "tbb/parallel_sort.h" | |
using namespace std; | |
using namespace tbb; | |
double calc_distance(const vector<double>& vec1, const vector<double>& vec2) | |
{ | |
vector<double>::const_iterator p, q; | |
double s = 0.0; | |
for (p = vec1.begin(), q = vec2.begin(); p != vec1.end(); ++p, ++q) | |
s += (*p - *q) * (*p - *q); | |
return sqrt(s); | |
} | |
static vector<int> table_i; | |
struct bicluster { | |
bicluster* left; | |
bicluster* right; | |
int id; | |
vector<int> vec_ids; | |
double distance; | |
bicluster(bicluster* left_, bicluster* right_, int id_, vector<int> vec_ids_, double distance_) : | |
left(left_), right(right_), id(id_), vec_ids(vec_ids_), distance(distance_) { } | |
}; | |
typedef vector<bicluster*>::iterator vec_t; | |
typedef pair<vec_t, vec_t> pair_t; | |
typedef pair<int, int> distance_pair_t; | |
typedef vector<double> distances_t; | |
class CompleteLink { | |
public: | |
double dist; | |
const vector<int>& clust; | |
const distances_t& distances; | |
CompleteLink(double d_, const vector<int>& clust_, const distances_t& distances_) | |
: dist(d_), clust(clust_), distances(distances_) { } | |
CompleteLink(const CompleteLink& cl, split) : dist(cl.dist), clust(cl.clust), distances(cl.distances) { } | |
void operator()(const blocked_range<vector<int>::const_iterator>& r) { | |
for (vector<int>::const_iterator p = r.begin(); p != r.end(); ++p) { | |
for (vector<int>::const_iterator q = clust.begin(); q != clust.end(); ++q) { | |
int i = min(*p, *q); | |
int j = max(*p, *q); | |
dist = max(distances[table_i[i] + j], dist); | |
} | |
} | |
} | |
void join(const CompleteLink& cl) { | |
dist = max(dist, cl.dist); | |
} | |
}; | |
double calc_cluster_distance(const vector<int>& clust_i, const vector<int>& clust_j, const distances_t& distances) | |
{ | |
CompleteLink cl(0.0, clust_j, distances); | |
parallel_reduce(blocked_range<vector<int>::const_iterator>(clust_i.begin(), clust_i.end(), 100), cl); | |
return cl.dist; | |
} | |
class ReduceHCluster { | |
public: | |
const vector<vector<double> >& data; | |
const vector<bicluster*>& clusters; | |
distances_t& distances; | |
pair_t lowest_pair; | |
double closest; | |
ReduceHCluster(const vector<vector<double> >& data_, const vector<bicluster*>& clusters_, distances_t& distances_, pair_t lowest_pair_, double closest_) | |
: data(data_), clusters(clusters_), distances(distances_), lowest_pair(lowest_pair_), closest(closest_) { } | |
ReduceHCluster(const ReduceHCluster& hc, split) | |
: data(hc.data), clusters(hc.clusters), distances(hc.distances), lowest_pair(hc.lowest_pair), closest(hc.closest) { } | |
void operator()(const blocked_range<vec_t>& r) { | |
for (vec_t p = r.begin(); p != r.end(); ++p) { | |
for (vec_t q = p + 1; q != clusters.end(); ++q) { | |
int i = min((*p)->id, (*q)->id); | |
int j = max((*p)->id, (*q)->id); | |
if (distances[table_i[i] + j] < 0) | |
distances[table_i[i] + j] = calc_cluster_distance((*p)->vec_ids, (*q)->vec_ids, distances); | |
double d = distances[table_i[i] + j]; | |
if (d < closest) { | |
closest = d; | |
lowest_pair = make_pair(p, q); | |
} | |
} | |
} | |
} | |
void join(const ReduceHCluster& hc) { | |
if (closest > hc.closest) { | |
closest = hc.closest; | |
lowest_pair = hc.lowest_pair; | |
} | |
} | |
}; | |
class HClusterDistances { | |
private: | |
const vector<vector<double> >& data; | |
int n; | |
distances_t& distances; | |
public: | |
HClusterDistances(const vector<vector<double> >& data_, int n_, distances_t& distances_) | |
: data(data_), n(n_), distances(distances_) { } | |
void operator()(const blocked_range<int>& r) const { | |
for (int i = r.begin(); i != r.end(); i++) | |
for (int j = i + 1; j < n; j++) | |
distances[table_i[i] + j] = calc_distance(data[i], data[j]); | |
} | |
}; | |
inline int index_i(int n, int max_n) | |
{ | |
int s = 0; | |
for (int i = 0; i < n; i++) s += max_n - i; | |
return s - n - 1; | |
} | |
void hcluster(const vector<vector<double> >& data, vector<bicluster*>& clusters) | |
{ | |
int n = clusters.size() * 2; | |
for (int i = 0; i < n; i++) table_i.push_back(index_i(i, n - 1)); | |
// distance between i and j: distances[table_i(i) + j] | |
distances_t distances(n * (n - 1) / 2, -1.0); | |
int current_clust_id = clusters.size(); | |
parallel_for(blocked_range<int>(0, clusters.size() - 1, 100), HClusterDistances(data, clusters.size(), distances)); | |
while (clusters.size() > 1) { | |
cerr << clusters.size() << endl; | |
pair_t lowest_pair = make_pair(clusters.begin(), clusters.begin() + 1); | |
double closest = calc_cluster_distance((*lowest_pair.first)->vec_ids, (*lowest_pair.second)->vec_ids, distances); | |
ReduceHCluster hc(data, clusters, distances, lowest_pair, closest); | |
parallel_reduce(blocked_range<vec_t>(clusters.begin(), clusters.end(), 50), hc); | |
closest = hc.closest; | |
lowest_pair = hc.lowest_pair; | |
vector<int> vec_ids((*lowest_pair.first)->vec_ids); | |
vec_ids.insert(vec_ids.end(), (*lowest_pair.second)->vec_ids.begin(), (*lowest_pair.second)->vec_ids.end()); | |
vector<int>().swap((*lowest_pair.second)->vec_ids); | |
vector<int>().swap((*lowest_pair.first)->vec_ids); | |
bicluster* new_cluster = new bicluster(*lowest_pair.first, *lowest_pair.second, current_clust_id, vec_ids, closest); | |
current_clust_id++; | |
clusters.erase(lowest_pair.second); | |
clusters.erase(lowest_pair.first); | |
clusters.push_back(new_cluster); | |
} | |
} | |
void read_data(const string fname, vector<vector<double> >& data) | |
{ | |
fstream fs(fname.c_str(), ios_base::in); | |
string line; | |
stringstream ss; | |
vector<double> v; | |
double value; | |
while (getline(fs, line)) { | |
for (ss.str(line), v.clear(); ss >> value; v.push_back(value)); | |
ss.clear(); | |
data.push_back(v); | |
} | |
} | |
void store_clusters(const bicluster* cluster, vector<pair<double, distance_pair_t> >& cluster_data, int max_n) | |
{ | |
static int n = 0; | |
if (cluster->left && cluster->right) | |
cluster_data.push_back(make_pair(cluster->distance, make_pair( | |
cluster->left->id < max_n ? cluster->left->id : -(cluster->left->id - max_n + 1), | |
cluster->right->id < max_n ? cluster->right->id : -(cluster->right->id - max_n + 1)))); | |
if (cluster->left) store_clusters(cluster->left, cluster_data, max_n); | |
if (cluster->right) store_clusters(cluster->right, cluster_data, max_n); | |
} | |
void write_clusters(const string fname, const vector<pair<double, distance_pair_t> >& cluster_data) | |
{ | |
fstream fs(fname.c_str(), ios_base::out | ios_base::trunc); | |
for (vector<pair<double, distance_pair_t> >::const_iterator p = cluster_data.begin(); p != cluster_data.end(); ++p) { | |
fs << p->first << " " << p->second.first << " " << p->second.second << endl; | |
} | |
} | |
int main(int argc, char** argv) | |
{ | |
if (argc < 3) { | |
cerr << "Hierarchical clustering using the TBB library." << endl; | |
cerr << endl; | |
cerr << " Usage: " << argv[0] << " input_data_file output_cluster_file" << endl; | |
exit(1); | |
} | |
task_scheduler_init init; | |
vector<vector<double> > data; | |
vector<bicluster*> clusters; | |
read_data(argv[1], data); | |
for (int i = 0; i < data.size(); i++) | |
clusters.push_back(new bicluster(NULL, NULL, i, vector<int>(1, i), 0.0)); | |
hcluster(data, clusters); | |
vector<pair<double, distance_pair_t> > cluster_data; | |
store_clusters(clusters[0], cluster_data, data.size()); | |
parallel_sort(cluster_data.begin(), cluster_data.end()); | |
write_clusters(argv[2], cluster_data); | |
return 0; | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
""" | |
Draw dendrogram for hierarchical clustering | |
by nox, 2009.07.02 | |
""" | |
import sys, os | |
from PIL import Image, ImageDraw | |
HEAD_LENGTH = 50 | |
TAIL_LENGTH = 150 | |
WIDTH = 1200 | |
HEIGHT = 20 | |
class bicluster: | |
def __init__(self, left=None, right=None, distance=0.0, id=None): | |
self.left = left | |
self.right = right | |
self.id = id | |
self.distance = distance | |
def find_cluster(clust, id): | |
for i, c in enumerate(clust): | |
if c.id == id: return i | |
return None | |
def read_clusters(fname, clust, labels): | |
has_labels = False | |
if labels: has_labels = True | |
lines = file(fname).readlines() | |
num = len(lines) + 1 | |
for i in range(num): | |
clust.append(bicluster(id=i)) | |
if not has_labels: labels.append(i) | |
for i, l in enumerate(lines): | |
i = -(i + 1) | |
data = l.strip().split() | |
left_id = find_cluster(clust, int(data[1])) | |
right_id = find_cluster(clust, int(data[2])) | |
clust.append(bicluster(id=i, | |
left=clust[left_id], | |
right=clust[right_id], | |
distance=float(data[0]))) | |
del clust[right_id] | |
del clust[left_id] | |
def get_height(clust): | |
if clust.left == None and clust.right == None: return 1 | |
return get_height(clust.left) + get_height(clust.right) | |
def get_depth(clust): | |
if clust.left == None and clust.right == None: return 0 | |
return max(get_depth(clust.left), get_depth(clust.right)) + clust.distance | |
def draw_dendrogram(clust, labels, image_file): | |
h = get_height(clust) * HEIGHT | |
depth = clust.distance | |
scaling = float(WIDTH - (HEAD_LENGTH + TAIL_LENGTH)) / depth | |
img = Image.new("RGB", (WIDTH, h), (255, 255, 255)) | |
draw = ImageDraw.Draw(img) | |
draw.line((0, h / 2, HEAD_LENGTH, h / 2), fill=(255, 0, 0)) | |
draw_node(draw, clust, HEAD_LENGTH, (h / 2), scaling, labels) | |
img.save(image_file) | |
def draw_node(draw, clust, x, y, scaling, labels): | |
if clust.id < 0: | |
h1 = get_height(clust.left) * HEIGHT | |
h2 = get_height(clust.right) * HEIGHT | |
top = y - (h1 + h2) / 2 | |
bottom = y + (h1 + h2) / 2 | |
ll = clust.distance * scaling | |
lll = ll - clust.left.distance * scaling | |
llr = ll - clust.right.distance * scaling | |
draw.line((x, top + h1 / 2, x, bottom - h2 / 2), fill=(255, 0, 0)) | |
draw.line((x, top + h1 / 2, x + lll, top + h1 / 2), fill=(255, 0, 0)) | |
draw.line((x, bottom - h2 / 2, x + llr, bottom - h2 / 2), fill=(255, 0, 0)) | |
draw_node(draw, clust.left, x + lll, top + h1 / 2, scaling, labels) | |
draw_node(draw, clust.right, x + llr, bottom - h2 / 2, scaling, labels) | |
else: | |
draw.line((x, y, WIDTH - TAIL_LENGTH, y), fill=(255, 0, 0)) | |
draw.text((WIDTH - TAIL_LENGTH + 5, y - 7), str(labels[clust.id]), (0, 0, 0)) | |
def main(args): | |
if len(args) < 2: | |
print >>sys.stderr, "Draw dendrogram for hierarchical clustering." | |
print >>sys.stderr | |
print >>sys.stderr, " Usage: %s cluster_data_file [image_file=clusters.jpg label_data_file=[]]" % os.path.basename(args[0]) | |
sys.exit(1) | |
labels = [] | |
if len(args) > 2: image_file = args[2] | |
else: image_file = "clusters.jpg" | |
if len(args) > 3: | |
for l in file(args[3]): | |
labels.append(l.strip()) | |
clust = [] | |
read_clusters(args[1], clust, labels) | |
draw_dendrogram(clust[0], labels, image_file) | |
if __name__ == "__main__": main(sys.argv) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
""" | |
Grouping clustering data | |
by nox, 2009.07.02 | |
""" | |
import sys, os, math, re | |
def expand_group(clust, n): | |
group = [] | |
for x in clust[n][1:]: | |
if x >= 0: group.append(x) | |
else: group.extend(expand_group(clust, -(x + 1))) | |
return group | |
def arrange_cluster(clust, threshold): | |
tt = [[i, data[1:]] for i, data in enumerate(clust) if data[0] > threshold] | |
t = [] | |
for d in tt: | |
for x in d[1]: | |
if -x < tt[0][0] + 1: t.append(x) | |
groups = [] | |
for x in t: | |
g = [] | |
if x >= 0: g = [x] | |
else: g = expand_group(clust, -(x + 1)) | |
groups.append(g) | |
if not groups: groups = [range(1, len(clust) + 2)] | |
return groups | |
def read_clust(fname, clust_data): | |
for l in file(fname): | |
data = l.strip().split() | |
clust_data.append((float(data[0]), int(data[1]), int(data[2]))) | |
def output_group(fname, groups): | |
f = file(fname, "w") | |
count = 1 | |
for c in groups: | |
f.write("%03d:" % count) | |
for d in c: f.write(" %d" % d) | |
f.write("\n") | |
count += 1 | |
f.close() | |
def main(args): | |
if len(args) < 4: | |
print >>sys.stderr, "Grouping clustering data." | |
print >>sys.stderr | |
print >>sys.stderr, " Usage: %s cluster_data_file output_group_file threshold" % os.path.basename(args[0]) | |
sys.exit(1) | |
clust_file = args[1] | |
group_file = args[2] | |
threshold = float(args[3]) | |
print "Input cluster file: %s" % clust_file | |
print "Output group file : %s" % group_file | |
print "Threshold : %f" % threshold | |
clust_data = [] | |
read_clust(clust_file, clust_data) | |
groups = arrange_cluster(clust_data, threshold) | |
print "Number of elements: %d" % sum([len(x) for x in groups]) | |
print "Number of clusters: %d" % len(groups) | |
output_group(group_file, groups) | |
if __name__ == "__main__": main(sys.argv) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
http://handasse.blogspot.com/2009/07/c-cpu.html
http://handasse.blogspot.com/2009/07/python.html