// Interface for the C Clustering Library
// (clusterlib::kmeans is the library routine; clusterlibrary::cluster is the wrapper)
#include <vector>

void clusterlibrary::cluster(std::vector< std::vector<float> > & data, int k, int iterations, std::vector<int> & clusterid)
{
    int nrows = data.size();
    int ncolumns = data.front().size();

    float ** data_ptr = new float*[nrows];
    for (size_t i = 0; i < data.size(); i++) data_ptr[i] = &data[i][0];

    std::vector< std::vector<int> > mask(nrows, std::vector<int>(ncolumns, 1));
    int ** mask_ptr = new int*[nrows];
    for (size_t i = 0; i < mask.size(); i++) mask_ptr[i] = &mask[i][0];

    std::vector<float> data_weights(data.size(), 1.0);
    float * weights = &data_weights[0];

    char distType = 'e'; // Euclidean distance

    std::vector< std::vector<float> > cdata(k, std::vector<float>(ncolumns, 0));
    float ** cdata_ptr = new float*[k]; // k rows, not ncolumns
    for (size_t i = 0; i < cdata.size(); i++) cdata_ptr[i] = &cdata[i][0];

    std::vector< std::vector<int> > cmask(k, std::vector<int>(ncolumns, 1));
    int ** cmask_ptr = new int*[k]; // k rows, not ncolumns
    for (size_t i = 0; i < cmask.size(); i++) cmask_ptr[i] = &cmask[i][0];

    float err = 0;
    std::vector<int> tclusterid(nrows, 0);
    std::vector<int> counts(k, 0);
    std::vector<int> mapping(k, 0);

    clusterid.clear();
    clusterid.resize(nrows, 0);

    clusterlib::kmeans(k, nrows, ncolumns, data_ptr, mask_ptr, weights, 0, iterations,
                       distType, cdata_ptr, cmask_ptr, &clusterid[0], &err, &tclusterid[0], &counts[0], &mapping[0]);

    // Release the row-pointer arrays (the vectors own the actual data).
    delete [] data_ptr;
    delete [] mask_ptr;
    delete [] cdata_ptr;
    delete [] cmask_ptr;
}
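// ----------------------------------------------------------------------------
// Usage sketch (not part of the original gist): driving the wrapper above.
// It assumes clusterlibrary::cluster is declared in a header and that the
// translation unit links against the C Clustering Library routine exposed
// here as clusterlib::kmeans.
#include <iostream>
#include <vector>

int main()
{
    // Four 2-D points forming two well-separated groups.
    std::vector< std::vector<float> > data = {
        {0.0f, 0.1f}, {0.2f, 0.0f}, {5.0f, 5.1f}, {5.2f, 4.9f}
    };
    std::vector<int> clusterid;
    clusterlibrary::cluster(data, /*k=*/2, /*iterations=*/100, clusterid);
    for (size_t i = 0; i < clusterid.size(); i++)
        std::cout << "point " << i << " -> cluster " << clusterid[i] << "\n";
    return 0;
}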
#pragma once
// Code adapted from https://github.com/propanoid/DBSCAN
#include <vector>
#include <algorithm>
#include <omp.h>
// Any basic vector/matrix library should also work
#include <Eigen/Core>

namespace clustering
{
    template<typename Vector, typename Matrix>
    class DBSCAN
    {
    public:
        typedef Vector FeaturesWeights;
        typedef Matrix ClusterData;
        typedef Matrix DistanceMatrix;
        typedef std::vector<unsigned int> Neighbors;
        typedef std::vector<int> Labels;

    private:
        double m_eps;
        size_t m_min_elems;
        double m_dmin;
        double m_dmax;
        Labels m_labels;

    public:
        // 'eps' is a relative neighbor-search radius in [0,1]: it is rescaled in
        // calc_dist_matrix() so that 0.0 corresponds to the smallest observed
        // pairwise distance and 1.0 to the largest (the entire dataset).
        DBSCAN(double eps, size_t min_elems)
            : m_eps( eps )
            , m_min_elems( min_elems )
            , m_dmin(0.0)
            , m_dmax(0.0)
        {
            reset();
        }

        // Call this to perform clustering, get results by calling 'get_labels()'
        void fit( const ClusterData & C )
        {
            const FeaturesWeights W = std_weights( C.cols() );
            wfit( C, W );
        }

        const Labels & get_labels() const
        {
            return m_labels;
        }

        void reset()
        {
            m_labels.clear();
        }

        void init(double eps, size_t min_elems)
        {
            m_eps = eps;
            m_min_elems = min_elems;
        }

        // Useful for testing: uniform random samples in [-1,1]^features_num
        static ClusterData gen_cluster_data( size_t features_num, size_t elements_num )
        {
            ClusterData cl_d( elements_num, features_num );
            for (size_t i = 0; i < elements_num; ++i)
                for (size_t j = 0; j < features_num; ++j)
                    cl_d(i, j) = -1.0 + 2.0 * rand() / RAND_MAX;
            return cl_d;
        }

        // Unit weight for each of the s features (columns)
        FeaturesWeights std_weights( size_t s )
        {
            FeaturesWeights ws( s );
            for (size_t i = 0; i < s; ++i)
                ws(i) = 1.0;
            return ws;
        }

        void fit_precomputed( const DistanceMatrix & D )
        {
            prepare_labels( D.rows() );
            dbscan( D );
        }

        void wfit( const ClusterData & C, const FeaturesWeights & W )
        {
            prepare_labels( C.rows() );
            const DistanceMatrix D = calc_dist_matrix( C, W );
            dbscan( D );
        }

    private:
        void prepare_labels( size_t s )
        {
            m_labels.resize(s, -1); // -1 marks noise / unassigned
        }

        Neighbors find_neighbors(const DistanceMatrix & D, unsigned int pid)
        {
            Neighbors ne;
            for (unsigned int j = 0; j < (unsigned int)D.rows(); ++j)
            {
                if ( D(pid, j) <= m_eps )
                {
                    ne.push_back(j);
                }
            }
            return ne;
        }
        // Min-max normalizes each feature column, builds the weighted-L1
        // (rows x rows) distance matrix, and rescales m_eps into absolute units.
        const DistanceMatrix calc_dist_matrix( const ClusterData & C, const FeaturesWeights & W )
        {
            ClusterData cl_d = C;

            // Rescale each feature column to [0,1]
            #pragma omp parallel for
            for (int i = 0; i < (int)cl_d.cols(); ++i)
            {
                auto col = cl_d.col(i);
                const auto r = std::minmax_element( col.data(), col.data() + col.size() );
                double data_min = *r.first;
                double data_range = *r.second - *r.first;
                if (data_range == 0.0) { data_range = 1.0; }
                const double scale = 1/data_range;
                const double min = -1.0*data_min*scale;
                col *= scale;
                col += Vector::Constant(col.size(), min);
                cl_d.col(i) = col;
            }

            // rows x rows
            DistanceMatrix d_m( cl_d.rows(), cl_d.rows() );
            Vector d_max( cl_d.rows() );
            Vector d_min( cl_d.rows() );

            for (int i = 0; i < (int)cl_d.rows(); ++i)
            {
                #pragma omp parallel for
                for (int j = i; j < (int)cl_d.rows(); ++j)
                {
                    d_m(i, j) = 0.0;
                    if (i != j)
                    {
                        Vector U = cl_d.row(i);
                        Vector V = cl_d.row(j);
                        Vector diff = U - V;
                        for (int k = 0; k < (int)diff.size(); k++)
                        {
                            d_m(i, j) += fabs(diff[k]) * W[k];
                        }
                        d_m(j, i) = d_m(i, j);
                    }
                }
                // Row i is fully populated here (entries with j < i were mirrored
                // in earlier iterations). Copy it first: a row of a column-major
                // matrix is not contiguous, so running minmax_element directly
                // over row(i).data() would scan the wrong elements.
                const Vector cur_row = d_m.row(i);
                const auto mm = std::minmax_element( cur_row.data(), cur_row.data() + cur_row.size() );
                d_max(i) = *mm.second;
                d_min(i) = *mm.first;
            }

            m_dmin = *(std::min_element( d_min.data(), d_min.data() + d_min.size() ));
            m_dmax = *(std::max_element( d_max.data(), d_max.data() + d_max.size() ));

            // Map the relative m_eps in [0,1] onto the observed distance range.
            m_eps = (m_dmax - m_dmin) * m_eps + m_dmin;
            return d_m;
        }
        void dbscan( const DistanceMatrix & dm )
        {
            std::vector<unsigned int> visited( dm.rows() );
            unsigned int cluster_id = 0;

            for (unsigned int pid = 0; pid < (unsigned int)dm.rows(); ++pid)
            {
                if ( !visited[pid] )
                {
                    visited[pid] = 1;
                    Neighbors ne = find_neighbors(dm, pid);
                    if (ne.size() >= m_min_elems)
                    {
                        m_labels[pid] = cluster_id;
                        for (unsigned int i = 0; i < ne.size(); ++i)
                        {
                            unsigned int nPid = ne[i];
                            if ( !visited[nPid] )
                            {
                                visited[nPid] = 1;
                                Neighbors ne1 = find_neighbors(dm, nPid);
                                if ( ne1.size() >= m_min_elems )
                                {
                                    for (const auto & n1 : ne1)
                                    {
                                        ne.push_back(n1);
                                    }
                                }
                            }
                            if ( m_labels[nPid] == -1 )
                            {
                                m_labels[nPid] = cluster_id;
                            }
                        }
                        ++cluster_id;
                    }
                }
            }
        }
    };
}
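// ----------------------------------------------------------------------------
// Usage sketch (not part of the original gist): instantiating the template
// above with Eigen dynamic types. Remember that 'eps' is a fraction of the
// observed distance range (see the constructor comment), not an absolute
// radius.
#include <iostream>

int main()
{
    typedef clustering::DBSCAN<Eigen::VectorXd, Eigen::MatrixXd> DBSCAN;

    DBSCAN::ClusterData data = DBSCAN::gen_cluster_data(3, 100); // 100 samples, 3 features
    DBSCAN dbscan(/*eps=*/0.1, /*min_elems=*/5);
    dbscan.fit(data);

    const DBSCAN::Labels & labels = dbscan.get_labels(); // -1 marks noise
    std::cout << "label of sample 0: " << labels[0] << "\n";
    return 0;
}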
// Incomplete..
// Generalized k-means
// Copyright (C) 2013 Balazs Szalkai
// If you use this program in your research, please cite the following article:
// B. Szalkai: An implementation of the relational k-means algorithm. ArXiv e-prints, 2013.
#include <cstdlib>
#include <vector>
#include <Eigen/Core>

namespace GeneralizedKMeans
{
    typedef Eigen::MatrixXf DistanceMatrix;

    struct Clusters
    {
        const DistanceMatrix * Matrix;                    // pairwise object distances
        int Count;                                        // number of clusters
        std::vector<int> ObjectToCluster;                 // object index -> cluster id
        std::vector< std::vector<int> > ClusterToObjects; // cluster id -> member objects
        std::vector<double> AdditiveConstant;             // per-cluster centroid constant

        Clusters(const DistanceMatrix & matrix, int count)
            : Matrix(&matrix), Count(count),
              ObjectToCluster(matrix.rows(), 0),
              ClusterToObjects(count)
        {
            Randomize();
        }

        static double Sqr(double x) { return x * x; }
        int NumObjects() const { return (int)Matrix->rows(); }
        double SqrDistance(int i, int j) const { return Sqr((*Matrix)(i, j)); }

        // NOTE: may leave some clusters empty; as the header says, this
        // implementation is incomplete and does not handle that case.
        void Randomize()
        {
            for (int i = 0; i < Count; i++) ClusterToObjects[i].clear();
            for (int i = 0; i < NumObjects(); i++)
            {
                int cluster = rand() % Count;
                ObjectToCluster[i] = cluster;
                ClusterToObjects[cluster].push_back(i);
            }
        }

        void Make_ClusterToObjects()
        {
            for (int cluster = 0; cluster < Count; cluster++)
                ClusterToObjects[cluster].clear();
            for (int i = 0; i < NumObjects(); i++)
                ClusterToObjects[ObjectToCluster[i]].push_back(i);
        }

        // Call this before using the other member functions
        void Initialize()
        {
            // calculate additive constants for clusters
            AdditiveConstant.assign(Count, 0.0);
            for (int cluster = 0; cluster < Count; cluster++)
            {
                const std::vector<int> & objects = ClusterToObjects[cluster];
                double x = 0;
                for (size_t a = 0; a < objects.size(); a++)
                    for (size_t b = 0; b < objects.size(); b++)
                        x += SqrDistance(objects[a], objects[b]);
                AdditiveConstant[cluster] = -x / (2 * Sqr((double)objects.size()));
            }
        }

        // Calculates a squared centroid distance
        double CalculateSqrDist(int obj, int cluster) const
        {
            const std::vector<int> & objects = ClusterToObjects[cluster];
            double x = 0;
            for (size_t a = 0; a < objects.size(); a++)
                x += SqrDistance(obj, objects[a]);
            return AdditiveConstant[cluster] + x / objects.size();
        }

        int GetNearestCluster(int obj) const
        {
            int nearestCluster = -1;
            double minSqrDist = 0;
            for (int cluster = 0; cluster < Count; cluster++)
            {
                double sqrDist = CalculateSqrDist(obj, cluster);
                if (nearestCluster < 0 || sqrDist < minSqrDist)
                {
                    minSqrDist = sqrDist;
                    nearestCluster = cluster;
                }
            }
            return nearestCluster;
        }

        double GetClusteringValue() const
        {
            double result = 0;
            for (int i = 0; i < NumObjects(); i++)
                result += CalculateSqrDist(i, ObjectToCluster[i]);
            return result;
        }

        double GetValue()
        {
            Initialize();
            return GetClusteringValue();
        }

        // Returns how many objects have changed cluster
        int Iterate()
        {
            int changed = 0;
            Initialize();

            // find the nearest centroid for the objects
            std::vector<int> New_ObjectToCluster(NumObjects());
            for (int i = 0; i < NumObjects(); i++)
            {
                New_ObjectToCluster[i] = GetNearestCluster(i);
                if (New_ObjectToCluster[i] != ObjectToCluster[i]) changed++;
            }

            // update the configuration
            double oldValue = GetValue();
            std::vector<int> Old_ObjectToCluster = ObjectToCluster;
            ObjectToCluster = New_ObjectToCluster;
            Make_ClusterToObjects();
            double newValue = GetValue();

            // clustering got worse (this is possible with some distance matrices) or stayed the same?
            if (oldValue <= newValue)
            {
                // undo iteration
                ObjectToCluster = Old_ObjectToCluster;
                Make_ClusterToObjects();
                return 0;
            }
            return changed;
        }
    };

    void Run(const DistanceMatrix & m, int nClusters)
    {
        Clusters bestClusters(m, nClusters);
        int badLuckStreak = 0;
        const int maxBadLuckStreak = 100;
        while (badLuckStreak < maxBadLuckStreak)
        {
            // Random restart: iterate a fresh configuration to convergence
            Clusters c(m, nClusters);
            while (c.Iterate() != 0) {}

            if (c.GetValue() < bestClusters.GetValue())
            {
                bestClusters = c;
                badLuckStreak = 0;
            }
            else
                badLuckStreak++;
        }
    }
}
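// ----------------------------------------------------------------------------
// Note (not part of the original gist) on the centroid-distance identity the
// code above relies on. Relational k-means never forms explicit centroids:
// for an object o and a cluster C it obtains the squared distance to C's
// implicit centroid from pairwise distances alone,
//
//   ||o - c_C||^2 = (1/|C|) * sum_{j in C} d(o,j)^2
//                 - (1/(2*|C|^2)) * sum_{i,j in C} d(i,j)^2
//
// The second term is the per-cluster AdditiveConstant computed in
// Initialize(); the first is the sum in CalculateSqrDist(). The identity is
// exact when d is Euclidean; for arbitrary dissimilarity matrices the update
// step can make the objective worse, which is why Iterate() re-checks the
// objective and undoes a step that did not improve it.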
// Works well
//////////////////////////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2010, Lawrence Livermore National Security, LLC.
// Produced at the Lawrence Livermore National Laboratory
//////////////////////////////////////////////////////////////////////////////////////////////////
/// @author Todd Gamblin [email protected]
#include <vector>
#include <set>
#include <algorithm>
#include <numeric>
#include <iterator>
#include <limits>
#include <random>
#include <stdexcept>
#include <cassert>
#include <cstdlib>
#include <Eigen/Core>
using namespace std;
//#include "random.h"
//#include "bic.h"
//#include "matrix_utils.h"

namespace clustering {
    typedef float Scalar;
    typedef Eigen::Matrix<Scalar,-1,-1> dissimilarity_matrix;
    typedef size_t medoid_id;   ///< More descriptive type for medoid index
    typedef size_t object_id;   ///< More descriptive type for object index
    ///
    /// Explicit representation of a clustering. Instead of a vector of representative
    /// ids, this has <i>k</i> sets of object_ids indicating which objects are in a
    /// particular cluster. You can convert a partition to a cluster_list with
    /// to_cluster_list().
    ///
    typedef std::vector< std::set<object_id> > cluster_list;
    struct kmedoids {
        /// Gives the index of the object that is the ith medoid.
        /// medoids[i] == index in object array for last call to findClusters()
        std::vector<object_id> medoid_ids;

        /// Gives cluster id (index in medoids) for the ith object.
        /// clusterid[i] == id of cluster of which object i is a member.
        /// medoids[clusterid[i]] == representative of that cluster.
        std::vector<medoid_id> cluster_ids;

        std::vector<medoid_id> sec_nearest;  /// Index of second closest medoids. Used by PAM.
        double total_dissimilarity;          /// Total dissimilarity bt/w objects and their medoid
        bool sort_medoids;                   /// Whether medoids should be canonically sorted by object id.
        double epsilon;                      /// Normalized sensitivity for convergence
        size_t init_size;                    /// initial sample size (before 2*k)
        size_t max_reps;                     /// maximum number of sampling repetitions (used by CLARA)
        typedef std::mt19937 random_type;    /// Type for RNG used in this algorithm
        kmedoids(size_t num_objects)
            : cluster_ids(num_objects, 0),
              total_dissimilarity(std::numeric_limits<double>::infinity()),
              sort_medoids(false),
              epsilon(1e-15),
              init_size(40),
              max_reps(5)
        { if (num_objects) medoid_ids.resize(1); }

        double average_dissimilarity() const {
            return total_dissimilarity / cluster_ids.size();
        }

        void set_sort_medoids(bool sort) {
            sort_medoids = sort;
        }

        void set_epsilon(double e) {
            epsilon = e;
        }

        void init_medoids(size_t k, const dissimilarity_matrix& distance) {
            medoid_ids.clear();

            // find first object: the object with minimum total dissimilarity to all others
            object_id first_medoid = 0;
            double min_dissim = std::numeric_limits<Scalar>::max();
            for (size_t i = 0; i < (size_t)distance.rows(); i++) {
                double total = 0.0;
                for (size_t j = 0; j < (size_t)distance.cols(); j++) {
                    total += distance(i, j);
                }
                if (total < min_dissim) {
                    min_dissim = total;
                    first_medoid = i;
                }
            }

            // add first object to medoids and compute medoid ids.
            medoid_ids.push_back(first_medoid);
            assign_objects_to_clusters(distance);

            // now select next k-1 objects according to KR's BUILD algorithm
            for (size_t cur_k = 1; cur_k < k; cur_k++) {
                object_id best_obj = 0;
                double max_gain = 0.0;
                for (size_t i = 0; i < (size_t)distance.rows(); i++) {
                    if (is_medoid(i)) continue;

                    double gain = 0.0;
                    #pragma omp parallel for reduction(+:gain)
                    for (int j = 0; j < (int)distance.rows(); j++) {
                        double Dj = distance(j, medoid_ids[cluster_ids[j]]);  // distance from j to its medoid
                        gain += max(Dj - distance(i, j), 0.0);                // gain from selecting i
                    }

                    if (gain >= max_gain) {   // set the next medoid to the object that
                        max_gain = gain;      // maximizes the gain function.
                        best_obj = i;
                    }
                }

                medoid_ids.push_back(best_obj);
                assign_objects_to_clusters(distance);
            }
        }

        double cost(medoid_id i, object_id h, const dissimilarity_matrix& distance) const {
            double total = 0;
            for (object_id j = 0; j < cluster_ids.size(); j++) {
                object_id mi  = medoid_ids[i];                // object id of medoid i
                double    dhj = distance(h, j);               // distance between object h and object j

                object_id mj1 = medoid_ids[cluster_ids[j]];   // object id of j's nearest medoid
                double    dj1 = distance(mj1, j);             // distance to j's nearest medoid

                // check if distance bt/w medoid i and j is same as j's current nearest medoid.
                if (distance(mi, j) == dj1) {
                    double dj2 = std::numeric_limits<Scalar>::max();
                    if (medoid_ids.size() > 1) {   // look at 2nd nearest if there's more than one medoid.
                        object_id mj2 = medoid_ids[sec_nearest[j]];  // object id of j's 2nd-nearest medoid
                        dj2 = distance(mj2, j);                      // distance to j's 2nd-nearest medoid
                    }
                    total += min(dj2, dhj) - dj1;

                } else if (dhj < dj1) {
                    total += dhj - dj1;
                }
            }
            return total;
        }

        /// True if and only if object i is a medoid.
        bool is_medoid(object_id oi) const {
            return medoid_ids[cluster_ids[oi]] == oi;
        }

        void pam(const dissimilarity_matrix& distance, size_t k, const object_id *initial_medoids) {
            if (k > (size_t)distance.rows()) {
                throw std::logic_error("Attempt to run PAM with more clusters than data.");
            }

            if (distance.rows() != distance.cols()) {
                throw std::logic_error("Error: distance matrix is not square!");
            }

            // first get this the right size.
            cluster_ids.resize(distance.rows());

            // size cluster_ids appropriately and randomly pick initial medoids
            if (initial_medoids) {
                medoid_ids.clear();
                copy(initial_medoids, initial_medoids + k, back_inserter(medoid_ids));
            } else {
                init_medoids(k, distance);
            }

            // set tolerance equal to epsilon times mean magnitude of distances.
            // Note that distances *should* all be non-negative.
            double tolerance = epsilon * distance.sum() / (distance.rows() * distance.cols());

            while (true) {
                // initial cluster setup
                total_dissimilarity = assign_objects_to_clusters(distance);

                // vars to keep track of minimum
                double minTotalCost = std::numeric_limits<Scalar>::max();
                medoid_id minMedoid = 0;
                object_id minObject = 0;

                // iterate over each medoid
                for (medoid_id i = 0; i < k; i++) {
                    // iterate over all non-medoid objects
                    for (object_id h = 0; h < cluster_ids.size(); h++) {
                        if (is_medoid(h)) continue;

                        // see if the total cost of swapping i & h is less than the current minimum
                        double curCost = cost(i, h, distance);
                        if (curCost < minTotalCost) {
                            minTotalCost = curCost;
                            minMedoid = i;
                            minObject = h;
                        }
                    }
                }

                // bail if we can't gain anything more (we've converged)
                if (minTotalCost >= -tolerance) break;

                // install the new medoid if we found a beneficial swap
                medoid_ids[minMedoid] = minObject;
                cluster_ids[minObject] = minMedoid;
            }
            //if (sort_medoids) sort();
        }
        /// Assign each object to the cluster with the closest medoid.
        ///
        /// @return Total dissimilarity of objects w/their medoids.
        ///
        /// @param distance the dissimilarity matrix, indexed by pairs of object ids.
        ///        (In the original library this parameter may be any distance functor;
        ///        algorithms are free to use real matrices, as in PAM, or to compute
        ///        distances lazily, as in CLARA medoid assignment.)
        double assign_objects_to_clusters(const dissimilarity_matrix& distance) {
            if (sec_nearest.size() != cluster_ids.size()) {
                sec_nearest.resize(cluster_ids.size());
            }

            // go through and assign each object to nearest medoid, keeping track of total dissimilarity.
            double total_dissimilarity = 0;
            #pragma omp parallel for reduction(+:total_dissimilarity)
            for (int i = 0; i < (int)cluster_ids.size(); i++) {
                double d1, d2;     // smallest, second smallest distance to medoid, respectively
                medoid_id m1, m2;  // index of medoids with distances d1, d2 from object i, respectively

                d1 = d2 = std::numeric_limits<Scalar>::max();
                m1 = m2 = medoid_ids.size();
                for (medoid_id m = 0; m < medoid_ids.size(); m++) {
                    double d = distance(i, medoid_ids[m]);
                    if (d < d1 || medoid_ids[m] == (object_id)i) {  // prefer the medoid in case of ties.
                        d2 = d1;  m2 = m1;
                        d1 = d;   m1 = m;
                    } else if (d < d2) {
                        d2 = d;  m2 = m;
                    }
                }

                cluster_ids[i] = m1;
                sec_nearest[i] = m2;
                total_dissimilarity += d1;
            }

            return total_dissimilarity;
        }
    };
} // namespace clustering
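// ----------------------------------------------------------------------------
// Usage sketch (not part of the original gist): running PAM on a small
// hand-built dissimilarity matrix. The matrix must be square and symmetric
// with a zero diagonal; assignments are read back from the struct's members.
#include <iostream>

int main()
{
    const int n = 4;
    clustering::dissimilarity_matrix D(n, n);
    D << 0, 1, 9, 9,
         1, 0, 9, 9,
         9, 9, 0, 1,
         9, 9, 1, 0;

    clustering::kmedoids km(n);
    km.pam(D, /*k=*/2, /*initial_medoids=*/NULL);

    for (int i = 0; i < n; i++)
        std::cout << "object " << i << " -> medoid "
                  << km.medoid_ids[km.cluster_ids[i]] << "\n";
    return 0;
}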
#pragma once
/* Meanshift non-parametric mode estimator.
 *
 * Code adapted from: Sebastian Nowozin <[email protected]>
 */
#define _USE_MATH_DEFINES
#include <math.h>
#include <cassert>
#include <cstdlib>
#include <vector>
#include <map>
#include <iostream>
//#include <gmm/gmm.h>
#include <Eigen/Core>
// Minimal stand-ins for the few gmm:: vector helpers used below.
namespace gmm {
    template<typename V>
    void clear(V & v) {
        for (size_t i = 0; i < v.size(); i++)
            v[i] = 0;
    }
    template<typename V, typename W>
    void copy(const V & v, W & w) {
        for (size_t i = 0; i < v.size(); i++)
            w[i] = v[i];
    }
    template<typename V, typename W>
    void add(const V & v, W & w) {
        for (size_t i = 0; i < w.size(); i++)
            w[i] += v[i];
    }
    template<typename V>
    V scaled(const V & v, double s) {
        V a = v;
        for (size_t i = 0; i < a.size(); i++)
            a[i] *= s;
        return a;
    }
    template<typename V>
    void scale(V & v, double s) {
        for (size_t i = 0; i < v.size(); i++)
            v[i] *= s;
    }
    // Euclidean (L2) distance between v and w
    template<typename V, typename W>
    double vect_dist2(V & v, W & w) {
        Eigen::VectorXd a = Eigen::Map<Eigen::VectorXd>(&v[0], v.size());
        Eigen::VectorXd b = Eigen::Map<Eigen::VectorXd>(&w[0], w.size());
        return (a - b).lpNorm<2>();
    }
    // Euclidean (L2) norm of v
    template<typename V>
    double vect_norm2(V & v) {
        return Eigen::Map<Eigen::VectorXd>(&v[0], v.size()).norm();
    }
}
namespace clustering {
    /* We use the notation of the description of Mean Shift in
     * [Comaniciu2000], Dorin Comaniciu, Peter Meer,
     * "Mean Shift: A Robust Approach toward Feature Space Analysis"
     *
     * The implementation is naive and does not exploit fast nearest neighbor
     * lookups or triangle inequalities to speed up the mean shift procedure.
     * Therefore, it is only suitable for low-dimensional input spaces (say, <=
     * 10) with relatively few samples (say, < 1e6).
     */
    class Meanshift {
    public:
        /* kernel_type selects the kernel profile:
         *   0 for the Epanechnikov kernel profile
         *     k_E(x) = (1 - x) if (0 <= x <= 1), 0 otherwise.
         *   1 for the truncated multivariate normal kernel profile
         *     k_N(x) = exp(-0.5 * x)
         * kernel_bandwidth: The positive bandwidth parameter.
         * mode_tolerance: mode matching tolerance. Modes which have a L2
         *   distance closer than this value will be treated as being the same.
         */
        Meanshift(int kernel_type, double kernel_bandwidth,
                  int dim, double mode_tolerance);

        /* Find modes of an empirical distribution X.
         *
         * X: N vectors of size M, representing N samples from the distribution.
         * modes: Output, will be allocated properly. Returns all modes found.
         * indexmap: N-vector of absolute mode indices. Each sample point is
         *   assigned to one mode. The vector must already be properly sized.
         * procedure_count: The mean shift procedure will be run at least this
         *   many times, sampling randomly from X as initialization. If zero,
         *   all samples from X are used as initializations.
         */
        void FindModes(const std::vector<std::vector<double> >& X,
                       std::vector<std::vector<double> >& modes,
                       std::vector<int>& indexmap,
                       unsigned int procedure_count = 0) const;

        /* Mean Shift Procedure, starting from mode, perform mean shift on the
         * distribution empirically sampled in X.
         *
         * X: N vectors of size M, representing N samples from the distribution.
         * mode: Starting point (for example a point from X). The result will be
         *   given in mode.
         * visited: N-vector of indicator variables. If the mean shift procedure
         *   passes through a point in X, the corresponding index in visited will
         *   be set to one.
         *
         * Return the number of iterations used.
         */
        int MeanshiftProcedure(const std::vector<std::vector<double> >& X,
                               std::vector<double>& mode, std::vector<unsigned int>& visited) const;

    private:
        int dim;                  // input space dimension
        int kernel_type;          // 0: Epanechnikov, 1: truncated multivariate normal
        double kernel_c;          // constant normalization factor
        double kernel_bandwidth;
        double mode_tolerance;
    };
    Meanshift::Meanshift(int kernel_type, double kernel_bandwidth,
                         int dim, double mode_tolerance)
        : dim(dim), kernel_type(kernel_type),
          kernel_bandwidth(kernel_bandwidth),
          mode_tolerance(mode_tolerance) {
        assert(kernel_type >= 0 && kernel_type <= 1);
        assert(kernel_bandwidth > 0.0);
        assert(dim > 0);

        // Compute normalization constant
        if (kernel_type == 0) {
            // Volume of the d-dimensional unit ball: pi^(d/2) / Gamma(d/2 + 1).
            // (The original code had a 1.234 placeholder and a FIXME here.)
            double unit_ball_volume = pow(M_PI, static_cast<double>(dim) / 2.0)
                                      / tgamma(static_cast<double>(dim) / 2.0 + 1.0);
            kernel_c = 0.5 * (static_cast<double>(dim) + 2.0) / unit_ball_volume;
        } else if (kernel_type == 1) {
            kernel_c = pow(2.0 * M_PI, -static_cast<double>(dim) / 2.0);
        }
    }
    void Meanshift::FindModes(const std::vector<std::vector<double> >& X,
                              std::vector<std::vector<double> >& modes,
                              std::vector<int>& indexmap,
                              unsigned int procedure_count) const
    {
        //assert(indexmap.size() == X.size());
        indexmap.clear();
        indexmap.resize(X.size(), 0);
        modes.clear();
        if (procedure_count != 0 && procedure_count >= X.size())
            procedure_count = 0;

        // Perform dense mode finding: for each sample in X, perform the mean
        // shift procedure
        std::vector<double> mode(X[0].size());
        std::vector<unsigned int> visited(X.size());

        // [mode_index][sample_index] = number of times sample_index was visited
        // when arriving at the mode with the mode_index.
        std::vector<std::map<unsigned int, unsigned int> > visit_counts;
        for (unsigned int si = 0; (procedure_count == 0 && si < X.size())
                 || (procedure_count != 0 && si < procedure_count); ++si) {
            if (procedure_count == 0) {
                gmm::copy(X[si], mode);
            } else {
                // Pick a random one from X
                unsigned sample_index = rand() % X.size();
                gmm::copy(X[sample_index], mode);
            }
            std::cout << "Sample " << si << " of "
                      << (procedure_count == 0 ? X.size() : procedure_count)
                      << std::endl;
            gmm::clear(visited);
            MeanshiftProcedure(X, mode, visited);

            // Identify whether this is a novel mode or a known one
            bool found = false;
            unsigned int M_cur = 0;
            for (std::vector<std::vector<double> >::iterator Mi =
                     modes.begin(); found == false && Mi != modes.end(); ++Mi) {
                if (gmm::vect_dist2(*Mi, mode) < mode_tolerance) {
                    M_cur = Mi - modes.begin();
                    found = true;
                }
            }
            if (found == false) {
                // Add novel mode
                modes.push_back(mode);
                // TODO: remove this debug output
                std::cout << modes.size() << " mode"
                          << (modes.size() >= 2 ? "s" : "") << std::endl;

                // Add a new mapping of which samples have been visited while
                // approaching the novel mode.
                std::map<unsigned int, unsigned int> mode_visits;
                for (std::vector<unsigned int>::const_iterator vi =
                         visited.begin(); vi != visited.end(); ++vi) {
                    if (*vi != 0)
                        mode_visits[vi - visited.begin()] = 1;
                }
                visit_counts.push_back(mode_visits);
            } else {
                // The mode has been known, but we maybe crossed old and new
                // samples. Update the counts.
                for (std::vector<unsigned int>::const_iterator vi =
                         visited.begin(); vi != visited.end(); ++vi) {
                    if (*vi != 0)
                        visit_counts[M_cur][vi - visited.begin()] += 1;
                }
            }
        }
#ifdef DEBUG
        std::cout << "Found " << modes.size() << " modes." << std::endl;
#endif

        // Perform index mapping: each sample gets assigned to one mode index.
        unsigned int unmapped_count = 0;
        for (unsigned int sample_index = 0;
             sample_index < X.size(); ++sample_index) {
            // Find mode index with highest count
            unsigned int maximum_count = 0;
            int maximum_mode_index = -1;
            for (unsigned int mode_index = 0;
                 mode_index < modes.size(); ++mode_index) {
                if (visit_counts[mode_index].count(sample_index) == 0)
                    continue;
                unsigned int count_cur = visit_counts[mode_index][sample_index];
                std::cout << "  mode " << mode_index << " visited "
                          << "sample " << sample_index << " for "
                          << count_cur << " times." << std::endl;
                if (count_cur > maximum_count) {
                    maximum_mode_index = mode_index;
                    maximum_count = count_cur;
                }
            }
            if (maximum_mode_index == -1)
                unmapped_count += 1;
            indexmap[sample_index] = maximum_mode_index;
        }
        std::cout << unmapped_count << " unmapped samples." << std::endl;
    }
    int Meanshift::MeanshiftProcedure(const std::vector<std::vector<double> >& X,
                                      std::vector<double>& mode, std::vector<unsigned int>& visited) const {
        assert(X.size() > 0);
        assert(visited.size() == X.size());
        assert(X[0].size() == mode.size());

        std::vector<double> meanshift(mode.size(), 0);
        std::vector<double> meanshift_cur(mode.size(), 0);
        int iter = 0;
        bool converged = false;
        do {
            // Compute the mean shift vector
            gmm::clear(meanshift);
            double denominator = 0.0;
            for (std::vector<std::vector<double> >::const_iterator Xi = X.begin();
                 Xi != X.end(); ++Xi) {
                gmm::copy(mode, meanshift_cur);
                gmm::add(gmm::scaled(*Xi, -1.0), meanshift_cur);
                gmm::scale(meanshift_cur, 1.0 / kernel_bandwidth);
                double weight_cur = gmm::vect_norm2(meanshift_cur);
                if (kernel_type == 0) {
                    // Epanechnikov kernel is the shadow of the uniform kernel
                    if (weight_cur <= 1.0)
                        weight_cur = 1.0;
                    else
                        weight_cur = 0.0;
                } else if (kernel_type == 1) {
                    // Multivariate normal kernel is the shadow of itself
                    weight_cur = kernel_c * exp(-0.5 * weight_cur);
                    // Truncate
                    if (weight_cur < 1e-2)
                        weight_cur = 0.0;
                }
                if (weight_cur >= 1e-6)
                    visited[Xi - X.begin()] = 1;
                gmm::add(gmm::scaled(*Xi, weight_cur), meanshift);
                denominator += weight_cur;
            }
            // Guard against an empty kernel support (all weights truncated to
            // zero): the mode cannot move any further in that case.
            if (denominator <= 0.0)
                break;
            gmm::scale(meanshift, 1.0 / denominator);
            double distance_moved = gmm::vect_dist2(meanshift, mode);
            gmm::copy(meanshift, mode);
#ifdef DEBUG
            std::cout << "iter " << iter << ", moved "
                      << distance_moved << std::endl;
#endif
            iter += 1;
            if (distance_moved <= 1e-8)
                converged = true;
        } while (converged == false);
        return (iter);
    }
}
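// ----------------------------------------------------------------------------
// Usage sketch (not part of the original gist): finding the modes of a small
// 1-D sample with the truncated normal kernel (kernel_type = 1). Each sample
// is mapped to the mode whose mean shift runs visited it most often.
#include <cstdlib>
#include <iostream>

int main()
{
    std::vector<std::vector<double> > X;
    for (int i = 0; i < 50; i++) // two lumps, around 0 and around 10
        X.push_back(std::vector<double>(1, (i < 25 ? 0.0 : 10.0) + 0.1 * (rand() % 10)));

    clustering::Meanshift ms(/*kernel_type=*/1, /*kernel_bandwidth=*/1.0,
                             /*dim=*/1, /*mode_tolerance=*/1e-3);
    std::vector<std::vector<double> > modes;
    std::vector<int> indexmap;
    ms.FindModes(X, modes, indexmap, /*procedure_count=*/0); // 0 = start from every sample

    std::cout << "found " << modes.size() << " modes" << std::endl;
    return 0;
}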
/*=========================================================================
 *
 *  Copyright David Doria 2012 [email protected]
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0.txt
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 *
 *=========================================================================*/

/** XMeans clustering is a method that creates K clusters of points from
 * an unorganized set of input points. It is an extension of KMeans clustering
 * that attempts to determine K during the algorithm.
 */
#ifndef XMeansClustering_h
#define XMeansClustering_h

// STL
#include <vector>
#include <iostream>
#include <limits>
#include <numeric>
#include <set>
#include <sstream>
#include <stdexcept>
#include <cassert>
#include <cstdlib>

// Eigen
#include <Eigen/Dense>
namespace EigenHelpers {
    template <typename TVector>
    TVector RandomUnitVector(const unsigned int dim)
    {
        TVector randomUnitVector(dim);
        for (int i = 0; i < randomUnitVector.size(); ++i)
        {
            randomUnitVector[i] = (double(rand()) / RAND_MAX);
        }
        randomUnitVector.normalize();
        return randomUnitVector;
    }

    template <typename TPoint>
    void GetBoundingBox(const std::vector<TPoint, Eigen::aligned_allocator<TPoint> >& data, TPoint& minCorner, TPoint& maxCorner)
    {
        assert(data.size() > 0);
        minCorner.resize(data[0].size());
        maxCorner.resize(data[0].size());

        for (int coordinate = 0; coordinate < data[0].size(); ++coordinate)
        {
            minCorner[coordinate] = std::numeric_limits<typename TPoint::Scalar>::max();
            // lowest(), not min(): for floating-point types min() is the smallest
            // positive value, which would break the max-corner search.
            maxCorner[coordinate] = std::numeric_limits<typename TPoint::Scalar>::lowest();

            for (unsigned int pointId = 0; pointId < data.size(); ++pointId)
            {
                if (data[pointId][coordinate] > maxCorner[coordinate])
                {
                    maxCorner[coordinate] = data[pointId][coordinate];
                }
                if (data[pointId][coordinate] < minCorner[coordinate])
                {
                    minCorner[coordinate] = data[pointId][coordinate];
                }
            }
        }
    }
}
class XMeansClustering
{
public:
    typedef Eigen::VectorXf PointType;
    typedef std::vector<PointType, Eigen::aligned_allocator<PointType> > VectorOfPoints;

    /** Constructor. */
    XMeansClustering();

    /** Set the maximum number of clusters to find. */
    void SetMaxK(const unsigned int maxk);

    /** Get the maximum number of clusters to find. */
    unsigned int GetMaxK();

    /** Get the ids of the points that belong to class 'label'. */
    std::vector<unsigned int> GetIndicesWithLabel(const unsigned int label);

    /** Get the coordinates of the points that belong to class 'label'. */
    VectorOfPoints GetPointsWithLabel(const unsigned int label);

    /** Set the points to cluster. */
    void SetPoints(const VectorOfPoints& points);

    /** Get the resulting cluster id for each point. */
    std::vector<unsigned int> GetLabels();

    /** Actually perform the clustering. */
    void Cluster();

    /** Write the cluster centers to the standard output. */
    void OutputClusterCenters();

private:
    /** Split every cluster into two clusters if that helps the description of the data. */
    void SplitClusters();

    /** The label (cluster ID) of each point. */
    std::vector<unsigned int> Labels;

    /** The maximum number of clusters to find. */
    unsigned int MaxK;

    /** The points to cluster. */
    VectorOfPoints Points;

    /** The current cluster centers. */
    VectorOfPoints ClusterCenters;
};

XMeansClustering::XMeansClustering() : MaxK(3)
{
}
void XMeansClustering::Cluster()
{
    if (this->Points.size() < this->MaxK)
    {
        std::stringstream ss;
        ss << "The number of points (" << this->Points.size()
           << ") must be at least the maximum number of clusters (" << this->MaxK << ")";
        throw std::runtime_error(ss.str());
    }

    // Seed with a single cluster centered on the centroid of all points
    // (i.e. start X-means from k = 1); without this, ClusterCenters stays
    // empty and the loop below can never terminate.
    PointType centroid = PointType::Zero(this->Points[0].size());
    for (unsigned int i = 0; i < this->Points.size(); ++i)
    {
        centroid += this->Points[i];
    }
    centroid /= static_cast<float>(this->Points.size());
    this->ClusterCenters.clear();
    this->ClusterCenters.push_back(centroid);

    // We must store the labels at the previous iteration to determine whether any labels changed at each iteration.
    std::vector<unsigned int> oldLabels(this->Points.size(), 0); // initialize to all zeros

    // Initialize the labels array
    this->Labels.resize(this->Points.size());

    do
    {
        // Save the old labels
        oldLabels = this->Labels;

        // (Incomplete) A full implementation would run k-means
        // assignment/update steps here before deciding on splits.
        SplitClusters();
    } while (this->ClusterCenters.size() < this->MaxK);
}
void XMeansClustering::SplitClusters()
{
    assert(this->Points.size() > 0);

    VectorOfPoints newClusterCenters;

    for (unsigned int clusterId = 0; clusterId < this->ClusterCenters.size(); ++clusterId)
    {
        // Generate a random direction
        PointType randomUnitVector = EigenHelpers::RandomUnitVector<PointType>(this->Points[0].size());

        // Get the bounding box of the points (a full implementation would use
        // only the points that belong to this cluster)
        PointType minCorner;
        PointType maxCorner;
        EigenHelpers::GetBoundingBox(this->Points, minCorner, maxCorner);

        // Scale the unit vector component-wise by the size of the region
        // (cwiseProduct: a plain operator* between two vectors is not valid Eigen)
        PointType splitVector = randomUnitVector.cwiseProduct((maxCorner - minCorner) / 2.0f);
        PointType childCenter1 = this->ClusterCenters[clusterId] + splitVector;
        PointType childCenter2 = this->ClusterCenters[clusterId] - splitVector; // was '+': the children must move apart

        // Compute the BIC of the original model and of the new (split) model.
        // The actual BIC computation (Pelleg & Moore 2000) is not implemented in
        // this gist; these placeholders accept every split so that Cluster()
        // terminates once MaxK centers exist.
        float BIC_parent = std::numeric_limits<float>::max();
        float BIC_children = 0.0f;

        // If the split was useful, keep it
        if (BIC_children < BIC_parent)
        {
            newClusterCenters.push_back(childCenter1);
            newClusterCenters.push_back(childCenter2);
        }
        else
        {
            newClusterCenters.push_back(this->ClusterCenters[clusterId]);
        }
    }

    this->ClusterCenters = newClusterCenters;
}
std::vector<unsigned int> XMeansClustering::GetIndicesWithLabel(unsigned int label)
{
    std::vector<unsigned int> pointsWithLabel;
    for (unsigned int i = 0; i < this->Labels.size(); i++)
    {
        if (this->Labels[i] == label)
        {
            pointsWithLabel.push_back(i);
        }
    }
    return pointsWithLabel;
}

XMeansClustering::VectorOfPoints XMeansClustering::GetPointsWithLabel(const unsigned int label)
{
    VectorOfPoints points;
    std::vector<unsigned int> indicesWithLabel = GetIndicesWithLabel(label);
    for (unsigned int i = 0; i < indicesWithLabel.size(); i++)
    {
        points.push_back(this->Points[indicesWithLabel[i]]);
    }
    return points;
}

void XMeansClustering::SetMaxK(const unsigned int maxk)
{
    this->MaxK = maxk;
}

unsigned int XMeansClustering::GetMaxK()
{
    return this->MaxK;
}

void XMeansClustering::SetPoints(const VectorOfPoints& points)
{
    this->Points = points;
}

std::vector<unsigned int> XMeansClustering::GetLabels()
{
    return this->Labels;
}

void XMeansClustering::OutputClusterCenters()
{
    std::cout << std::endl << "Cluster centers: " << std::endl;
    for (unsigned int i = 0; i < ClusterCenters.size(); ++i)
    {
        std::cout << ClusterCenters[i] << " ";
    }
    std::cout << std::endl;
}

#endif
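// ----------------------------------------------------------------------------
// Usage sketch (not part of the original gist). Note that SplitClusters()
// above uses placeholder BIC values, so this exercises the class's plumbing
// rather than a faithful X-means run.
#include <cstdlib>
#include <iostream>

int main()
{
    XMeansClustering::VectorOfPoints points;
    for (int i = 0; i < 20; i++)
    {
        XMeansClustering::PointType p(2);
        p << (i < 10 ? 0.0f : 5.0f) + 0.01f * (rand() % 100),
             (i < 10 ? 0.0f : 5.0f) + 0.01f * (rand() % 100);
        points.push_back(p);
    }

    XMeansClustering xmeans;
    xmeans.SetMaxK(4);
    xmeans.SetPoints(points);
    xmeans.Cluster();
    xmeans.OutputClusterCenters();
    return 0;
}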