Last active
August 29, 2015 14:03
-
-
Save matovitch/178fc3d59be942274d0e to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#ifndef __KMEANS_H__ | |
#define __KMEANS_H__ | |
#include <ctime> | |
#include <vector> | |
#include <limits> | |
#include <cstddef> | |
namespace kmeans | |
{ | |
template <typename Distance, typename Hasher> | |
struct Algorithm | |
{ | |
typedef typename std::vector<typename Distance::argument_type>::iterator clusters_iterator; | |
std::vector<typename Distance::argument_type> m_clusters; | |
struct History | |
{ | |
typename Distance::argument_type* m_oldCluster; | |
typename Distance::argument_type m_newCluster; | |
std::size_t m_nPoints; | |
std::size_t m_hash; | |
History() : m_hash(0), m_nPoints(1) {} | |
}; | |
template <typename PointSetIterator> | |
void operator()(const PointSetIterator& pointsBegin, | |
const PointSetIterator& pointsEnd, std::size_t nClusters) | |
{ | |
HistorySet histories(nClusters); | |
std::size_t oldClustersHash = 0; | |
std::size_t newClustersHash = 1; | |
initClusters(pointsBegin, pointsEnd, nClusters, histories); | |
while (newClustersHash != oldClustersHash) | |
{ | |
oldClustersHash = newClustersHash; | |
for(PointSetIterator pointSetIterator = pointsBegin; | |
pointSetIterator != pointsEnd; | |
pointSetIterator++) | |
{ | |
DistanceType minClusterDistance = std::numeric_limits<DistanceType>::max(); | |
HistorySetIterator nearestCluster; | |
for (HistorySetIterator it = histories.begin(); it != histories.end(); it++) | |
{ | |
const DistanceType clusterDistance = | |
m_distance(*pointSetIterator, *(it->m_oldCluster)); | |
if (clusterDistance < minClusterDistance) | |
{ | |
minClusterDistance = clusterDistance; | |
nearestCluster = it; | |
} | |
} | |
nearestCluster->m_hash ^= m_hasher(*pointSetIterator); | |
nearestCluster->m_newCluster += *pointSetIterator; | |
nearestCluster->m_nPoints++; | |
} | |
newClustersHash = 0; | |
for (HistorySetIterator it = histories.begin(); it != histories.end(); it++) | |
{ | |
if (it->m_nPoints == 0) | |
{ | |
*(it->m_oldCluster) = *pointsBegin; | |
} | |
else | |
{ | |
it->m_newCluster /= static_cast<FieldType>(it->m_nPoints); | |
*(it->m_oldCluster) = it->m_newCluster; | |
} | |
newClustersHash += it->m_hash; | |
it->m_nPoints = 1; | |
it->m_hash = 0; | |
} | |
} | |
} | |
private: | |
typedef std::vector<History> HistorySet; | |
typedef typename Distance::return_type DistanceType; | |
typedef typename HistorySet::iterator HistorySetIterator; | |
typedef typename Distance::argument_type::field_type FieldType; | |
Distance m_distance; | |
Hasher m_hasher; | |
template <typename PointSetIterator> | |
void initClusters(const PointSetIterator& pointsBegin, | |
const PointSetIterator& pointsEnd, | |
std::size_t nClusters, | |
HistorySet& histories) | |
{ | |
m_clusters.resize(nClusters); | |
m_clusters[0] = *pointsBegin; | |
for (std::size_t i = 1; i < nClusters; i++) | |
{ | |
const DistanceType cost = computeCost(pointsBegin, pointsEnd, i); | |
PointSetIterator it = pointsBegin; | |
PointSetIterator newCluster = pointsEnd; | |
while (newCluster == pointsEnd) | |
{ | |
DistanceType minClusterDistance = std::numeric_limits<DistanceType>::max(); | |
for (std::size_t j = 0; j < i; j++) | |
{ | |
const DistanceType clusterDistance = | |
m_distance(*it, m_clusters[j]); | |
if (clusterDistance < minClusterDistance) | |
{ | |
minClusterDistance = clusterDistance; | |
} | |
} | |
if (minClusterDistance * static_cast<FieldType>(RAND_MAX) > | |
static_cast<FieldType>(rand()) * cost) | |
{ | |
newCluster = it; | |
} | |
it = (it + 1 != pointsEnd) ? it + 1 : pointsBegin; | |
} | |
m_clusters[i] = *newCluster; | |
} | |
clusters_iterator clusterSetIterator = m_clusters.begin(); | |
std::vector<History>::iterator historySetIterator = histories.begin(); | |
while (clusterSetIterator != m_clusters.end()) | |
{ | |
historySetIterator->m_oldCluster = &(*clusterSetIterator); | |
historySetIterator->m_newCluster = *clusterSetIterator; | |
historySetIterator++; | |
clusterSetIterator++; | |
} | |
} | |
template <typename PointSetIterator> | |
DistanceType computeCost(const PointSetIterator& pointsBegin, | |
const PointSetIterator& pointsEnd, std::size_t nClusters) | |
{ | |
DistanceType cost = static_cast<DistanceType>(0); | |
for(PointSetIterator pointSetIterator = pointsBegin; | |
pointSetIterator != pointsEnd; | |
pointSetIterator++) | |
{ | |
DistanceType minClusterDistance = std::numeric_limits<DistanceType>::max(); | |
for (std::size_t j = 0; j < nClusters; j++) | |
{ | |
const DistanceType clusterDistance = | |
m_distance(*pointSetIterator, m_clusters[j]); | |
if (clusterDistance < minClusterDistance) | |
{ | |
minClusterDistance = clusterDistance; | |
} | |
} | |
cost += minClusterDistance; | |
} | |
return cost; | |
} | |
}; | |
} // end kmeans namespace | |
#endif |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment