Skip to content

Instantly share code, notes, and snippets.

@lkacenja
Last active October 20, 2016 19:48
Show Gist options
  • Save lkacenja/ede5a1608ed099e6141372e807f1d415 to your computer and use it in GitHub Desktop.
Save lkacenja/ede5a1608ed099e6141372e807f1d415 to your computer and use it in GitHub Desktop.
Exact port Simple Script's ckmeans algorithm (https://github.com/simple-statistics/simple-statistics) to PHP. Useful for creating choropleth breaks and such.
<?php
function makeMatrix($columns, $rows) {
$matrix = array();
for ($i = 0; $i < $columns; $i ++) {
$column = array();
for ($j = 0; $j < $rows; $j ++) {
$column[] = 0;
}
$matrix[] = $column;
}
return $matrix;
}
function uniqueCountSorted($input) {
$uniqueValueCount = 0;
$lastSeenValue = FALSE;
for ($i = 0; $i < count($input); $i++) {
if ($i === 0 || $input[$i] !== $lastSeenValue) {
$lastSeenValue = $input[$i];
$uniqueValueCount ++;
}
}
return $uniqueValueCount;
}
function ckmeans($data, $nClusters) {
if ($nClusters > count($data)) {
throw Exception('Cannot generate more classes than there are data values');
}
sort($data);
$sorted = $data;
$uniqueCount = uniqueCountSorted($sorted);
$matrix = makeMatrix($nClusters, count($data));
$backtrackMatrix = makeMatrix($nClusters, count($data));
for ($cluster = 0; $cluster < $nClusters; $cluster ++) {
$firstClusterMean = $sorted[0];
for ($sortedIdx = max($cluster, 1); $sortedIdx < count($sorted); $sortedIdx ++) {
if ($cluster === 0) {
$squaredDifference = pow($sorted[$sortedIdx] - $firstClusterMean, 2);
$matrix[$cluster][$sortedIdx] = $matrix[$cluster][$sortedIdx - 1] + ($sortedIdx / ($sortedIdx + 1)) * $squaredDifference;
$newSum = $sortedIdx * $firstClusterMean + $sorted[$sortedIdx];
$firstClusterMean = $newSum / ($sortedIdx + 1);
}
else {
$sumSquaredDistances = 0;
$meanXJ = 0;
for ($j = $sortedIdx; $j >= $cluster; $j --) {
$sumSquaredDistances += ($sortedIdx - $j) / ($sortedIdx - $j + 1) * pow($sorted[$j] - $meanXJ, 2);
$meanXJ = ($sorted[$j] + ($sortedIdx - $j) * $meanXJ) / ($sortedIdx - $j + 1);
if ($j === $sortedIdx) {
$matrix[$cluster][$sortedIdx] = $sumSquaredDistances;
$backtrackMatrix[$cluster][$sortedIdx] = $j;
if ($j > 0) {
$matrix[$cluster][$sortedIdx] += $matrix[$cluster - 1][$j - 1];
}
}
else {
if ($j === 0) {
if ($sumSquaredDistances <= $matrix[$cluster][$sortedIdx]) {
$matrix[$cluster][$sortedIdx] = $sumSquaredDistances;
$backtrackMatrix[$cluster][$sortedIdx] = $j;
}
}
else if ($sumSquaredDistances + $matrix[$cluster - 1][$j - 1] < $matrix[$cluster][$sortedIdx]) {
$matrix[$cluster][$sortedIdx] = $sumSquaredDistances + $matrix[$cluster - 1][$j - 1];
$backtrackMatrix[$cluster][$sortedIdx] = $j;
}
}
}
}
}
}
$clusters = array();
$clusterRight = count($backtrackMatrix[0]) - 1;
for ($cluster = count($backtrackMatrix) - 1; $cluster >= 0; $cluster --) {
$clusterLeft = $backtrackMatrix[$cluster][$clusterRight];
$clusters[$cluster] = array_slice($sorted, $clusterLeft, ($clusterRight + 1) - $clusterLeft);
if ($cluster > 0) {
$clusterRight = $clusterLeft - 1;
}
}
return $clusters;
}
?>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment