Last active
November 9, 2015 18:18
-
-
Save kei-sato/8b8f644e94e98fcddba9 to your computer and use it in GitHub Desktop.
Clustering with Lance-Williams updating formula
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Clustering with Lance-Williams updating formula | |
# http://ibisforest.org/index.php?Lance-Williams%20updating%20formula | |
calcSquare = (n) -> n * n | |
haversine = (lat1, lng1, lat2, lng2) -> | |
6371000 * 2 * Math.asin(Math.sqrt(Math.pow(Math.sin((lat1 - Math.abs(lat2)) * Math.PI / 180 / 2), 2) + Math.cos(Math.abs(lat1) * Math.PI / 180 ) * Math.cos(Math.abs(lat2) * Math.PI / 180) * Math.pow(Math.sin((lng1 - lng2) * Math.PI / 180 / 2), 2) )) | |
## | |
# calculate distance between 2 numbers | |
# @param a number1 | |
# @param b number2 | |
# @return distance (0 <= distance) | |
distanceNum = (a, b) -> | |
calcSquare a - b | |
## | |
# calculate distance between 2 points | |
# @param a = { lat: 12.345, lng: 123.45 } | |
# @param b = { lat: 12.345, lng: 123.45 } | |
# @return distance (0 <= distance) | |
distanceLatlng = (a, b) -> | |
center = { | |
lat: (a.lat + b.lat) / 2 | |
lng: (a.lng + b.lng) / 2 | |
} | |
calcSquare(haversine(a.lat, a.lng, center.lat, center.lng)) + calcSquare(haversine(b.lat, b.lng, center.lat, center.lng)) | |
## | |
# calculate levenshtein distance between 2 strings | |
# @param a string1 | |
# @param b string2 | |
# @return distance (0 <= distance <= 1) | |
levenshtein = (a, b) -> | |
unless a.length && b.length | |
return a.length || b.length | |
m = [0 for j in [0..b.length] for i in [0..a.length]][0] | |
for i in [0..a.length] | |
m[i][0] = i | |
for j in [0..b.length] | |
m[0][j] = j | |
for i in [1..a.length] | |
for j in [1..b.length] | |
if a[i - 1] == b[j - 1] | |
x = 0 | |
else | |
x = 1 | |
m[i][j] = Math.min(m[i - 1][j] + 1, m[i][ j - 1] + 1, m[i - 1][j - 1] + x) | |
d = m[m.length-1][m[0].length-1] | |
d = d / Math.max(a.length, b.length) | |
return d | |
## | |
# Clustering with Ward method | |
# @param dataset dataset to clustering | |
# @param opt = { | |
# maxDst : threshold of distance | |
# minClst: threshold of number of clusters | |
# formula: update formula | |
# debug : get verbose | |
# } | |
# @return clusters = [ [val1, val2, …], [val3, val4, …], … ] | |
doClustering = (dataset, opt) -> | |
opt = opt || {} | |
maxDst = opt.maxDst || Number.MAX_VALUE | |
minClst = opt.minClst || 1 | |
formula = opt.formula || updateMatrixWard | |
debug = opt.debug || false | |
# clusters contain only one item on the first step | |
clsts = ([item] for item in dataset) | |
# create distance matrix | |
matrix = distanceMatrix(dataset, opt) | |
loop | |
# break if cluster length is less than the threshold | |
break if clsts.length <= opt.minClst | |
# detect minimum distance and their indices | |
minDst = Number.MAX_VALUE | |
len = matrix.length | |
for i in [0..(len - 2)] | |
for j in [(i + 1)..(len - 1)] | |
if matrix[i][j] < minDst | |
minDst = matrix[i][j] | |
ia = i | |
ib = j | |
# break if smallest distance is less than the threshold | |
break if minDst > maxDst | |
# update matrix | |
matrix = formula()(matrix, clsts, ia, ib) | |
# concat nearest neighbors | |
concated = clsts[ia].concat clsts[ib] | |
clsts.splice ib, 1 | |
clsts.splice ia, 1, concated | |
clsts | |
## | |
# create distance matrix | |
# @param dataset = […] | |
# @return distance matrix = [[…],[…], …] | |
# | |
distanceMatrix = (dataset, opt) -> | |
opt = opt || {} | |
distance = opt.distance || () -> Number.MAX_VALUE | |
key = opt.key | |
dsts = [] | |
i = 0 | |
while i < dataset.length | |
dsts[i] = [] | |
dsts[i][i] = 0 | |
j = i + 1 | |
while j < dataset.length | |
a = if key then dataset[i][key] else dataset[i] | |
b = if key then dataset[j][key] else dataset[j] | |
dsts[i][j] = distance(a, b) | |
j++ | |
i++ | |
dsts | |
updateMatrixSimpleLink = () -> | |
alphaA = () -> 1/2 | |
alphaB = () -> 1/2 | |
beta = () -> 0 | |
gamma = () -> -1/2 | |
updateMatrix alphaA, alphaB, beta, gamma | |
updateMatrixCompleteLink = () -> | |
alphaA = () -> 1/2 | |
alphaB = () -> 1/2 | |
beta = () -> 0 | |
gamma = () -> 1/2 | |
updateMatrix alphaA, alphaB, beta, gamma | |
updateMatrixGroupAve = () -> | |
alphaA = (n1, n1a, n1b, n2) -> n1a / n1 | |
alphaB = (n1, n1a, n1b, n2) -> n1b / n1 | |
beta = () -> 0 | |
gamma = () -> 0 | |
updateMatrix alphaA, alphaB, beta, gamma | |
updateMatrixWPGMA = () -> | |
alphaA = () -> 1/2 | |
alphaB = () -> 1/2 | |
beta = () -> 0 | |
gamma = () -> 0 | |
updateMatrix alphaA, alphaB, beta, gamma | |
updateMatrixWard = () -> | |
alphaA = (n1, n1a, n1b, n2) -> (n1a + n2) / (n1 + n2) | |
alphaB = (n1, n1a, n1b, n2) -> (n1b + n2) / (n1 + n2) | |
beta = (n1, n1a, n1b, n2) -> -n2 / (n1 + n2) | |
gamma = () -> 0 | |
updateMatrix alphaA, alphaB, beta, gamma | |
updateMatrixCentroid = () -> | |
alphaA = (n1, n1a, n1b, n2) -> n1a / n1 | |
alphaB = (n1, n1a, n1b, n2) -> n1b / n1 | |
beta = (n1, n1a, n1b, n2) -> -n1a*n1b / n1*n1 | |
gamma = () -> 0 | |
updateMatrix alphaA, alphaB, beta, gamma | |
updateMatrixMedian = () -> | |
alphaA = () -> 1/2 | |
alphaB = () -> 1/2 | |
beta = () -> -1/4 | |
gamma = () -> 0 | |
updateMatrix alphaA, alphaB, beta, gamma | |
updateMatrix = (alphaA, alphaB, beta, gamma) -> | |
alphaA = alphaA || () -> 0 | |
alphaB = alphaB || () -> 0 | |
beta = beta || () -> 0 | |
gamma = gamma || () -> 0 | |
(matrix1, clusters, i1a, i1b) -> | |
c1a = clusters[i1a] | |
c1b = clusters[i1b] | |
n1a = c1a.length | |
n1b = c1b.length | |
n1 = n1a + n1b | |
matrix2 = [] | |
getDist = (mtx, i1, i2) -> | |
mtx[Math.min(i1,i2)][Math.max(i1,i2)] | |
d1a1b = matrix1[i1a][i1b] | |
for dists1, i in matrix1 | |
continue if i is i1b | |
if i is i1a | |
dists2 = [] | |
for d, j in dists1 | |
continue if j is i1b | |
n2 = clusters[j].length | |
ala = alphaA n1, n1a, n1b, n2 | |
alb = alphaB n1, n1a, n1b, n2 | |
bt = beta n1, n1a, n1b, n2 | |
gm = gamma n1, n1a, n1b, n2 | |
d1a2 = getDist matrix1, i1a, j | |
d1b2 = getDist matrix1, i1b, j | |
dist = ala*d1a2 + alb*d1b2 + bt*d1a1b + gm*Math.abs(d1a2-d1b2) | |
dists2.push dist | |
matrix2.push dists2 | |
else | |
dists2 = (d for d in dists1) | |
dists2.splice i1b, 1 | |
matrix2.push dists2 | |
matrix2 | |
module.exports = | |
doClustering : doClustering | |
distanceNum : distanceNum | |
distanceLatlng : distanceLatlng | |
levenshtein : levenshtein | |
updateMatrixSimpleLink : updateMatrixSimpleLink | |
updateMatrixCompleteLink: updateMatrixCompleteLink | |
updateMatrixGroupAve : updateMatrixGroupAve | |
updateMatrixWPGMA : updateMatrixWPGMA | |
updateMatrixWard : updateMatrixWard | |
updateMatrixCentroid : updateMatrixCentroid | |
updateMatrixMedian : updateMatrixMedian | |
updateMatrix : updateMatrix |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
fs = require "fs" | |
# compile clustering-lance-williams.coffee to node_modules/clustering/index.js | |
Clustering = require "clustering" | |
NCLST_DEFAULT = 10 | |
DPATH_DEFAULT = '/tmp/dataset.txt' | |
nclst = process.argv[2] || NCLST_DEFAULT | |
dpath = process.argv[3] || DPATH_DEFAULT | |
fs.readFile dpath, 'utf8', (err, data) -> | |
throw err if err | |
lines = data.split "\n" | |
dataset = [ { id:i, val:line } for line, i in lines ][0] | |
dataset = dataset.filter (line) -> line.val | |
# console.log "dataset", dataset.length | |
clusters = Clustering.doClustering dataset, | |
minClst : nclst | |
distance: Clustering.levenshtein | |
formula : Clustering.updateMatrixWard | |
key: "val" | |
console.log "clusters", clusters |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment