Skip to content

Instantly share code, notes, and snippets.

@kei-sato
Last active November 9, 2015 18:18
Show Gist options
  • Save kei-sato/8b8f644e94e98fcddba9 to your computer and use it in GitHub Desktop.
Save kei-sato/8b8f644e94e98fcddba9 to your computer and use it in GitHub Desktop.
Clustering with Lance-Williams updating formula
# Clustering with Lance-Williams updating formula
# http://ibisforest.org/index.php?Lance-Williams%20updating%20formula
calcSquare = (n) -> n * n
haversine = (lat1, lng1, lat2, lng2) ->
6371000 * 2 * Math.asin(Math.sqrt(Math.pow(Math.sin((lat1 - Math.abs(lat2)) * Math.PI / 180 / 2), 2) + Math.cos(Math.abs(lat1) * Math.PI / 180 ) * Math.cos(Math.abs(lat2) * Math.PI / 180) * Math.pow(Math.sin((lng1 - lng2) * Math.PI / 180 / 2), 2) ))
##
# calculate distance between 2 numbers
# @param a number1
# @param b number2
# @return distance (0 <= distance)
distanceNum = (a, b) ->
calcSquare a - b
##
# calculate distance between 2 points
# @param a = { lat: 12.345, lng: 123.45 }
# @param b = { lat: 12.345, lng: 123.45 }
# @return distance (0 <= distance)
distanceLatlng = (a, b) ->
center = {
lat: (a.lat + b.lat) / 2
lng: (a.lng + b.lng) / 2
}
calcSquare(haversine(a.lat, a.lng, center.lat, center.lng)) + calcSquare(haversine(b.lat, b.lng, center.lat, center.lng))
##
# calculate levenshtein distance between 2 strings
# @param a string1
# @param b string2
# @return distance (0 <= distance <= 1)
levenshtein = (a, b) ->
unless a.length && b.length
return a.length || b.length
m = [0 for j in [0..b.length] for i in [0..a.length]][0]
for i in [0..a.length]
m[i][0] = i
for j in [0..b.length]
m[0][j] = j
for i in [1..a.length]
for j in [1..b.length]
if a[i - 1] == b[j - 1]
x = 0
else
x = 1
m[i][j] = Math.min(m[i - 1][j] + 1, m[i][ j - 1] + 1, m[i - 1][j - 1] + x)
d = m[m.length-1][m[0].length-1]
d = d / Math.max(a.length, b.length)
return d
##
# Clustering with Ward method
# @param dataset dataset to clustering
# @param opt = {
# maxDst : threshold of distance
# minClst: threshold of number of clusters
# formula: update formula
# debug : get verbose
# }
# @return clusters = [ [val1, val2, …], [val3, val4, …], … ]
doClustering = (dataset, opt) ->
opt = opt || {}
maxDst = opt.maxDst || Number.MAX_VALUE
minClst = opt.minClst || 1
formula = opt.formula || updateMatrixWard
debug = opt.debug || false
# clusters contain only one item on the first step
clsts = ([item] for item in dataset)
# create distance matrix
matrix = distanceMatrix(dataset, opt)
loop
# break if cluster length is less than the threshold
break if clsts.length <= opt.minClst
# detect minimum distance and their indices
minDst = Number.MAX_VALUE
len = matrix.length
for i in [0..(len - 2)]
for j in [(i + 1)..(len - 1)]
if matrix[i][j] < minDst
minDst = matrix[i][j]
ia = i
ib = j
# break if smallest distance is less than the threshold
break if minDst > maxDst
# update matrix
matrix = formula()(matrix, clsts, ia, ib)
# concat nearest neighbors
concated = clsts[ia].concat clsts[ib]
clsts.splice ib, 1
clsts.splice ia, 1, concated
clsts
##
# create distance matrix
# @param dataset = […]
# @return distance matrix = [[…],[…], …]
#
distanceMatrix = (dataset, opt) ->
opt = opt || {}
distance = opt.distance || () -> Number.MAX_VALUE
key = opt.key
dsts = []
i = 0
while i < dataset.length
dsts[i] = []
dsts[i][i] = 0
j = i + 1
while j < dataset.length
a = if key then dataset[i][key] else dataset[i]
b = if key then dataset[j][key] else dataset[j]
dsts[i][j] = distance(a, b)
j++
i++
dsts
updateMatrixSimpleLink = () ->
alphaA = () -> 1/2
alphaB = () -> 1/2
beta = () -> 0
gamma = () -> -1/2
updateMatrix alphaA, alphaB, beta, gamma
updateMatrixCompleteLink = () ->
alphaA = () -> 1/2
alphaB = () -> 1/2
beta = () -> 0
gamma = () -> 1/2
updateMatrix alphaA, alphaB, beta, gamma
updateMatrixGroupAve = () ->
alphaA = (n1, n1a, n1b, n2) -> n1a / n1
alphaB = (n1, n1a, n1b, n2) -> n1b / n1
beta = () -> 0
gamma = () -> 0
updateMatrix alphaA, alphaB, beta, gamma
updateMatrixWPGMA = () ->
alphaA = () -> 1/2
alphaB = () -> 1/2
beta = () -> 0
gamma = () -> 0
updateMatrix alphaA, alphaB, beta, gamma
updateMatrixWard = () ->
alphaA = (n1, n1a, n1b, n2) -> (n1a + n2) / (n1 + n2)
alphaB = (n1, n1a, n1b, n2) -> (n1b + n2) / (n1 + n2)
beta = (n1, n1a, n1b, n2) -> -n2 / (n1 + n2)
gamma = () -> 0
updateMatrix alphaA, alphaB, beta, gamma
updateMatrixCentroid = () ->
alphaA = (n1, n1a, n1b, n2) -> n1a / n1
alphaB = (n1, n1a, n1b, n2) -> n1b / n1
beta = (n1, n1a, n1b, n2) -> -n1a*n1b / n1*n1
gamma = () -> 0
updateMatrix alphaA, alphaB, beta, gamma
updateMatrixMedian = () ->
alphaA = () -> 1/2
alphaB = () -> 1/2
beta = () -> -1/4
gamma = () -> 0
updateMatrix alphaA, alphaB, beta, gamma
updateMatrix = (alphaA, alphaB, beta, gamma) ->
alphaA = alphaA || () -> 0
alphaB = alphaB || () -> 0
beta = beta || () -> 0
gamma = gamma || () -> 0
(matrix1, clusters, i1a, i1b) ->
c1a = clusters[i1a]
c1b = clusters[i1b]
n1a = c1a.length
n1b = c1b.length
n1 = n1a + n1b
matrix2 = []
getDist = (mtx, i1, i2) ->
mtx[Math.min(i1,i2)][Math.max(i1,i2)]
d1a1b = matrix1[i1a][i1b]
for dists1, i in matrix1
continue if i is i1b
if i is i1a
dists2 = []
for d, j in dists1
continue if j is i1b
n2 = clusters[j].length
ala = alphaA n1, n1a, n1b, n2
alb = alphaB n1, n1a, n1b, n2
bt = beta n1, n1a, n1b, n2
gm = gamma n1, n1a, n1b, n2
d1a2 = getDist matrix1, i1a, j
d1b2 = getDist matrix1, i1b, j
dist = ala*d1a2 + alb*d1b2 + bt*d1a1b + gm*Math.abs(d1a2-d1b2)
dists2.push dist
matrix2.push dists2
else
dists2 = (d for d in dists1)
dists2.splice i1b, 1
matrix2.push dists2
matrix2
module.exports =
doClustering : doClustering
distanceNum : distanceNum
distanceLatlng : distanceLatlng
levenshtein : levenshtein
updateMatrixSimpleLink : updateMatrixSimpleLink
updateMatrixCompleteLink: updateMatrixCompleteLink
updateMatrixGroupAve : updateMatrixGroupAve
updateMatrixWPGMA : updateMatrixWPGMA
updateMatrixWard : updateMatrixWard
updateMatrixCentroid : updateMatrixCentroid
updateMatrixMedian : updateMatrixMedian
updateMatrix : updateMatrix
fs = require "fs"
# compile clustering-lance-williams.coffee to node_modules/clustering/index.js
Clustering = require "clustering"
NCLST_DEFAULT = 10
DPATH_DEFAULT = '/tmp/dataset.txt'
nclst = process.argv[2] || NCLST_DEFAULT
dpath = process.argv[3] || DPATH_DEFAULT
fs.readFile dpath, 'utf8', (err, data) ->
throw err if err
lines = data.split "\n"
dataset = [ { id:i, val:line } for line, i in lines ][0]
dataset = dataset.filter (line) -> line.val
# console.log "dataset", dataset.length
clusters = Clustering.doClustering dataset,
minClst : nclst
distance: Clustering.levenshtein
formula : Clustering.updateMatrixWard
key: "val"
console.log "clusters", clusters
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment