kei-sato · November 9, 2015 18:18
diff --git a/clustering-lance-williams.coffee b/clustering-lance-williams.coffee
 # Clustering with Lance-Williams updating formula
 # http://ibisforest.org/index.php?Lance-Williams%20updating%20formula

 calcSquare = (n) -> n * n

 haversine = (lat1, lng1, lat2, lng2) ->
  6371000 * 2 * Math.asin(Math.sqrt(Math.pow(Math.sin((lat1 - Math.abs(lat2)) * Math.PI / 180 / 2), 2) + Math.cos(Math.abs(lat1) * Math.PI / 180 ) * Math.cos(Math.abs(lat2) * Math.PI / 180) * Math.pow(Math.sin((lng1 - lng2) * Math.PI / 180 / 2), 2) ))

 ##
 # calculate distance between 2 numbers
 # @param a number1
 # @param b number2
 # @return distance (0 <= distance)
 distanceNum = (a, b) ->
  calcSquare a - b

 ##
 # calculate distance between 2 points
 # @param a = { lat: 12.345, lng: 123.45 }
 # @param b = { lat: 12.345, lng: 123.45 }
 # @return distance (0 <= distance)
 distanceLatlng = (a, b) ->
  center = {
    lat: (a.lat + b.lat) / 2
    lng: (a.lng + b.lng) / 2
  }

  calcSquare(haversine(a.lat, a.lng, center.lat, center.lng)) + calcSquare(haversine(b.lat, b.lng, center.lat, center.lng))

 ##
 # calculate levenshtein distance between 2 strings
 # @param a string1
 # @param b string2
 # @return distance (0 <= distance <= 1)
 levenshtein = (a, b) ->

  unless a.length && b.length
    return a.length || b.length

  m = [0 for j in [0..b.length] for i in [0..a.length]][0]

  for i in [0..a.length]
      m[i][0] = i

  for j in [0..b.length]
      m[0][j] = j

  for i in [1..a.length]
    for j in [1..b.length]
      if a[i - 1] == b[j - 1]
        x = 0
      else
        x = 1
      m[i][j] = Math.min(m[i - 1][j] + 1, m[i][ j - 1] + 1, m[i - 1][j - 1] + x)

  d = m[m.length-1][m[0].length-1]
  d = d / Math.max(a.length, b.length)
  return d

 ##
 # Clustering with Ward method
 # @param dataset dataset to clustering
 # @param opt = {
 #   maxDst : threshold of distance
 #   minClst: threshold of number of clusters
 #   formula: update formula
 #   debug  : get verbose
 # }
 # @return clusters = [ [val1, val2, …], [val3, val4, …], … ]
 doClustering = (dataset, opt) ->

  opt = opt || {}
  maxDst = opt.maxDst || Number.MAX_VALUE
  minClst = opt.minClst || 1
  formula = opt.formula || updateMatrixWard
  debug = opt.debug || false

  # clusters contain only one item on the first step
  clsts = ([item] for item in dataset)

  # create distance matrix
  matrix = distanceMatrix(dataset, opt)
  loop
    # break if cluster length is less than the threshold
    break if clsts.length <= opt.minClst

    # detect minimum distance and their indices
    minDst = Number.MAX_VALUE
    len = matrix.length
    for i in [0..(len - 2)]
      for j in [(i + 1)..(len - 1)]
        if matrix[i][j] < minDst
          minDst = matrix[i][j]
          ia = i
          ib = j

    # break if smallest distance is less than the threshold
    break if minDst > maxDst
      
    # update matrix
    matrix = formula()(matrix, clsts, ia, ib)

    # concat nearest neighbors
    concated = clsts[ia].concat clsts[ib]
    clsts.splice ib, 1
    clsts.splice ia, 1, concated

  clsts

 ##
 # create distance matrix
 # @param  dataset = […]
 # @return distance matrix = [[…],[…], …]
 #
 distanceMatrix = (dataset, opt) ->
  opt = opt || {}
  distance = opt.distance || () -> Number.MAX_VALUE
  key = opt.key

  dsts = []
  i = 0
  while i < dataset.length
    dsts[i] = []
    dsts[i][i] = 0
    j = i + 1
    while j < dataset.length
      a = if key then dataset[i][key] else dataset[i]
      b = if key then dataset[j][key] else dataset[j]
      dsts[i][j] = distance(a, b)
      j++
    i++
  dsts

 updateMatrixSimpleLink = () ->
  alphaA = () -> 1/2
  alphaB = () -> 1/2
  beta   = () -> 0
  gamma  = () -> -1/2
  updateMatrix alphaA, alphaB, beta, gamma

 updateMatrixCompleteLink = () ->
  alphaA = () -> 1/2
  alphaB = () -> 1/2
  beta   = () -> 0
  gamma  = () -> 1/2
  updateMatrix alphaA, alphaB, beta, gamma

 updateMatrixGroupAve = () ->
  alphaA = (n1, n1a, n1b, n2) -> n1a / n1
  alphaB = (n1, n1a, n1b, n2) -> n1b / n1
  beta   = () -> 0
  gamma  = () -> 0
  updateMatrix alphaA, alphaB, beta, gamma

 updateMatrixWPGMA = () ->
  alphaA = () -> 1/2
  alphaB = () -> 1/2
  beta   = () -> 0
  gamma  = () -> 0
  updateMatrix alphaA, alphaB, beta, gamma

 updateMatrixWard = () ->
  alphaA = (n1, n1a, n1b, n2) -> (n1a + n2) / (n1 + n2)
  alphaB = (n1, n1a, n1b, n2) -> (n1b + n2) / (n1 + n2)
  beta   = (n1, n1a, n1b, n2) -> -n2 / (n1 + n2)
  gamma  = () -> 0
  updateMatrix alphaA, alphaB, beta, gamma

 updateMatrixCentroid = () ->
  alphaA = (n1, n1a, n1b, n2) -> n1a / n1
  alphaB = (n1, n1a, n1b, n2) -> n1b / n1
  beta   = (n1, n1a, n1b, n2) -> -n1a*n1b / n1*n1
  gamma  = () -> 0
  updateMatrix alphaA, alphaB, beta, gamma

 updateMatrixMedian = () ->
  alphaA = () -> 1/2
  alphaB = () -> 1/2
  beta   = () -> -1/4
  gamma  = () -> 0
  updateMatrix alphaA, alphaB, beta, gamma

 updateMatrix = (alphaA, alphaB, beta, gamma) ->
  alphaA = alphaA || () -> 0
  alphaB = alphaB || () -> 0
  beta   = beta   || () -> 0
  gamma  = gamma  || () -> 0

  (matrix1, clusters, i1a, i1b) ->
    c1a = clusters[i1a]
    c1b = clusters[i1b]
    n1a = c1a.length
    n1b = c1b.length
    n1 = n1a + n1b
    matrix2 = []

    getDist = (mtx, i1, i2) ->
      mtx[Math.min(i1,i2)][Math.max(i1,i2)]

    d1a1b = matrix1[i1a][i1b]

    for dists1, i in matrix1
      continue if i is i1b
      if i is i1a
        dists2 = []
        for d, j in dists1
          continue if j is i1b

          n2 = clusters[j].length
          ala = alphaA n1, n1a, n1b, n2
          alb = alphaB n1, n1a, n1b, n2
          bt = beta n1, n1a, n1b, n2
          gm = gamma n1, n1a, n1b, n2
          d1a2 = getDist matrix1, i1a, j
          d1b2 = getDist matrix1, i1b, j
          dist = ala*d1a2 + alb*d1b2 + bt*d1a1b + gm*Math.abs(d1a2-d1b2)
          dists2.push dist
        matrix2.push dists2
      else
        dists2 = (d for d in dists1)
        dists2.splice i1b, 1
        matrix2.push dists2
    matrix2

 module.exports =
  doClustering            : doClustering
  distanceNum             : distanceNum
  distanceLatlng          : distanceLatlng
  levenshtein             : levenshtein
  updateMatrixSimpleLink  : updateMatrixSimpleLink
  updateMatrixCompleteLink: updateMatrixCompleteLink
  updateMatrixGroupAve    : updateMatrixGroupAve
  updateMatrixWPGMA       : updateMatrixWPGMA
  updateMatrixWard        : updateMatrixWard
  updateMatrixCentroid    : updateMatrixCentroid
  updateMatrixMedian      : updateMatrixMedian
  updateMatrix            : updateMatrix
diff --git a/use-clustering.coffee b/use-clustering.coffee
 fs = require "fs"

 # compile clustering-lance-williams.coffee to node_modules/clustering/index.js
 Clustering = require "clustering"

 NCLST_DEFAULT = 10
 DPATH_DEFAULT = '/tmp/dataset.txt'
 nclst = process.argv[2] || NCLST_DEFAULT
 dpath = process.argv[3] || DPATH_DEFAULT

 fs.readFile dpath, 'utf8', (err, data) ->
  throw err if err
  lines = data.split "\n"
  dataset = [ { id:i, val:line } for line, i in lines ][0]
  dataset = dataset.filter (line) -> line.val
  # console.log "dataset", dataset.length

  clusters = Clustering.doClustering dataset,
    minClst : nclst
    distance: Clustering.levenshtein
    formula : Clustering.updateMatrixWard
    key: "val"

  console.log "clusters", clusters
	# Clustering with Lance-Williams updating formula
	# http://ibisforest.org/index.php?Lance-Williams%20updating%20formula

	calcSquare = (n) -> n * n

	haversine = (lat1, lng1, lat2, lng2) ->
	6371000 * 2 * Math.asin(Math.sqrt(Math.pow(Math.sin((lat1 - Math.abs(lat2)) * Math.PI / 180 / 2), 2) + Math.cos(Math.abs(lat1) * Math.PI / 180 ) * Math.cos(Math.abs(lat2) * Math.PI / 180) * Math.pow(Math.sin((lng1 - lng2) * Math.PI / 180 / 2), 2) ))

	##
	# calculate distance between 2 numbers
	# @param a number1
	# @param b number2
	# @return distance (0 <= distance)
	distanceNum = (a, b) ->
	calcSquare a - b

	##
	# calculate distance between 2 points
	# @param a = { lat: 12.345, lng: 123.45 }
	# @param b = { lat: 12.345, lng: 123.45 }
	# @return distance (0 <= distance)
	distanceLatlng = (a, b) ->
	center = {
	lat: (a.lat + b.lat) / 2
	lng: (a.lng + b.lng) / 2
	}

	calcSquare(haversine(a.lat, a.lng, center.lat, center.lng)) + calcSquare(haversine(b.lat, b.lng, center.lat, center.lng))

	##
	# calculate levenshtein distance between 2 strings
	# @param a string1
	# @param b string2
	# @return distance (0 <= distance <= 1)
	levenshtein = (a, b) ->

	unless a.length && b.length
	return a.length \|\| b.length

	m = [0 for j in [0..b.length] for i in [0..a.length]][0]

	for i in [0..a.length]
	m[i][0] = i

	for j in [0..b.length]
	m[0][j] = j

	for i in [1..a.length]
	for j in [1..b.length]
	if a[i - 1] == b[j - 1]
	x = 0
	else
	x = 1
	m[i][j] = Math.min(m[i - 1][j] + 1, m[i][ j - 1] + 1, m[i - 1][j - 1] + x)

	d = m[m.length-1][m[0].length-1]
	d = d / Math.max(a.length, b.length)
	return d

	##
	# Clustering with Ward method
	# @param dataset dataset to clustering
	# @param opt = {
	# maxDst : threshold of distance
	# minClst: threshold of number of clusters
	# formula: update formula
	# debug : get verbose
	# }
	# @return clusters = [ [val1, val2, …], [val3, val4, …], … ]
	doClustering = (dataset, opt) ->

	opt = opt \|\| {}
	maxDst = opt.maxDst \|\| Number.MAX_VALUE
	minClst = opt.minClst \|\| 1
	formula = opt.formula \|\| updateMatrixWard
	debug = opt.debug \|\| false

	# clusters contain only one item on the first step
	clsts = ([item] for item in dataset)

	# create distance matrix
	matrix = distanceMatrix(dataset, opt)
	loop
	# break if cluster length is less than the threshold
	break if clsts.length <= opt.minClst

	# detect minimum distance and their indices
	minDst = Number.MAX_VALUE
	len = matrix.length
	for i in [0..(len - 2)]
	for j in [(i + 1)..(len - 1)]
	if matrix[i][j] < minDst
	minDst = matrix[i][j]
	ia = i
	ib = j

	# break if smallest distance is less than the threshold
	break if minDst > maxDst

	# update matrix
	matrix = formula()(matrix, clsts, ia, ib)

	# concat nearest neighbors
	concated = clsts[ia].concat clsts[ib]
	clsts.splice ib, 1
	clsts.splice ia, 1, concated

	clsts

	##
	# create distance matrix
	# @param dataset = […]
	# @return distance matrix = [[…],[…], …]
	#
	distanceMatrix = (dataset, opt) ->
	opt = opt \|\| {}
	distance = opt.distance \|\| () -> Number.MAX_VALUE
	key = opt.key

	dsts = []
	i = 0
	while i < dataset.length
	dsts[i] = []
	dsts[i][i] = 0
	j = i + 1
	while j < dataset.length
	a = if key then dataset[i][key] else dataset[i]
	b = if key then dataset[j][key] else dataset[j]
	dsts[i][j] = distance(a, b)
	j++
	i++
	dsts

	updateMatrixSimpleLink = () ->
	alphaA = () -> 1/2
	alphaB = () -> 1/2
	beta = () -> 0
	gamma = () -> -1/2
	updateMatrix alphaA, alphaB, beta, gamma

	updateMatrixCompleteLink = () ->
	alphaA = () -> 1/2
	alphaB = () -> 1/2
	beta = () -> 0
	gamma = () -> 1/2
	updateMatrix alphaA, alphaB, beta, gamma

	updateMatrixGroupAve = () ->
	alphaA = (n1, n1a, n1b, n2) -> n1a / n1
	alphaB = (n1, n1a, n1b, n2) -> n1b / n1
	beta = () -> 0
	gamma = () -> 0
	updateMatrix alphaA, alphaB, beta, gamma

	updateMatrixWPGMA = () ->
	alphaA = () -> 1/2
	alphaB = () -> 1/2
	beta = () -> 0
	gamma = () -> 0
	updateMatrix alphaA, alphaB, beta, gamma

	updateMatrixWard = () ->
	alphaA = (n1, n1a, n1b, n2) -> (n1a + n2) / (n1 + n2)
	alphaB = (n1, n1a, n1b, n2) -> (n1b + n2) / (n1 + n2)
	beta = (n1, n1a, n1b, n2) -> -n2 / (n1 + n2)
	gamma = () -> 0
	updateMatrix alphaA, alphaB, beta, gamma

	updateMatrixCentroid = () ->
	alphaA = (n1, n1a, n1b, n2) -> n1a / n1
	alphaB = (n1, n1a, n1b, n2) -> n1b / n1
	beta = (n1, n1a, n1b, n2) -> -n1an1b / n1n1
	gamma = () -> 0
	updateMatrix alphaA, alphaB, beta, gamma

	updateMatrixMedian = () ->
	alphaA = () -> 1/2
	alphaB = () -> 1/2
	beta = () -> -1/4
	gamma = () -> 0
	updateMatrix alphaA, alphaB, beta, gamma

	updateMatrix = (alphaA, alphaB, beta, gamma) ->
	alphaA = alphaA \|\| () -> 0
	alphaB = alphaB \|\| () -> 0
	beta = beta \|\| () -> 0
	gamma = gamma \|\| () -> 0

	(matrix1, clusters, i1a, i1b) ->
	c1a = clusters[i1a]
	c1b = clusters[i1b]
	n1a = c1a.length
	n1b = c1b.length
	n1 = n1a + n1b
	matrix2 = []

	getDist = (mtx, i1, i2) ->
	mtx[Math.min(i1,i2)][Math.max(i1,i2)]

	d1a1b = matrix1[i1a][i1b]

	for dists1, i in matrix1
	continue if i is i1b
	if i is i1a
	dists2 = []
	for d, j in dists1
	continue if j is i1b

	n2 = clusters[j].length
	ala = alphaA n1, n1a, n1b, n2
	alb = alphaB n1, n1a, n1b, n2
	bt = beta n1, n1a, n1b, n2
	gm = gamma n1, n1a, n1b, n2
	d1a2 = getDist matrix1, i1a, j
	d1b2 = getDist matrix1, i1b, j
	dist = alad1a2 + albd1b2 + btd1a1b + gmMath.abs(d1a2-d1b2)
	dists2.push dist
	matrix2.push dists2
	else
	dists2 = (d for d in dists1)
	dists2.splice i1b, 1
	matrix2.push dists2
	matrix2

	module.exports =
	doClustering : doClustering
	distanceNum : distanceNum
	distanceLatlng : distanceLatlng
	levenshtein : levenshtein
	updateMatrixSimpleLink : updateMatrixSimpleLink
	updateMatrixCompleteLink: updateMatrixCompleteLink
	updateMatrixGroupAve : updateMatrixGroupAve
	updateMatrixWPGMA : updateMatrixWPGMA
	updateMatrixWard : updateMatrixWard
	updateMatrixCentroid : updateMatrixCentroid
	updateMatrixMedian : updateMatrixMedian
	updateMatrix : updateMatrix
	fs = require "fs"

	# compile clustering-lance-williams.coffee to node_modules/clustering/index.js
	Clustering = require "clustering"

	NCLST_DEFAULT = 10
	DPATH_DEFAULT = '/tmp/dataset.txt'
	nclst = process.argv[2] \|\| NCLST_DEFAULT
	dpath = process.argv[3] \|\| DPATH_DEFAULT

	fs.readFile dpath, 'utf8', (err, data) ->
	throw err if err
	lines = data.split "\n"
	dataset = [ { id:i, val:line } for line, i in lines ][0]
	dataset = dataset.filter (line) -> line.val
	# console.log "dataset", dataset.length

	clusters = Clustering.doClustering dataset,
	minClst : nclst
	distance: Clustering.levenshtein
	formula : Clustering.updateMatrixWard
	key: "val"

	console.log "clusters", clusters