Skip to content

Instantly share code, notes, and snippets.

@erikerlandson
Created August 21, 2014 22:51
Show Gist options
  • Save erikerlandson/f4b9b9a5c9469f2d9006 to your computer and use it in GitHub Desktop.
Save erikerlandson/f4b9b9a5c9469f2d9006 to your computer and use it in GitHub Desktop.
Suggestion for simple distance metric (and measure) design
import breeze.linalg.{Vector => BV, DenseVector => DBV}
import org.apache.spark.annotation.Experimental
import org.apache.spark.mllib.linalg.{Vector, DenseVector}
/**
 * A serializable function of two vectors that yields a `Double` distance value.
 *
 * Concrete measures supply the Breeze-vector `apply`; the Spark-vector overload
 * is provided here in terms of it.
 */
trait DistanceMeasure extends Function2[BV[Double], BV[Double], Double] with Serializable {

  /**
   * The distance between two Breeze vectors; each measure/metric defines this for itself.
   *
   * @param v1 first vector
   * @param v2 second vector
   * @return the distance value assigned to the pair
   */
  override def apply(v1: BV[Double], v2: BV[Double]): Double

  /**
   * Catch-all overload of `()` for Spark vectors: converts both arguments with
   * `toBreeze` and delegates to the Breeze-vector `apply`. May be overridden on a
   * per-class basis if a direct implementation is advantageous.
   *
   * NOTE(review): assumes `toBreeze` is accessible from the package this file
   * lives in -- confirm.
   *
   * @param v1 first Spark vector
   * @param v2 second Spark vector
   * @return the same value the Breeze-vector `apply` gives for the converted inputs
   */
  def apply(v1: Vector, v2: Vector): Double = {
    val left = v1.toBreeze
    val right = v2.toBreeze
    apply(left, right)
  }
}
/**
 * Marker sub-trait of [[DistanceMeasure]]; it adds no members of its own.
 *
 * Presumably intended for measures that are true metrics -- nothing in the trait
 * itself enforces that, so it is the implementer's responsibility.
 */
trait DistanceMetric extends DistanceMeasure
/** Euclidean distance: the square root of the difference vector dotted with itself. */
class EuclideanDistance extends DistanceMetric {

  /**
   * @param v1 first vector
   * @param v2 second vector
   * @return `sqrt((v1 - v2) dot (v1 - v2))`
   */
  override def apply(v1: BV[Double], v2: BV[Double]): Double = {
    val diff = v1 - v2
    val squaredNorm = diff dot diff
    math.sqrt(squaredNorm)
  }
}
/**
 * Euclidean distance in which the difference vector is multiplied element-wise by
 * `weights` before being dotted with itself.
 *
 * @param weights per-component weights. A negative weight can make the value under
 *                the square root negative, in which case the result is NaN.
 */
class WeightedEuclideanDistance(val weights: BV[Double]) extends DistanceMetric {

  /**
   * @param v1 first vector
   * @param v2 second vector
   * @return `sqrt(d dot (weights :* d))` where `d = v1 - v2`
   */
  override def apply(v1: BV[Double], v2: BV[Double]): Double = {
    val diff = v1 - v2
    val weightedDiff = weights :* diff
    math.sqrt(diff dot weightedDiff)
  }
}
/**
 * Cosine distance: one minus the cosine similarity of the two vectors.
 *
 * This is a measure, not a metric, so it extends [[DistanceMeasure]] directly.
 */
class CosineDistance extends DistanceMeasure {

  /**
   * @param v1 first vector
   * @param v2 second vector
   * @return `1 - (v1 dot v2) / sqrt((v1 dot v1) * (v2 dot v2))`, with the similarity
   *         term clamped to `[-1, 1]` so the result lies in `[0, 2]`. When the
   *         denominator is zero (e.g. an all-zero vector) the result is NaN.
   */
  override def apply(v1: BV[Double], v2: BV[Double]): Double = {
    val similarity = (v1 dot v2) / Math.sqrt((v1 dot v1) * (v2 dot v2))
    // Floating-point rounding can push the ratio marginally outside [-1, 1] for
    // (anti)parallel vectors, which would yield a slightly negative distance or
    // one above 2. math.min/math.max propagate NaN, so the zero-norm result is
    // left as NaN rather than being silently turned into a number.
    val clamped = math.max(-1.0, math.min(1.0, similarity))
    1.0 - clamped
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment