Created
March 9, 2021 22:26
-
-
Save mgadzhi/b2e8626556ae3b91f9bca1e37268e0ee to your computer and use it in GitHub Desktop.
Simplified location clustering in Kotlin
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import kotlin.math.* | |
// First, we need to define, what a 'distance' is | |
interface Distance<T> { | |
fun T.distance(to: T): Double | |
} | |
// Now let's define few concrete data classes that will be used to store the data | |
data class Location(val lat: Double, val lon: Double) | |
data class Point(val x: Double, val y: Double) | |
// It is possible to define different distance functions for different data classes | |
// It is also possible to define different distance functions for the same data class | |
// For example, we could define full blown 'geodesic' distance for locations | |
// I tried to reproduce something like 'Typeclass' pattern which I would use in Scala/Haskell | |
val euclideanDistance = object : Distance<Point> { | |
override fun Point.distance(to: Point): Double { | |
return sqrt((x - to.x).pow(2) + (y - to.y).pow(2)) | |
} | |
} | |
val haversineDistance = object : Distance<Location> { | |
override fun Location.distance(to: Location): Double { | |
val R = 6371000.0 | |
val lat1 = Math.toRadians(lat) | |
val lat2 = Math.toRadians(to.lat) | |
val lon1 = Math.toRadians(lon) | |
val lon2 = Math.toRadians(to.lon) | |
return 2 * R * asin(sqrt(((lat1 - lat2) / 2).pow(2) + cos(lat1) * cos(lat2) * sin((lon1 - lon2) / 2).pow(2))) | |
} | |
} | |
// We will only require a 'cluster' to have an id for now | |
// And let's say we use integers as identifiers | |
//data class Cluster(val id: Int) | |
sealed class Cluster | |
object Unknown: Cluster() | |
object Outsider: Cluster() | |
data class Identified(val id: Int): Cluster() | |
// Now we can try to define what types can be 'clustered' | |
// 'We know how to compute a distance between 2 objects of a type' => 'We can cluster a list of such objects' | |
interface ClusteringAlgorithm<T, D> { | |
fun<D: Distance<T>> fit_transform(xs: List<T>, dist: D): List<Cluster> | |
} | |
// I first implemented a minimalistic version of a Matrix to hold pairwise distances, | |
// but I didn't really use everything from here after all. | |
// On a bright side, it's easy to add a method to ClusteringAlgorithm that would get | |
// an arbitrary pairwise distance matrix and skip its own distance calculations | |
// as it's done in sklearn | |
interface DummyMatrix<T> { | |
fun get(i: Int, j: Int): T | |
fun bitMask(f: (item: T) -> Boolean): DummyMatrix<Boolean> | |
} | |
open class DummySquareMatrix<T>(val entries: List<T>, val dim: Int): DummyMatrix<T> { | |
override fun get(i: Int, j: Int): T { | |
return entries[i * dim + j] | |
} | |
override fun bitMask(f: (item: T) -> Boolean): DummyMatrix<Boolean> { | |
return DummySquareMatrix(entries.map(f), dim) | |
} | |
} | |
class PairwiseDistanceMatrix private constructor(entries: List<Double>, dim: Int): DummySquareMatrix<Double>(entries, dim) { | |
companion object { | |
operator fun <T, D: Distance<T>> invoke(locs: List<T>, dist: D): PairwiseDistanceMatrix { | |
val n = locs.size | |
val distances = mutableListOf<Double>() | |
for (i in 0 until n) { | |
for (j in 0 until n) { | |
distances.add(dist.run { locs[i].distance(locs[j]) }) | |
} | |
} | |
return PairwiseDistanceMatrix(distances, n) | |
} | |
} | |
} | |
// Simplified implementation of DBSCAN | |
// It doesn't take into account edge points | |
class DBSCAN(val eps: Double, val minPts: Int): ClusteringAlgorithm<Location, Distance<Location>> { | |
override fun <D : Distance<Location>> fit_transform(xs: List<Location>, dist: D): List<Cluster> { | |
val result = MutableList<Cluster>(xs.size) {Unknown} | |
var c = 0 | |
val distances = PairwiseDistanceMatrix(xs, dist) | |
var neighbors: MutableList<Int> | |
for (i in xs.indices) { | |
if (result[i] !is Unknown) { | |
continue | |
} | |
neighbors = mutableListOf() | |
for (j in xs.indices) { | |
if (distances.get(i, j) < eps) { | |
neighbors.add(j) | |
} | |
} | |
if (neighbors.size < minPts) { | |
result[i] = Outsider | |
continue | |
} | |
c += 1 | |
for (j in neighbors) { | |
if (result[j] is Outsider) { | |
result[j] = Identified(c) | |
} | |
if(result[j] !is Unknown) { | |
continue | |
} | |
result[j] = Identified(c) | |
} | |
} | |
return result | |
} | |
} | |
fun main() { | |
val a = Point(1.0, 1.0) | |
val b = Point(0.0, 0.0) | |
val d = euclideanDistance.run { a.distance(b) } | |
println(d) // -> 1.4142135623730951 | |
val loc1 = Location(-34.83333, -58.5166646) | |
val loc2 = Location(49.0083899664, 2.53844117956) | |
val d2 = haversineDistance.run{ loc1.distance(loc2) } / 1000 | |
val d3 = haversineDistance.run{ loc2.distance(loc1) } / 1000 | |
val d4 = haversineDistance.run{ loc1.distance(loc1) } / 1000 | |
println(d2) // -> 12275.31211886113 | |
println(d3) // -> 12275.31211886113 | |
println(d4) // -> 0.0 | |
// Thus, d(a, b) = d(b, a) and d(a, a) = 0. At least that seems correct. | |
val dataset = listOf<Location>( | |
// Groenplaats | |
Location(51.219227, 4.401766), | |
Location(51.218729, 4.401970), | |
Location(51.219162, 4.401155), | |
// Sentiance | |
Location(51.196732, 4.407988), | |
Location(51.196743, 4.408003), | |
Location(51.196677, 4.408226), | |
// Stadspark | |
Location(51.211933, 4.413849), | |
Location(51.212721, 4.414198), | |
Location(51.212113, 4.414257), | |
// Noise | |
Location(0.0, 0.0) | |
) | |
val dbscan = DBSCAN(150.0, 2) | |
val dbscanClusters = dbscan.fit_transform(dataset, haversineDistance) | |
println(dbscanClusters) | |
/* -> [Identified(id=1), Identified(id=1), Identified(id=1), | |
Identified(id=2), Identified(id=2), Identified(id=2), | |
Identified(id=3), Identified(id=3), Identified(id=3), | |
Outsider@15975490] | |
Three toy clusters are identified and the noisy outsider is also detected. | |
*/ | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment