Created
March 9, 2021 22:11
-
-
Save mgadzhi/cd513f7a9ad3feb45240f8732320e06f to your computer and use it in GitHub Desktop.
First version of DBSCAN in Kotlin
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import kotlin.math.* | |
// First, we need to define, what a 'distance' is | |
interface Distance<T> { | |
fun T.distance(to: T): Double | |
} | |
// Now let's define few concrete data classes that will be used to store the data | |
data class Location(val lat: Double, val lon: Double) | |
data class Point(val x: Double, val y: Double) | |
// It is possible to define different distance functions for different data classes | |
// It is also possible to define different distance functions for the same data class | |
// For example, we could define full blown 'geodesic' distance for locations | |
// I tried to reproduce something like 'Typeclass' pattern which I would use in Scala/Haskell | |
val euclideanDistance = object : Distance<Point> { | |
override fun Point.distance(to: Point): Double { | |
return sqrt((x - to.x).pow(2) + (y - to.y).pow(2)) | |
} | |
} | |
val haversineDistance = object : Distance<Location> { | |
override fun Location.distance(to: Location): Double { | |
val R = 6371000.0 | |
val lat1 = Math.toRadians(lat) | |
val lat2 = Math.toRadians(to.lat) | |
val lon1 = Math.toRadians(lon) | |
val lon2 = Math.toRadians(to.lon) | |
return 2 * R * asin(sqrt(((lat1 - lat2) / 2).pow(2) + cos(lat1) * cos(lat2) * sin((lon1 - lon2) / 2).pow(2))) | |
} | |
} | |
// We will only require a 'cluster' to have an id for now | |
// And let's say we use integers as identifiers | |
data class Cluster(val id: Int) | |
// Now we can try to define what types can be 'clustered' | |
// 'We know how to compute a distance between 2 objects of a type' => 'We can cluster a list of such objects' | |
interface ClusteringAlgorithm<T, D> { | |
fun<D: Distance<T>> fit_transform(xs: List<T>, dist: D): List<Cluster> | |
} | |
// I first implemented a minimalistic version of a Matrix to hold pairwise distances, | |
// but I didn't really use everything from here after all. | |
// On a bright side, it's easy to add a method to ClusteringAlgorithm that would get | |
// an arbitrary pairwise distance matrix and skip its own distance calculations | |
// as it's done in sklearn | |
interface DummyMatrix<T> { | |
fun get(i: Int, j: Int): T | |
fun bitMask(f: (item: T) -> Boolean): DummyMatrix<Boolean> | |
} | |
open class DummySquareMatrix<T>(val entries: List<T>, val dim: Int): DummyMatrix<T> { | |
override fun get(i: Int, j: Int): T { | |
return entries[i * dim + j] | |
} | |
override fun bitMask(f: (item: T) -> Boolean): DummyMatrix<Boolean> { | |
return DummySquareMatrix(entries.map(f), dim) | |
} | |
} | |
class PairwiseDistanceMatrix private constructor(entries: List<Double>, dim: Int): DummySquareMatrix<Double>(entries, dim) { | |
companion object { | |
operator fun <T, D: Distance<T>> invoke(locs: List<T>, dist: D): PairwiseDistanceMatrix { | |
val n = locs.size | |
val distances = mutableListOf<Double>() | |
for (i in 0 until n) { | |
for (j in 0 until n) { | |
distances.add(dist.run { locs[i].distance(locs[j]) }) | |
} | |
} | |
return PairwiseDistanceMatrix(distances, n) | |
} | |
} | |
} | |
// Simplified implementation of DBSCAN | |
// It doesn't take into account edge points | |
class DBSCAN(val eps: Double, val minPts: Int): ClusteringAlgorithm<Location, Distance<Location>> { | |
override fun <D : Distance<Location>> fit_transform(xs: List<Location>, dist: D): List<Cluster> { | |
val result = MutableList(xs.size) {Cluster(-1)} | |
var c = 0 | |
val distances = PairwiseDistanceMatrix(xs, dist) | |
var neighbors: MutableList<Int> | |
for (i in xs.indices) { | |
if (result[i].id != -1) { | |
continue | |
} | |
neighbors = mutableListOf() | |
for (j in xs.indices) { | |
if (distances.get(i, j) < eps) { | |
neighbors.add(j) | |
} | |
} | |
if (neighbors.size < minPts) { | |
continue | |
} | |
c += 1 | |
for (j in neighbors) { | |
if(result[j].id == -1) { | |
result[j] = Cluster(c) | |
} | |
} | |
} | |
return result | |
} | |
} | |
fun main() { | |
val a = Point(1.0, 1.0) | |
val b = Point(0.0, 0.0) | |
val d = euclideanDistance.run { a.distance(b) } | |
println(d) | |
val loc1 = Location(-34.83333, -58.5166646) | |
val loc2 = Location(49.0083899664, 2.53844117956) | |
val d2 = haversineDistance.run{ loc1.distance(loc2) } / 1000 | |
val d3 = haversineDistance.run{ loc2.distance(loc1) } / 1000 | |
val d4 = haversineDistance.run{ loc1.distance(loc1) } / 1000 | |
println(d2) | |
println(d3) | |
println(d4) | |
val dataset = listOf<Location>( | |
// Groenplaats | |
Location(51.219227, 4.401766), | |
Location(51.218729, 4.401970), | |
Location(51.219162, 4.401155), | |
// Sentiance | |
Location(51.196732, 4.407988), | |
Location(51.196743, 4.408003), | |
Location(51.196677, 4.408226), | |
// Stadspark | |
Location(51.211933, 4.413849), | |
Location(51.212721, 4.414198), | |
Location(51.212113, 4.414257), | |
) | |
val dbscan = DBSCAN(150.0, 2) | |
val dbscanClusters = dbscan.fit_transform(dataset, haversineDistance) | |
println(dbscanClusters.map { it.id }) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment