Skip to content

Instantly share code, notes, and snippets.

@gdbassett
Last active February 27, 2016 16:45
Show Gist options
  • Save gdbassett/65b9e320d18983141149 to your computer and use it in GitHub Desktop.
Save gdbassett/65b9e320d18983141149 to your computer and use it in GitHub Desktop.
A quick function to produce a kmeans like calculation, but using a line in place of the point centroid. Used to try and classify multiple linear relationships in a dataset.
#' @param df Dataframe with x and y columns. (Hopefully in the future this can be x)
#' @param nlines The number of clusters.
#' @param ab a dataframe with a 'slopes' and 'intercepts' column and one row per initial line. Dimensions must match nlines.
#' @param maxiter The maximum number of iterations to do
#' @export
#' @examples
linearKMeans <- function(df, ab=NULL, nlines=0, maxiter=1000) {
# default number of lines
nlines_default <- 5
require(sp)
require(rgeos)
require(stats)
# get initial lines
if (is.null(ab)) {
if (nlines == 0) {
nlines <- nlines_default
}
# pick default lines if none provided
slopes <- df$y/df$x
ab <- data.frame(intercepts=rep(0, nlines), slopes=(max(df$y)/(nlines-1):0) / (max(df$x)/0:(nlines-1)))
ab[1, 'slopes'] <- min(slopes)
ab[nlines, 'slopes'] <- max(slopes)
} else {
if ((nlines != 0) & (nlines != nrow(ab))) {
warning("nlines and ab both present. Number of rows in ab will be used and nlines ignored.")
}
nlines <- nrow(ab)
}
# fit the intercept/slope to ablines
max_x <- max(df$x, na.rm=TRUE)
max_y <- max(df$y, na.rm=TRUE)
old_line <- rep(NA, nrow(df))
j <- 0
while(!identical(old_line, df$line) & j < maxiter) {
j <- j + 1
old_line <- m
lines1 <- apply(ab, MARGIN=1, function(n) {
l <- data.frame(x=c(0,0), y=c(n['intercepts'],0))
if (n['slopes'] * max_x + n['intercepts'] > max_y) {
l[1, "x"] <- (max_y - n['intercepts'])/n['slopes']
l[1, "y"] <- max_y
} else {
l[1, "x"] <- max_x
l[1, "y"] <- max_x * n['slopes'] + n['intercepts']
}
list(Line(l))
})
lines2 <- list()
for (i in seq(length(lines1))) {
lines2 <- c(lines2, Lines(lines1[[i]], ID=as.character(i)))
}
lines2 <- SpatialLines(lines2)
# associate points with lines
m <- gDistance(lines2, SpatialPoints(df[ , c("x", "y")]), byid=TRUE)
m <- as.data.frame(m)
df$line <- apply(m, MARGIN=1, function(x) {names(m)[which.min(x)]})
# Update lines
# (It would be nice to pass this section in as a argument)
models <- df %>% group_by(line) %>% do(model = lm(y~x, data=.))
# Pegging intercept to 0 as we know it inherently
models <- df %>% group_by(line) %>% do(model = lm(y ~ 0 + x, data=.))
ab <- data.frame(intercepts=rep(0, nlines), slopes=rep(0, nlines))
for (i in seq(nrow(models))) {
ab[i, ] <- models[i, ]$model[[1]]$coefficients
}
#print ab
#ab <- ab[duplicated(ab), ]
}
if (j >= maxiter) {
warning(paste("Did not converge after", j, "iterations."))
} else {
message(paste("Converged after", j, "iterations."))
}
list(models=models$model, ab=ab, classifications=df$line, distances=m, data=df[, c("x", "y")])
}
@gdbassett
Copy link
Author

It helps to run, remove irrelevant or redundant lines, and then use those as starting lines for a second run.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment