Last active
February 27, 2016 16:45
-
-
Save gdbassett/65b9e320d18983141149 to your computer and use it in GitHub Desktop.
A quick function to produce a kmeans like calculation, but using a line in place of the point centroid. Used to try and classify multiple linear relationships in a dataset.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#' @param df Dataframe with x and y columns. (Hopefully in the future this can be x) | |
#' @param nlines The number of clusters. | |
#' @param ab a dataframe with a 'slopes' and 'intercepts' column and one row per initial line. Dimensions must match nlines. | |
#' @param maxiter The maximum number of iterations to do | |
#' @export | |
#' @examples | |
linearKMeans <- function(df, ab=NULL, nlines=0, maxiter=1000) { | |
# default number of lines | |
nlines_default <- 5 | |
require(sp) | |
require(rgeos) | |
require(stats) | |
# get initial lines | |
if (is.null(ab)) { | |
if (nlines == 0) { | |
nlines <- nlines_default | |
} | |
# pick default lines if none provided | |
slopes <- df$y/df$x | |
ab <- data.frame(intercepts=rep(0, nlines), slopes=(max(df$y)/(nlines-1):0) / (max(df$x)/0:(nlines-1))) | |
ab[1, 'slopes'] <- min(slopes) | |
ab[nlines, 'slopes'] <- max(slopes) | |
} else { | |
if ((nlines != 0) & (nlines != nrow(ab))) { | |
warning("nlines and ab both present. Number of rows in ab will be used and nlines ignored.") | |
} | |
nlines <- nrow(ab) | |
} | |
# fit the intercept/slope to ablines | |
max_x <- max(df$x, na.rm=TRUE) | |
max_y <- max(df$y, na.rm=TRUE) | |
old_line <- rep(NA, nrow(df)) | |
j <- 0 | |
while(!identical(old_line, df$line) & j < maxiter) { | |
j <- j + 1 | |
old_line <- m | |
lines1 <- apply(ab, MARGIN=1, function(n) { | |
l <- data.frame(x=c(0,0), y=c(n['intercepts'],0)) | |
if (n['slopes'] * max_x + n['intercepts'] > max_y) { | |
l[1, "x"] <- (max_y - n['intercepts'])/n['slopes'] | |
l[1, "y"] <- max_y | |
} else { | |
l[1, "x"] <- max_x | |
l[1, "y"] <- max_x * n['slopes'] + n['intercepts'] | |
} | |
list(Line(l)) | |
}) | |
lines2 <- list() | |
for (i in seq(length(lines1))) { | |
lines2 <- c(lines2, Lines(lines1[[i]], ID=as.character(i))) | |
} | |
lines2 <- SpatialLines(lines2) | |
# associate points with lines | |
m <- gDistance(lines2, SpatialPoints(df[ , c("x", "y")]), byid=TRUE) | |
m <- as.data.frame(m) | |
df$line <- apply(m, MARGIN=1, function(x) {names(m)[which.min(x)]}) | |
# Update lines | |
# (It would be nice to pass this section in as a argument) | |
models <- df %>% group_by(line) %>% do(model = lm(y~x, data=.)) | |
# Pegging intercept to 0 as we know it inherently | |
models <- df %>% group_by(line) %>% do(model = lm(y ~ 0 + x, data=.)) | |
ab <- data.frame(intercepts=rep(0, nlines), slopes=rep(0, nlines)) | |
for (i in seq(nrow(models))) { | |
ab[i, ] <- models[i, ]$model[[1]]$coefficients | |
} | |
#print ab | |
#ab <- ab[duplicated(ab), ] | |
} | |
if (j >= maxiter) { | |
warning(paste("Did not converge after", j, "iterations.")) | |
} else { | |
message(paste("Converged after", j, "iterations.")) | |
} | |
list(models=models$model, ab=ab, classifications=df$line, distances=m, data=df[, c("x", "y")]) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
It helps to run, remove irrelevant or redundant lines, and then use those as starting lines for a second run.