gdbassett · February 27, 2016 16:45 · gdbassett · Feb 17, 2016
diff --git a/linearKMeans.R b/linearKMeans.R
 #' @param df Dataframe with x and y columns.  (Hopefully in the future this can be x)
 #' @param nlines The number of clusters.
 #' @param ab a dataframe with a 'slopes' and 'intercepts' column and one row per initial line.  Dimensions must match nlines.
 #' @param maxiter The maximum number of iterations to do
 #' @export
 #' @examples
 linearKMeans <- function(df, ab=NULL, nlines=0, maxiter=1000) {
  # default number of lines
  nlines_default <- 5
  
  require(sp)
  require(rgeos)
  require(stats)
  # get initial lines
  if (is.null(ab)) {
    if (nlines == 0) {
      nlines <- nlines_default
    }
    # pick default lines if none provided
    slopes <- df$y/df$x
    ab <- data.frame(intercepts=rep(0, nlines), slopes=(max(df$y)/(nlines-1):0) / (max(df$x)/0:(nlines-1)))
    ab[1, 'slopes'] <- min(slopes)
    ab[nlines, 'slopes'] <- max(slopes)
  } else {
    if ((nlines != 0) & (nlines != nrow(ab))) {
      warning("nlines and ab both present. Number of rows in ab will be used and nlines ignored.")
    }
    nlines <- nrow(ab)
  }
    
  # fit the intercept/slope to ablines
  max_x <- max(df$x, na.rm=TRUE)
  max_y <- max(df$y, na.rm=TRUE)
  
  old_line <- rep(NA, nrow(df))
  j <- 0
  
  while(!identical(old_line, df$line) & j < maxiter) {
    j <- j + 1
    old_line <- m
    
    lines1 <- apply(ab, MARGIN=1, function(n) {
      l <- data.frame(x=c(0,0), y=c(n['intercepts'],0))
      if (n['slopes'] * max_x + n['intercepts'] > max_y) {
        l[1, "x"] <- (max_y - n['intercepts'])/n['slopes']
        l[1, "y"] <- max_y
      } else {
        l[1, "x"] <- max_x
        l[1, "y"] <- max_x * n['slopes'] + n['intercepts']
      }
      list(Line(l))
    })
    lines2 <- list()
    for (i in seq(length(lines1))) {
      lines2 <- c(lines2, Lines(lines1[[i]], ID=as.character(i)))
    }
    lines2 <- SpatialLines(lines2)  
    
    # associate points with lines
    m <- gDistance(lines2, SpatialPoints(df[ , c("x", "y")]), byid=TRUE)
    m <- as.data.frame(m)
    df$line <- apply(m, MARGIN=1, function(x) {names(m)[which.min(x)]})
    
    # Update lines
    # (It would be nice to pass this section in as a argument)
    models <- df %>% group_by(line) %>% do(model = lm(y~x, data=.))
    # Pegging intercept to 0 as we know it inherently
    models <- df %>% group_by(line) %>% do(model = lm(y ~ 0 + x, data=.))
    ab <- data.frame(intercepts=rep(0, nlines), slopes=rep(0, nlines))
    for (i in seq(nrow(models))) {
      ab[i, ] <- models[i, ]$model[[1]]$coefficients
    }
    #print ab
    #ab <- ab[duplicated(ab), ]
  }
  if (j >= maxiter) {
    warning(paste("Did not converge after", j, "iterations."))
  } else {
    message(paste("Converged after", j, "iterations."))
  }
  list(models=models$model, ab=ab, classifications=df$line, distances=m, data=df[, c("x", "y")])
 }
	#' @param df Dataframe with x and y columns. (Hopefully in the future this can be x)
	#' @param nlines The number of clusters.
	#' @param ab a dataframe with a 'slopes' and 'intercepts' column and one row per initial line. Dimensions must match nlines.
	#' @param maxiter The maximum number of iterations to do
	#' @export
	#' @examples
	linearKMeans <- function(df, ab=NULL, nlines=0, maxiter=1000) {
	# default number of lines
	nlines_default <- 5

	require(sp)
	require(rgeos)
	require(stats)
	# get initial lines
	if (is.null(ab)) {
	if (nlines == 0) {
	nlines <- nlines_default
	}
	# pick default lines if none provided
	slopes <- df$y/df$x
	ab <- data.frame(intercepts=rep(0, nlines), slopes=(max(df$y)/(nlines-1):0) / (max(df$x)/0:(nlines-1)))
	ab[1, 'slopes'] <- min(slopes)
	ab[nlines, 'slopes'] <- max(slopes)
	} else {
	if ((nlines != 0) & (nlines != nrow(ab))) {
	warning("nlines and ab both present. Number of rows in ab will be used and nlines ignored.")
	}
	nlines <- nrow(ab)
	}

	# fit the intercept/slope to ablines
	max_x <- max(df$x, na.rm=TRUE)
	max_y <- max(df$y, na.rm=TRUE)

	old_line <- rep(NA, nrow(df))
	j <- 0

	while(!identical(old_line, df$line) & j < maxiter) {
	j <- j + 1
	old_line <- m

	lines1 <- apply(ab, MARGIN=1, function(n) {
	l <- data.frame(x=c(0,0), y=c(n['intercepts'],0))
	if (n['slopes'] * max_x + n['intercepts'] > max_y) {
	l[1, "x"] <- (max_y - n['intercepts'])/n['slopes']
	l[1, "y"] <- max_y
	} else {
	l[1, "x"] <- max_x
	l[1, "y"] <- max_x * n['slopes'] + n['intercepts']
	}
	list(Line(l))
	})
	lines2 <- list()
	for (i in seq(length(lines1))) {
	lines2 <- c(lines2, Lines(lines1[[i]], ID=as.character(i)))
	}
	lines2 <- SpatialLines(lines2)

	# associate points with lines
	m <- gDistance(lines2, SpatialPoints(df[ , c("x", "y")]), byid=TRUE)
	m <- as.data.frame(m)
	df$line <- apply(m, MARGIN=1, function(x) {names(m)[which.min(x)]})

	# Update lines
	# (It would be nice to pass this section in as a argument)
	models <- df %>% group_by(line) %>% do(model = lm(y~x, data=.))
	# Pegging intercept to 0 as we know it inherently
	models <- df %>% group_by(line) %>% do(model = lm(y ~ 0 + x, data=.))
	ab <- data.frame(intercepts=rep(0, nlines), slopes=rep(0, nlines))
	for (i in seq(nrow(models))) {
	ab[i, ] <- models[i, ]$model[[1]]$coefficients
	}
	#print ab
	#ab <- ab[duplicated(ab), ]
	}
	if (j >= maxiter) {
	warning(paste("Did not converge after", j, "iterations."))
	} else {
	message(paste("Converged after", j, "iterations."))
	}
	list(models=models$model, ab=ab, classifications=df$line, distances=m, data=df[, c("x", "y")])
	}