r2evans · June 6, 2020 00:22 · kdorheim · Jun 12, 2018 · r2evans · Jun 6, 2020
diff --git a/lazyExpandGrid.R b/lazyExpandGrid.R
 #' Lazy expand.grid.
 #'
 #' Provide a lazy-eval for expand.grid, similar to python's
 #' \code{xrange}, where the source may be too large to be fit into
 #' memory but still accessible.
 #'
 #' This function returns a list of functions for accessing the lazy
 #' \code{expand.grid}.
 #'
 #' The available methods within each object:
 #'
 #' \describe{
 #' 
 #' \item{nextItem(index):}{Retrieve the next item (row) from the design
 #' space. If \code{index} is provided, set the internal counter to
 #' that value and use it instead of what the next index would have
 #' been otherwise. If a vector of numbers, pre-stage the indices and
 #' retrieve the first one; subsequent calls (without \code{index}) to
 #' \code{nextItem()} will retrieve each subsequent index.}
 #'
 #' \item{nextItems(index):}{If there is a pre-staged vector of indices
 #' (set by \code{nextItem(index)}, \code{setIndex(index)}, or
 #' \code{addIndex(index)}), retrieve a data.frame with all rows.}
 #'
 #' \item{getIndex():}{Retrieve the current index. Will always return a
 #' single integer, regardless of the existence or length of pre-staged
 #' indices.}
 #'
 #' \item{getIndices():}{Retrieve all indices, starting with the current
 #' index and all pre-staged follow-on indices.}
 #'
 #' \item{getNextIndex():}{Retrieve the next index, whether a pre-staged
 #' (user-defined) or auto-incrementing counter.}
 #'
 #' \item{setIndex(index, append., final.):}{Pre-stage the provided
 #' index(ices). If \code{final.} is TRUE, then once the vector of
 #' indices has been retrieved/exhausted, the iterator will consider
 #' itself closed. Any command that directly sets/adds an index(ices)
 #' will clear this flag.}
 #'
 #' \item{addIndex(index):}{Add indices to the list of pre-staged
 #' indices.}
 #'
 #' }
 #'
 #' The available properties within each object:
 #'
 #' \describe{
 #'
 #' \item{n:}{The size of the design space.}
 #'
 #' \item{factors:}{The factors, i.e., the arguments provided on the
 #' initial call. The only difference will be if factors were unnamed
 #' or not legal such as duplicated or spaces.}
 #'
 #' }
 #' 
 #' This was provided as an answer on StackOverflow
 #' (\link[SO]{https://stackoverflow.com/a/36144255/3358272}, for Wakan
 #' Tanka). Credit to alexis_laz for suggesting the use of
 #' \code{cumprod}, \code{lengths}, and calculation of the individual
 #' factor indices. The speed using \code{cumprod} is much better than
 #' my initial suggested use of \code{sapply}, though the former might
 #' suffer if the design space exceeds \code{.Machine$integer.max}.
 #' 
 #' @param ... factor levels, either named or unnamed; if unnamed, then
 #'   arbitrary names will be provided; names must be legal for
 #'   data.frames
 #' @return list of functions
 #' @export
 #' @examples
 #' \dontrun{
 #' iter <- lazyExpandGrid(1:1e2, 1:1e2, 1:1e2)
 #' iter$nextItem()  # retrieves the first item
 #' iter$nextItem(5) # seeks to the fifth item and retrieves it
 #' 
 #' iter$setIndex(101)
 #' iter$nextItem() # retrieves 101st row
 #' iter$nextItem() # retrieves 102nd row
 #' 
 #' iter$nextItems(c(5,1,99)) # retrieves three rows and sets the counter to 99
 #'
 #' iter <- lazyExpandGrid(a = 1:1e2, b = c('some', 'char', 'factors'))
 #' while (row <- iter$nextItem()) {
 #'   # do something
 #'   row$b
 #' }
 #'
 #' # optionally sampling the design space
 #' iter$setIndex( sample(iter$n, size = 100), final. = TRUE )
 #' while (row <- iter$nextItem()) {
 #'   # do something
 #'   row$a
 #' }
 #'
 #' }
 lazyExpandGrid <- function(...) {
  dots <- list(...)
  dotnames <- names(dots)
  if (is.null(dotnames)) {
    dotnames <- paste0('Var', seq_along(dots))
  }
  dotnames <- make.unique(make.names(dotnames))
  names(dots) <- dotnames
  sizes <- lengths(dots)
  indices <- cumprod(c(1L, sizes))
  final. <- preset <- FALSE
  numfactors <- length(indices)
  maxcount <- unname(indices[ length(indices) ])
  i <- 0
  env <- environment()

  nextItem <- function(index) {
    if (missing(index)) {
      li <- length(i)
      if (preset) {
        if (li > 1) i <<- i[-1L]
        thisi <- i[[1L]]
        preset <<- (li > 2)
      } else {
        if (final.) return(NULL)
        thisi <- (i <<- i + 1)
      }
    } else {
      env$setIndex(index)
      return(env$nextItem())
    }
    if (thisi > maxcount || i < 1L) return(NULL)
    structure(setNames(Map(`[[`, dots, (thisi - 1L) %% indices[-1L] %/% indices[-numfactors] + 1L),
                       dotnames),
              row.names = as.character(thisi), class = 'data.frame')
  }

  nextItems <- function(index) {
    if (! missing(index)) {
      env$setIndex(index)
      return(env$nextItems())
    }
    li <- length(i)
    if (li > 1) {
      rn <- as.character(i[-1L])
      ret <- do.call(rbind.data.frame, Filter(Negate(is.logical),
                                              lapply(2:length(i), function(ign) env$nextItem())))
      rownames(ret) <- rn
      ret
    } else {
      env$nextItem()
    }
  }

  setIndex <- function(index, append. = FALSE, final. = FALSE) {
    isgood <- (index > 0) & (index <= maxcount)
    if (! any(isgood)) {
      stop(sprintf("'index' must have at least one positive integer no more than the design space size (%s)",
                   maxcount),
           call. = FALSE)
    }
    if (! all(isgood)) {
      warning('non-positive or too-high indices are invalid, ignored', call. = FALSE)
      index <- index[isgood]
    }
    i <<- c(if (append.) i else i[[1L]], index)
    preset <<- TRUE
    final. <<- final.
  }

  getIndex <- function() return(i[[1L]])

  getIndices <- function() return(i)

  getNextIndex <- function() if (length(i) > 1) i[[2L]] else i+1

  l <- list(nextItem = nextItem, nextItems = nextItems,
            getIndex = getIndex, getIndices = getIndices,
            setIndex = setIndex, getNextIndex = getNextIndex,
            n = maxcount, factors = dots
            )
  class(l) <- c(class(l), 'lazyExpandGrid')
  l
 }

 #' @export
 print.lazyExpandGrid <- function(x, ...) {
  e <- environment(x$getIndex)
  cat(sprintf("lazyExpandGrid: %s factors, %s rows\n", e$numfactors - 1L, e$maxcount))
  cat(sprintf("  $ index : %s\n", e$i[[1L]]))
  if (length(e$i) > 1) cat(sprintf("  $ next  : %s\n", paste(e$i[-1L], collapse = ', ')))
  if (e$final.) cat("  $ final\n")
 }
	#' Lazy expand.grid.
	#'
	#' Provide a lazy-eval for expand.grid, similar to python's
	#' \code{xrange}, where the source may be too large to be fit into
	#' memory but still accessible.
	#'
	#' This function returns a list of functions for accessing the lazy
	#' \code{expand.grid}.
	#'
	#' The available methods within each object:
	#'
	#' \describe{
	#'
	#' \item{nextItem(index):}{Retrieve the next item (row) from the design
	#' space. If \code{index} is provided, set the internal counter to
	#' that value and use it instead of what the next index would have
	#' been otherwise. If a vector of numbers, pre-stage the indices and
	#' retrieve the first one; subsequent calls (without \code{index}) to
	#' \code{nextItem()} will retrieve each subsequent index.}
	#'
	#' \item{nextItems(index):}{If there is a pre-staged vector of indices
	#' (set by \code{nextItem(index)}, \code{setIndex(index)}, or
	#' \code{addIndex(index)}), retrieve a data.frame with all rows.}
	#'
	#' \item{getIndex():}{Retrieve the current index. Will always return a
	#' single integer, regardless of the existence or length of pre-staged
	#' indices.}
	#'
	#' \item{getIndices():}{Retrieve all indices, starting with the current
	#' index and all pre-staged follow-on indices.}
	#'
	#' \item{getNextIndex():}{Retrieve the next index, whether a pre-staged
	#' (user-defined) or auto-incrementing counter.}
	#'
	#' \item{setIndex(index, append., final.):}{Pre-stage the provided
	#' index(ices). If \code{final.} is TRUE, then once the vector of
	#' indices has been retrieved/exhausted, the iterator will consider
	#' itself closed. Any command that directly sets/adds an index(ices)
	#' will clear this flag.}
	#'
	#' \item{addIndex(index):}{Add indices to the list of pre-staged
	#' indices.}
	#'
	#' }
	#'
	#' The available properties within each object:
	#'
	#' \describe{
	#'
	#' \item{n:}{The size of the design space.}
	#'
	#' \item{factors:}{The factors, i.e., the arguments provided on the
	#' initial call. The only difference will be if factors were unnamed
	#' or not legal such as duplicated or spaces.}
	#'
	#' }
	#'
	#' This was provided as an answer on StackOverflow
	#' (\link[SO]{https://stackoverflow.com/a/36144255/3358272}, for Wakan
	#' Tanka). Credit to alexis_laz for suggesting the use of
	#' \code{cumprod}, \code{lengths}, and calculation of the individual
	#' factor indices. The speed using \code{cumprod} is much better than
	#' my initial suggested use of \code{sapply}, though the former might
	#' suffer if the design space exceeds \code{.Machine$integer.max}.
	#'
	#' @param ... factor levels, either named or unnamed; if unnamed, then
	#' arbitrary names will be provided; names must be legal for
	#' data.frames
	#' @return list of functions
	#' @export
	#' @examples
	#' \dontrun{
	#' iter <- lazyExpandGrid(1:1e2, 1:1e2, 1:1e2)
	#' iter$nextItem() # retrieves the first item
	#' iter$nextItem(5) # seeks to the fifth item and retrieves it
	#'
	#' iter$setIndex(101)
	#' iter$nextItem() # retrieves 101st row
	#' iter$nextItem() # retrieves 102nd row
	#'
	#' iter$nextItems(c(5,1,99)) # retrieves three rows and sets the counter to 99
	#'
	#' iter <- lazyExpandGrid(a = 1:1e2, b = c('some', 'char', 'factors'))
	#' while (row <- iter$nextItem()) {
	#' # do something
	#' row$b
	#' }
	#'
	#' # optionally sampling the design space
	#' iter$setIndex( sample(iter$n, size = 100), final. = TRUE )
	#' while (row <- iter$nextItem()) {
	#' # do something
	#' row$a
	#' }
	#'
	#' }
	lazyExpandGrid <- function(...) {
	dots <- list(...)
	dotnames <- names(dots)
	if (is.null(dotnames)) {
	dotnames <- paste0('Var', seq_along(dots))
	}
	dotnames <- make.unique(make.names(dotnames))
	names(dots) <- dotnames
	sizes <- lengths(dots)
	indices <- cumprod(c(1L, sizes))
	final. <- preset <- FALSE
	numfactors <- length(indices)
	maxcount <- unname(indices[ length(indices) ])
	i <- 0
	env <- environment()

	nextItem <- function(index) {
	if (missing(index)) {
	li <- length(i)
	if (preset) {
	if (li > 1) i <<- i[-1L]
	thisi <- i[[1L]]
	preset <<- (li > 2)
	} else {
	if (final.) return(NULL)
	thisi <- (i <<- i + 1)
	}
	} else {
	env$setIndex(index)
	return(env$nextItem())
	}
	if (thisi > maxcount \|\| i < 1L) return(NULL)
	structure(setNames(Map(`[[`, dots, (thisi - 1L) %% indices[-1L] %/% indices[-numfactors] + 1L),
	dotnames),
	row.names = as.character(thisi), class = 'data.frame')
	}

	nextItems <- function(index) {
	if (! missing(index)) {
	env$setIndex(index)
	return(env$nextItems())
	}
	li <- length(i)
	if (li > 1) {
	rn <- as.character(i[-1L])
	ret <- do.call(rbind.data.frame, Filter(Negate(is.logical),
	lapply(2:length(i), function(ign) env$nextItem())))
	rownames(ret) <- rn
	ret
	} else {
	env$nextItem()
	}
	}

	setIndex <- function(index, append. = FALSE, final. = FALSE) {
	isgood <- (index > 0) & (index <= maxcount)
	if (! any(isgood)) {
	stop(sprintf("'index' must have at least one positive integer no more than the design space size (%s)",
	maxcount),
	call. = FALSE)
	}
	if (! all(isgood)) {
	warning('non-positive or too-high indices are invalid, ignored', call. = FALSE)
	index <- index[isgood]
	}
	i <<- c(if (append.) i else i[[1L]], index)
	preset <<- TRUE
	final. <<- final.
	}

	getIndex <- function() return(i[[1L]])

	getIndices <- function() return(i)

	getNextIndex <- function() if (length(i) > 1) i[[2L]] else i+1

	l <- list(nextItem = nextItem, nextItems = nextItems,
	getIndex = getIndex, getIndices = getIndices,
	setIndex = setIndex, getNextIndex = getNextIndex,
	n = maxcount, factors = dots
	)
	class(l) <- c(class(l), 'lazyExpandGrid')
	l
	}

	#' @export
	print.lazyExpandGrid <- function(x, ...) {
	e <- environment(x$getIndex)
	cat(sprintf("lazyExpandGrid: %s factors, %s rows\n", e$numfactors - 1L, e$maxcount))
	cat(sprintf(" $ index : %s\n", e$i[[1L]]))
	if (length(e$i) > 1) cat(sprintf(" $ next : %s\n", paste(e$i[-1L], collapse = ', ')))
	if (e$final.) cat(" $ final\n")
	}