Last active
June 6, 2020 00:22
-
-
Save r2evans/e5531cbab8cf421d14ed to your computer and use it in GitHub Desktop.
lazy evaluation of a potentially very large expand.grid
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#' Lazy expand.grid. | |
#' | |
#' Provide a lazy-eval for expand.grid, similar to python's | |
#' \code{xrange}, where the source may be too large to be fit into | |
#' memory but still accessible. | |
#' | |
#' This function returns a list of functions for accessing the lazy | |
#' \code{expand.grid}. | |
#' | |
#' The available methods within each object: | |
#' | |
#' \describe{ | |
#' | |
#' \item{nextItem(index):}{Retrieve the next item (row) from the design | |
#' space. If \code{index} is provided, set the internal counter to | |
#' that value and use it instead of what the next index would have | |
#' been otherwise. If a vector of numbers, pre-stage the indices and | |
#' retrieve the first one; subsequent calls (without \code{index}) to | |
#' \code{nextItem()} will retrieve each subsequent index.} | |
#' | |
#' \item{nextItems(index):}{If there is a pre-staged vector of indices | |
#' (set by \code{nextItem(index)}, \code{setIndex(index)}, or | |
#' \code{addIndex(index)}), retrieve a data.frame with all rows.} | |
#' | |
#' \item{getIndex():}{Retrieve the current index. Will always return a | |
#' single integer, regardless of the existence or length of pre-staged | |
#' indices.} | |
#' | |
#' \item{getIndices():}{Retrieve all indices, starting with the current | |
#' index and all pre-staged follow-on indices.} | |
#' | |
#' \item{getNextIndex():}{Retrieve the next index, whether a pre-staged | |
#' (user-defined) or auto-incrementing counter.} | |
#' | |
#' \item{setIndex(index, append., final.):}{Pre-stage the provided | |
#' index(ices). If \code{final.} is TRUE, then once the vector of | |
#' indices has been retrieved/exhausted, the iterator will consider | |
#' itself closed. Any command that directly sets/adds an index(ices) | |
#' will clear this flag.} | |
#' | |
#' \item{addIndex(index):}{Add indices to the list of pre-staged | |
#' indices.} | |
#' | |
#' } | |
#' | |
#' The available properties within each object: | |
#' | |
#' \describe{ | |
#' | |
#' \item{n:}{The size of the design space.} | |
#' | |
#' \item{factors:}{The factors, i.e., the arguments provided on the | |
#' initial call. The only difference will be if factors were unnamed | |
#' or not legal such as duplicated or spaces.} | |
#' | |
#' } | |
#' | |
#' This was provided as an answer on StackOverflow | |
#' (\link[SO]{https://stackoverflow.com/a/36144255/3358272}, for Wakan | |
#' Tanka). Credit to alexis_laz for suggesting the use of | |
#' \code{cumprod}, \code{lengths}, and calculation of the individual | |
#' factor indices. The speed using \code{cumprod} is much better than | |
#' my initial suggested use of \code{sapply}, though the former might | |
#' suffer if the design space exceeds \code{.Machine$integer.max}. | |
#' | |
#' @param ... factor levels, either named or unnamed; if unnamed, then | |
#' arbitrary names will be provided; names must be legal for | |
#' data.frames | |
#' @return list of functions | |
#' @export | |
#' @examples | |
#' \dontrun{ | |
#' iter <- lazyExpandGrid(1:1e2, 1:1e2, 1:1e2) | |
#' iter$nextItem() # retrieves the first item | |
#' iter$nextItem(5) # seeks to the fifth item and retrieves it | |
#' | |
#' iter$setIndex(101) | |
#' iter$nextItem() # retrieves 101st row | |
#' iter$nextItem() # retrieves 102nd row | |
#' | |
#' iter$nextItems(c(5,1,99)) # retrieves three rows and sets the counter to 99 | |
#' | |
#' iter <- lazyExpandGrid(a = 1:1e2, b = c('some', 'char', 'factors')) | |
#' while (row <- iter$nextItem()) { | |
#' # do something | |
#' row$b | |
#' } | |
#' | |
#' # optionally sampling the design space | |
#' iter$setIndex( sample(iter$n, size = 100), final. = TRUE ) | |
#' while (row <- iter$nextItem()) { | |
#' # do something | |
#' row$a | |
#' } | |
#' | |
#' } | |
lazyExpandGrid <- function(...) { | |
dots <- list(...) | |
dotnames <- names(dots) | |
if (is.null(dotnames)) { | |
dotnames <- paste0('Var', seq_along(dots)) | |
} | |
dotnames <- make.unique(make.names(dotnames)) | |
names(dots) <- dotnames | |
sizes <- lengths(dots) | |
indices <- cumprod(c(1L, sizes)) | |
final. <- preset <- FALSE | |
numfactors <- length(indices) | |
maxcount <- unname(indices[ length(indices) ]) | |
i <- 0 | |
env <- environment() | |
nextItem <- function(index) { | |
if (missing(index)) { | |
li <- length(i) | |
if (preset) { | |
if (li > 1) i <<- i[-1L] | |
thisi <- i[[1L]] | |
preset <<- (li > 2) | |
} else { | |
if (final.) return(NULL) | |
thisi <- (i <<- i + 1) | |
} | |
} else { | |
env$setIndex(index) | |
return(env$nextItem()) | |
} | |
if (thisi > maxcount || i < 1L) return(NULL) | |
structure(setNames(Map(`[[`, dots, (thisi - 1L) %% indices[-1L] %/% indices[-numfactors] + 1L), | |
dotnames), | |
row.names = as.character(thisi), class = 'data.frame') | |
} | |
nextItems <- function(index) { | |
if (! missing(index)) { | |
env$setIndex(index) | |
return(env$nextItems()) | |
} | |
li <- length(i) | |
if (li > 1) { | |
rn <- as.character(i[-1L]) | |
ret <- do.call(rbind.data.frame, Filter(Negate(is.logical), | |
lapply(2:length(i), function(ign) env$nextItem()))) | |
rownames(ret) <- rn | |
ret | |
} else { | |
env$nextItem() | |
} | |
} | |
setIndex <- function(index, append. = FALSE, final. = FALSE) { | |
isgood <- (index > 0) & (index <= maxcount) | |
if (! any(isgood)) { | |
stop(sprintf("'index' must have at least one positive integer no more than the design space size (%s)", | |
maxcount), | |
call. = FALSE) | |
} | |
if (! all(isgood)) { | |
warning('non-positive or too-high indices are invalid, ignored', call. = FALSE) | |
index <- index[isgood] | |
} | |
i <<- c(if (append.) i else i[[1L]], index) | |
preset <<- TRUE | |
final. <<- final. | |
} | |
getIndex <- function() return(i[[1L]]) | |
getIndices <- function() return(i) | |
getNextIndex <- function() if (length(i) > 1) i[[2L]] else i+1 | |
l <- list(nextItem = nextItem, nextItems = nextItems, | |
getIndex = getIndex, getIndices = getIndices, | |
setIndex = setIndex, getNextIndex = getNextIndex, | |
n = maxcount, factors = dots | |
) | |
class(l) <- c(class(l), 'lazyExpandGrid') | |
l | |
} | |
#' @export | |
print.lazyExpandGrid <- function(x, ...) { | |
e <- environment(x$getIndex) | |
cat(sprintf("lazyExpandGrid: %s factors, %s rows\n", e$numfactors - 1L, e$maxcount)) | |
cat(sprintf(" $ index : %s\n", e$i[[1L]])) | |
if (length(e$i) > 1) cat(sprintf(" $ next : %s\n", paste(e$i[-1L], collapse = ', '))) | |
if (e$final.) cat(" $ final\n") | |
} |
Shoot ... @kdorheim, I am just seeing this now. I really hate that GH didn't notify me somehow ...
Absolutely! I haven't used it much since I wrote it (mostly it comes up on StackOverflow every now and then), most of my problems are differently scoped these days.
Haha it is all good, the project ended up going in a different direction. Thanks for following up!!
(dang ... I could've been published ;-)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
@r2evans great functions! I am developing an R package and would like to avoid a dependency on
expand.grid
and uselazyExpandGrid
instead. Would that be possible? What is the license for using this code?