Created
January 13, 2023 23:28
-
-
Save msummersgill/10a2a25273f2018df946a14b1f755496 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(data.table) | |
library(fastmatch) | |
#' Add rows to a data table in a memory efficient, by-referencesque manner | |
#' | |
#' This mimics the by-reference functionality `DT[, new_col := value]`, but | |
#' for rows instead. The rows in question are assigned at the end of the data | |
#' table. If the data table is keyed it is automatically reordered after the | |
#' operation. If not this function will preserve order of existing rows, but | |
#' will not preserve sortedness. | |
#' | |
#' This function will take the rows to add from a list of columns or generally | |
#' anything that can be named and converted or coerced to data frame. | |
#' The list may specify less columns than present in the data table. In this | |
#' case the rest is filled with NA. The list may not specify more columns than | |
#' present in the data table. Columns are matched by names if the list is named | |
#' or by position if not. The list may not have names not present in the data | |
#' table. | |
#' | |
#' Note that this operation is memory efficient as it will add the rows for | |
#' one column at a time, only requiring reallocation of single columns at a | |
#' time. This function will change the original data table by reference. | |
#' | |
#' This function will not affect shallow copies of the data table. | |
#' | |
#' @param .dt A data table | |
#' @param value A list (or a data frame). Must have at most as many elements as | |
#' there are columns in \param{.dt}. If unnamed this will be applied to | |
#' first columns in \param{.dt}, else it will by applied by name. Must | |
#' not have names not present in \param{.dt}. | |
#' @return \param{.dt} (invisible) | |
data_table_add_rows <- function(.dt, value) { | |
if (length(value) > ncol(.dt)) { | |
rlang::abort(glue::glue("Trying to update data table with {ncol(.dt) | |
} columns with {length(value)} columns.")) | |
} | |
if (is.null(names(value))) names(value) <- names(.dt)[seq_len(length(value))] | |
value <- as.data.frame(value) | |
if (any(!(names(value) %in% names(.dt)))) { | |
rlang::abort(glue::glue("Trying to update data table with columns { | |
paste(setdiff(names(value), names(.dt)), collapse = ', ') | |
} not present in original data table.")) | |
} | |
value[setdiff(names(.dt), names(value))] <- NA | |
k <- data.table::key(.dt) | |
temp_dt <- data.table::data.table() | |
for (col in c(names(.dt))) { | |
val <- c(.dt[[col]], value[[col]]) | |
.dt[, (col) := NULL] | |
temp_dt[, (col) := val] | |
} | |
for (col in c(names(temp_dt))) { | |
val <- temp_dt[[col]] | |
temp_dt[, (col) := NULL] | |
.dt[, (col) := val] | |
} | |
if (!is.null(k)) data.table::setkeyv(.dt, k) | |
.dt | |
} | |
#' Remove rows from a data table in a memory efficient, by-referencesque manner | |
#' | |
#' This mimics the by-reference functionality `DT[, new_col := NULL]`, but | |
#' for rows instead. This operation preserves order. If the data table is keyed | |
#' it will preserve the key. | |
#' | |
#' This function will determine the rows to delete by passing all additional | |
#' arguments to a data.table filter call of the form | |
#' \code{DT[, .idx = .I][..., j = .idx]} | |
#' Thus we can pass a simple index vector or a condition, or even delete by | |
#' using join syntax \code{data_table_remove_rows(DT1, DT2, on = cols)} (or | |
#' reversely keep by join using | |
#' \code{data_table_remove_rows(DT1, !DT2, on = cols)} | |
#' | |
#' Note that this operation is memory efficient as it will add the rows for | |
#' one column at a time, only requiring reallocation of single columns at a | |
#' time. This function will change the original data table by reference. | |
#' | |
#' This function will not affect shallow copies of the data table. | |
#' | |
#' @param .dt A data table | |
#' @param ... Any arguments passed to `[` for filtering the data.table. Must not | |
#' specify `j`. | |
#' @return \param{.dt} (invisible) | |
data_table_remove_rows <- function(.dt, ...) { | |
k <- data.table::key(.dt) | |
env <- parent.frame() | |
args <- as.list(sys.call()[-1]) | |
if (!is.null(names(args)) && ".dt" %in% names(args)) args[.dt] <- NULL | |
else args <- args[-1] | |
if (!is.null(names(args)) && "j" %in% names(args)) { | |
rlang::abort("... must not specify j") | |
} | |
call <- substitute( | |
.dt[, .idx := .I][j = .idx], | |
env = list(.dt = .dt)) | |
.nc <- names(call) | |
for (i in seq_along(args)) { | |
call[[i + 3]] <- args[[i]] | |
} | |
if (!is.null(names(args))) names(call) <- c(.nc, names(args)) | |
which <- eval(call, envir = env) | |
.dt[, .idx := NULL] | |
which <- sort(setdiff(seq_len(nrow(.dt)), which)) | |
temp_dt <- data.table::data.table() | |
for (col in c(names(.dt))) { | |
val <- .dt[[col]][which] | |
.dt[, (col) := NULL] | |
temp_dt[, (col) := val] | |
} | |
for (col in c(names(temp_dt))) { | |
val <- temp_dt[[col]] | |
temp_dt[, (col) := NULL] | |
.dt[, (col) := val] | |
} | |
if (!is.null(k)) data.table::setattr(.dt, "sorted", k) | |
.dt | |
} | |
# https://stackoverflow.com/questions/32934933/faster-in-operator | |
`%fin%` <- function(x, table) { | |
stopifnot(require(fastmatch)) | |
fmatch(x, table, nomatch = 0L) > 0L | |
} | |
optimized_data_table_add_rows <- function(.dt, value) { | |
if (length(value) > ncol(.dt)) { | |
rlang::abort(glue::glue("Trying to update data table with {ncol(.dt) | |
} columns with {length(value)} columns.")) | |
} | |
if (is.null(names(value))) names(value) <- names(.dt)[seq_len(length(value))] | |
setDT(value) | |
if (any(!(names(value) %in% names(.dt)))) { | |
rlang::abort(glue::glue("Trying to update data table with columns { | |
paste(setdiff(names(value), names(.dt)), collapse = ', ') | |
} not present in original data table.")) | |
} | |
missingCols <- setdiff(names(.dt), names(value)) | |
for (col in missingCols) { | |
set(value, j = col,value = NA) | |
} | |
k <- data.table::key(.dt) | |
temp_dt <- data.table::data.table() | |
for (col in c(names(.dt))) { | |
set(temp_dt, j = col,value = c(.dt[[col]], value[[col]])) | |
set(.dt,j = col, value = NULL) | |
} | |
for (col in c(names(temp_dt))) { | |
set(.dt,j = col, value = temp_dt[[col]]) | |
} | |
if (!is.null(k)) data.table::setkeyv(.dt, k) | |
.dt | |
} | |
optimized_data_table_remove_rows <- function(.dt, ...) { | |
k <- data.table::key(.dt) | |
env <- parent.frame() | |
args <- as.list(sys.call()[-1]) | |
if (!is.null(names(args)) && ".dt" %in% names(args)) args[.dt] <- NULL | |
else args <- args[-1] | |
if (!is.null(names(args)) && "j" %in% names(args)) { | |
rlang::abort("... must not specify j") | |
} | |
call <- substitute( | |
.dt[, .idx := .I][j = .idx], | |
env = list(.dt = .dt)) | |
.nc <- names(call) | |
for (i in seq_along(args)) { | |
call[[i + 3]] <- args[[i]] | |
} | |
if (!is.null(names(args))) names(call) <- c(.nc, names(args)) | |
which <- eval(call, envir = env) | |
set(.dt,j = ".idx", value = NULL) | |
idx <- seq_len(nrow(.dt)) | |
which <- sort(idx[!idx %fin% which]) | |
temp_dt <- data.table::data.table() | |
for (col in c(names(.dt))) { | |
set(temp_dt, j = col,value = .dt[[col]][which]) | |
set(.dt,j = col, value = NULL) | |
} | |
for (col in c(names(temp_dt))) { | |
set(.dt,j = col, value = temp_dt[[col]]) | |
} | |
if (!is.null(k)) data.table::setattr(.dt, "sorted", k) | |
.dt | |
} | |
# Generate Data ----------------------------------------------------------- | |
## Number of rows in synthetic data | |
N = 1e7 | |
## Larger date ranges and mean duration will impact resulting | |
## intermediate result sizes significantly | |
start <- as.Date("2020-01-01") | |
end <- as.Date("2022-12-31") | |
range <- seq.Date(from = start, | |
to = end, | |
by = 1) | |
meanDuration <- 60 | |
sdDuration <- meanDuration/2 | |
## Cardinality of groups will also impact results | |
gr1Set <- as.character(seq_len(100)) | |
gr2Set <- as.character(seq_len(20)) | |
dt <- data.table( | |
gr1 = sample(gr1Set, N, replace = T), | |
gr2 = sample(gr2Set, N, replace = T), | |
date1 = sample(range, N, replace = T), | |
date2 = as.Date(NA), | |
value = sample(1:10, N, replace = T) | |
)[order(gr1,gr2,date1)] | |
dt[, date2 := date1 + ceiling(pmax(1,rnorm(N, mean = meanDuration,sd = sdDuration)))] | |
# Removing Rows ----------------------------------------------------------- | |
x <- copy(dt) | |
y <- copy(dt) | |
z <- copy(dt) | |
profvis::profvis({ | |
data_table_remove_rows(x, gr1 == "1") | |
optimized_data_table_remove_rows(x, gr1 == "1") | |
y <- y[!gr1 == "1"] | |
}) | |
# Adding Rows ------------------------------------------------------------- | |
u <- copy(dt) | |
v <- copy(dt) | |
w <- copy(dt) | |
N = 1e3 | |
newlist <- list( | |
gr1 = sample(gr1Set, N, replace = T), | |
gr2 = sample(gr2Set, N, replace = T), | |
date1 = sample(range, N, replace = T), | |
date2 = rep(as.Date(NA),N), | |
value = sample(1:10, N, replace = T) | |
) | |
newlist2 <- list( | |
gr1 = sample(gr1Set, N, replace = T), | |
gr2 = sample(gr2Set, N, replace = T), | |
date1 = sample(range, N, replace = T), | |
date2 = rep(as.Date(NA),N), | |
value = sample(1:10, N, replace = T) | |
) | |
newdt <- data.table( | |
gr1 = sample(gr1Set, N, replace = T), | |
gr2 = sample(gr2Set, N, replace = T), | |
date1 = sample(range, N, replace = T), | |
date2 = rep(as.Date(NA),N), | |
value = sample(1:10, N, replace = T) | |
) | |
profvis::profvis({ | |
data_table_add_rows(u,newlist) | |
optimized_data_table_add_rows(v,newlist2) | |
v <- rbindlist(list(v,newdt)) | |
}) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment