Skip to content

Instantly share code, notes, and snippets.

@msummersgill
Created January 13, 2023 23:28
Show Gist options
  • Save msummersgill/10a2a25273f2018df946a14b1f755496 to your computer and use it in GitHub Desktop.
Save msummersgill/10a2a25273f2018df946a14b1f755496 to your computer and use it in GitHub Desktop.
library(data.table)
library(fastmatch)
#' Add rows to a data table in a memory efficient, by-referencesque manner
#'
#' This mimics the by-reference functionality `DT[, new_col := value]`, but
#' for rows instead. The rows in question are assigned at the end of the data
#' table. If the data table is keyed it is automatically reordered after the
#' operation. If not this function will preserve order of existing rows, but
#' will not preserve sortedness.
#'
#' This function will take the rows to add from a list of columns or generally
#' anything that can be named and converted or coerced to data frame.
#' The list may specify less columns than present in the data table. In this
#' case the rest is filled with NA. The list may not specify more columns than
#' present in the data table. Columns are matched by names if the list is named
#' or by position if not. The list may not have names not present in the data
#' table.
#'
#' Note that this operation is memory efficient as it will add the rows for
#' one column at a time, only requiring reallocation of single columns at a
#' time. This function will change the original data table by reference.
#'
#' This function will not affect shallow copies of the data table.
#'
#' @param .dt A data table
#' @param value A list (or a data frame). Must have at most as many elements as
#' there are columns in \param{.dt}. If unnamed this will be applied to
#' first columns in \param{.dt}, else it will by applied by name. Must
#' not have names not present in \param{.dt}.
#' @return \param{.dt} (invisible)
data_table_add_rows <- function(.dt, value) {
if (length(value) > ncol(.dt)) {
rlang::abort(glue::glue("Trying to update data table with {ncol(.dt)
} columns with {length(value)} columns."))
}
if (is.null(names(value))) names(value) <- names(.dt)[seq_len(length(value))]
value <- as.data.frame(value)
if (any(!(names(value) %in% names(.dt)))) {
rlang::abort(glue::glue("Trying to update data table with columns {
paste(setdiff(names(value), names(.dt)), collapse = ', ')
} not present in original data table."))
}
value[setdiff(names(.dt), names(value))] <- NA
k <- data.table::key(.dt)
temp_dt <- data.table::data.table()
for (col in c(names(.dt))) {
val <- c(.dt[[col]], value[[col]])
.dt[, (col) := NULL]
temp_dt[, (col) := val]
}
for (col in c(names(temp_dt))) {
val <- temp_dt[[col]]
temp_dt[, (col) := NULL]
.dt[, (col) := val]
}
if (!is.null(k)) data.table::setkeyv(.dt, k)
.dt
}
#' Remove rows from a data table in a memory efficient, by-referencesque manner
#'
#' This mimics the by-reference functionality `DT[, new_col := NULL]`, but
#' for rows instead. This operation preserves order. If the data table is keyed
#' it will preserve the key.
#'
#' This function will determine the rows to delete by passing all additional
#' arguments to a data.table filter call of the form
#' \code{DT[, .idx = .I][..., j = .idx]}
#' Thus we can pass a simple index vector or a condition, or even delete by
#' using join syntax \code{data_table_remove_rows(DT1, DT2, on = cols)} (or
#' reversely keep by join using
#' \code{data_table_remove_rows(DT1, !DT2, on = cols)}
#'
#' Note that this operation is memory efficient as it will add the rows for
#' one column at a time, only requiring reallocation of single columns at a
#' time. This function will change the original data table by reference.
#'
#' This function will not affect shallow copies of the data table.
#'
#' @param .dt A data table
#' @param ... Any arguments passed to `[` for filtering the data.table. Must not
#' specify `j`.
#' @return \param{.dt} (invisible)
data_table_remove_rows <- function(.dt, ...) {
k <- data.table::key(.dt)
env <- parent.frame()
args <- as.list(sys.call()[-1])
if (!is.null(names(args)) && ".dt" %in% names(args)) args[.dt] <- NULL
else args <- args[-1]
if (!is.null(names(args)) && "j" %in% names(args)) {
rlang::abort("... must not specify j")
}
call <- substitute(
.dt[, .idx := .I][j = .idx],
env = list(.dt = .dt))
.nc <- names(call)
for (i in seq_along(args)) {
call[[i + 3]] <- args[[i]]
}
if (!is.null(names(args))) names(call) <- c(.nc, names(args))
which <- eval(call, envir = env)
.dt[, .idx := NULL]
which <- sort(setdiff(seq_len(nrow(.dt)), which))
temp_dt <- data.table::data.table()
for (col in c(names(.dt))) {
val <- .dt[[col]][which]
.dt[, (col) := NULL]
temp_dt[, (col) := val]
}
for (col in c(names(temp_dt))) {
val <- temp_dt[[col]]
temp_dt[, (col) := NULL]
.dt[, (col) := val]
}
if (!is.null(k)) data.table::setattr(.dt, "sorted", k)
.dt
}
# https://stackoverflow.com/questions/32934933/faster-in-operator
`%fin%` <- function(x, table) {
stopifnot(require(fastmatch))
fmatch(x, table, nomatch = 0L) > 0L
}
optimized_data_table_add_rows <- function(.dt, value) {
if (length(value) > ncol(.dt)) {
rlang::abort(glue::glue("Trying to update data table with {ncol(.dt)
} columns with {length(value)} columns."))
}
if (is.null(names(value))) names(value) <- names(.dt)[seq_len(length(value))]
setDT(value)
if (any(!(names(value) %in% names(.dt)))) {
rlang::abort(glue::glue("Trying to update data table with columns {
paste(setdiff(names(value), names(.dt)), collapse = ', ')
} not present in original data table."))
}
missingCols <- setdiff(names(.dt), names(value))
for (col in missingCols) {
set(value, j = col,value = NA)
}
k <- data.table::key(.dt)
temp_dt <- data.table::data.table()
for (col in c(names(.dt))) {
set(temp_dt, j = col,value = c(.dt[[col]], value[[col]]))
set(.dt,j = col, value = NULL)
}
for (col in c(names(temp_dt))) {
set(.dt,j = col, value = temp_dt[[col]])
}
if (!is.null(k)) data.table::setkeyv(.dt, k)
.dt
}
optimized_data_table_remove_rows <- function(.dt, ...) {
k <- data.table::key(.dt)
env <- parent.frame()
args <- as.list(sys.call()[-1])
if (!is.null(names(args)) && ".dt" %in% names(args)) args[.dt] <- NULL
else args <- args[-1]
if (!is.null(names(args)) && "j" %in% names(args)) {
rlang::abort("... must not specify j")
}
call <- substitute(
.dt[, .idx := .I][j = .idx],
env = list(.dt = .dt))
.nc <- names(call)
for (i in seq_along(args)) {
call[[i + 3]] <- args[[i]]
}
if (!is.null(names(args))) names(call) <- c(.nc, names(args))
which <- eval(call, envir = env)
set(.dt,j = ".idx", value = NULL)
idx <- seq_len(nrow(.dt))
which <- sort(idx[!idx %fin% which])
temp_dt <- data.table::data.table()
for (col in c(names(.dt))) {
set(temp_dt, j = col,value = .dt[[col]][which])
set(.dt,j = col, value = NULL)
}
for (col in c(names(temp_dt))) {
set(.dt,j = col, value = temp_dt[[col]])
}
if (!is.null(k)) data.table::setattr(.dt, "sorted", k)
.dt
}
# Generate Data -----------------------------------------------------------
## Number of rows in synthetic data
N = 1e7
## Larger date ranges and mean duration will impact resulting
## intermediate result sizes significantly
start <- as.Date("2020-01-01")
end <- as.Date("2022-12-31")
range <- seq.Date(from = start,
to = end,
by = 1)
meanDuration <- 60
sdDuration <- meanDuration/2
## Cardinality of groups will also impact results
gr1Set <- as.character(seq_len(100))
gr2Set <- as.character(seq_len(20))
dt <- data.table(
gr1 = sample(gr1Set, N, replace = T),
gr2 = sample(gr2Set, N, replace = T),
date1 = sample(range, N, replace = T),
date2 = as.Date(NA),
value = sample(1:10, N, replace = T)
)[order(gr1,gr2,date1)]
dt[, date2 := date1 + ceiling(pmax(1,rnorm(N, mean = meanDuration,sd = sdDuration)))]
# Removing Rows -----------------------------------------------------------
x <- copy(dt)
y <- copy(dt)
z <- copy(dt)
profvis::profvis({
data_table_remove_rows(x, gr1 == "1")
optimized_data_table_remove_rows(x, gr1 == "1")
y <- y[!gr1 == "1"]
})
# Adding Rows -------------------------------------------------------------
u <- copy(dt)
v <- copy(dt)
w <- copy(dt)
N = 1e3
newlist <- list(
gr1 = sample(gr1Set, N, replace = T),
gr2 = sample(gr2Set, N, replace = T),
date1 = sample(range, N, replace = T),
date2 = rep(as.Date(NA),N),
value = sample(1:10, N, replace = T)
)
newlist2 <- list(
gr1 = sample(gr1Set, N, replace = T),
gr2 = sample(gr2Set, N, replace = T),
date1 = sample(range, N, replace = T),
date2 = rep(as.Date(NA),N),
value = sample(1:10, N, replace = T)
)
newdt <- data.table(
gr1 = sample(gr1Set, N, replace = T),
gr2 = sample(gr2Set, N, replace = T),
date1 = sample(range, N, replace = T),
date2 = rep(as.Date(NA),N),
value = sample(1:10, N, replace = T)
)
profvis::profvis({
data_table_add_rows(u,newlist)
optimized_data_table_add_rows(v,newlist2)
v <- rbindlist(list(v,newdt))
})
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment