msummersgill · January 13, 2023 23:28
diff --git a/rows_by_reference.R b/rows_by_reference.R
 library(data.table)
 library(fastmatch)

 #' Add rows to a data table in a memory efficient, by-referencesque manner
 #'
 #' This mimics the by-reference functionality `DT[, new_col := value]`, but
 #' for rows instead. The rows in question are assigned at the end of the data
 #' table. If the data table is keyed it is automatically reordered after the
 #' operation. If not this function will preserve order of existing rows, but
 #' will not preserve sortedness.
 #'
 #' This function will take the rows to add from a list of columns or generally
 #' anything that can be named and converted or coerced to data frame.
 #' The list may specify less columns than present in the data table. In this
 #' case the rest is filled with NA. The list may not specify more columns than
 #' present in the data table. Columns are matched by names if the list is named
 #' or by position if not. The list may not have names not present in the data
 #' table.
 #'
 #' Note that this operation is memory efficient as it will add the rows for
 #' one column at a time, only requiring reallocation of single columns at a
 #' time. This function will change the original data table by reference.
 #'
 #' This function will not affect shallow copies of the data table.
 #'
 #' @param .dt A data table
 #' @param value A list (or a data frame). Must have at most as many elements as
 #'        there are columns in \param{.dt}. If unnamed this will be applied to
 #'        first columns in \param{.dt}, else it will by applied by name. Must
 #'        not have names not present in \param{.dt}.
 #' @return \param{.dt} (invisible)
 data_table_add_rows <- function(.dt, value) {
  if (length(value) > ncol(.dt)) {
    rlang::abort(glue::glue("Trying to update data table with {ncol(.dt)
      } columns with {length(value)} columns."))
  }
  if (is.null(names(value))) names(value) <- names(.dt)[seq_len(length(value))]
  value <- as.data.frame(value)
  if (any(!(names(value) %in% names(.dt)))) {
    rlang::abort(glue::glue("Trying to update data table with columns {
        paste(setdiff(names(value), names(.dt)), collapse = ', ')
      } not present in original data table."))
  }
  value[setdiff(names(.dt), names(value))] <- NA
  
  k <- data.table::key(.dt)
  
  temp_dt <- data.table::data.table()
  
  for (col in c(names(.dt))) {
    val <- c(.dt[[col]], value[[col]])
    .dt[, (col) := NULL]
    temp_dt[, (col) := val]
  }
  
  for (col in c(names(temp_dt))) {
    val <- temp_dt[[col]]
    temp_dt[, (col) := NULL]
    .dt[, (col) := val]
  }
  
  if (!is.null(k)) data.table::setkeyv(.dt, k)
  
  .dt
 }

 #' Remove rows from a data table in a memory efficient, by-referencesque manner
 #'
 #' This mimics the by-reference functionality `DT[, new_col := NULL]`, but
 #' for rows instead. This operation preserves order. If the data table is keyed
 #' it will preserve the key.
 #'
 #' This function will determine the rows to delete by passing all additional
 #' arguments to a data.table filter call of the form
 #' \code{DT[, .idx = .I][..., j = .idx]}
 #' Thus we can pass a simple index vector or a condition, or even delete by
 #' using join syntax \code{data_table_remove_rows(DT1, DT2, on = cols)} (or
 #' reversely keep by join using
 #' \code{data_table_remove_rows(DT1, !DT2, on = cols)}
 #'
 #' Note that this operation is memory efficient as it will add the rows for
 #' one column at a time, only requiring reallocation of single columns at a
 #' time. This function will change the original data table by reference.
 #'
 #' This function will not affect shallow copies of the data table.
 #'
 #' @param .dt A data table
 #' @param ... Any arguments passed to `[` for filtering the data.table. Must not
 #'        specify `j`.
 #' @return \param{.dt} (invisible)
 data_table_remove_rows <- function(.dt, ...) {
  k <- data.table::key(.dt)
  
  env <- parent.frame()
  args <- as.list(sys.call()[-1])
  if (!is.null(names(args)) && ".dt" %in% names(args)) args[.dt] <- NULL
  else args <- args[-1]
  
  if (!is.null(names(args)) && "j" %in% names(args)) {
    rlang::abort("... must not specify j")
  }
  
  call <- substitute(
    .dt[, .idx := .I][j = .idx],
    env = list(.dt = .dt))
  
  .nc <- names(call)
  
  for (i in seq_along(args)) {
    call[[i + 3]] <- args[[i]]
  }
  
  if (!is.null(names(args))) names(call) <- c(.nc, names(args))
  which <- eval(call, envir = env)
  .dt[, .idx := NULL]
  
  which <- sort(setdiff(seq_len(nrow(.dt)), which))
  
  temp_dt <- data.table::data.table()
  
  for (col in c(names(.dt))) {
    val <- .dt[[col]][which]
    .dt[, (col) := NULL]
    temp_dt[, (col) := val]
  }
  
  for (col in c(names(temp_dt))) {
    val <- temp_dt[[col]]
    temp_dt[, (col) := NULL]
    .dt[, (col) := val]
  }
  
  if (!is.null(k)) data.table::setattr(.dt, "sorted", k)
  
  .dt
 }


 # https://stackoverflow.com/questions/32934933/faster-in-operator
 `%fin%` <- function(x, table) {
  stopifnot(require(fastmatch))
  fmatch(x, table, nomatch = 0L) > 0L
 }



 optimized_data_table_add_rows <- function(.dt, value) {
  if (length(value) > ncol(.dt)) {
    rlang::abort(glue::glue("Trying to update data table with {ncol(.dt)
      } columns with {length(value)} columns."))
  }
  if (is.null(names(value))) names(value) <- names(.dt)[seq_len(length(value))]

  setDT(value)
  if (any(!(names(value) %in% names(.dt)))) {
    rlang::abort(glue::glue("Trying to update data table with columns {
        paste(setdiff(names(value), names(.dt)), collapse = ', ')
      } not present in original data table."))
  }
  
  missingCols <- setdiff(names(.dt), names(value))
  for (col in missingCols) {
    set(value, j = col,value = NA)
  }
  
  k <- data.table::key(.dt)
  
  temp_dt <- data.table::data.table()
  
  for (col in c(names(.dt))) {
    set(temp_dt, j = col,value = c(.dt[[col]], value[[col]]))
    set(.dt,j = col, value = NULL)
  }
  
  for (col in c(names(temp_dt))) {
    set(.dt,j = col, value = temp_dt[[col]])
  }
  
  if (!is.null(k)) data.table::setkeyv(.dt, k)
  
  .dt
 }


 optimized_data_table_remove_rows <- function(.dt, ...) {
  k <- data.table::key(.dt)
  
  env <- parent.frame()
  args <- as.list(sys.call()[-1])
  if (!is.null(names(args)) && ".dt" %in% names(args)) args[.dt] <- NULL
  else args <- args[-1]
  
  if (!is.null(names(args)) && "j" %in% names(args)) {
    rlang::abort("... must not specify j")
  }
  
  call <- substitute(
    .dt[, .idx := .I][j = .idx],
    env = list(.dt = .dt))
  
  .nc <- names(call)
  
  for (i in seq_along(args)) {
    call[[i + 3]] <- args[[i]]
  }
  
  if (!is.null(names(args))) names(call) <- c(.nc, names(args))
  which <- eval(call, envir = env)

  set(.dt,j = ".idx", value = NULL)
  idx <- seq_len(nrow(.dt))
  which <- sort(idx[!idx %fin% which])
  
  temp_dt <- data.table::data.table()
  
  for (col in c(names(.dt))) {
    set(temp_dt, j = col,value = .dt[[col]][which])
    set(.dt,j = col, value = NULL)
  }
  
  for (col in c(names(temp_dt))) {
    set(.dt,j = col, value = temp_dt[[col]])
  }
  
  if (!is.null(k)) data.table::setattr(.dt, "sorted", k)
  
  .dt
 }



 # Generate Data -----------------------------------------------------------

 ## Number of rows in synthetic data
 N = 1e7

 ## Larger date ranges and mean duration will impact resulting
 ## intermediate result sizes significantly
 start <- as.Date("2020-01-01")
 end <- as.Date("2022-12-31")
 range <- seq.Date(from = start,
                  to = end,
                  by = 1)
 meanDuration <- 60
 sdDuration <- meanDuration/2

 ## Cardinality of groups will also impact results
 gr1Set <- as.character(seq_len(100))
 gr2Set <- as.character(seq_len(20))

 dt <- data.table(
  gr1 = sample(gr1Set, N, replace = T),
  gr2 = sample(gr2Set, N, replace = T),
  date1 = sample(range, N, replace = T),
  date2 = as.Date(NA),
  value = sample(1:10, N, replace = T)
 )[order(gr1,gr2,date1)]

 dt[, date2 := date1 + ceiling(pmax(1,rnorm(N, mean = meanDuration,sd = sdDuration)))]


 # Removing Rows -----------------------------------------------------------

 x <- copy(dt)
 y <- copy(dt)
 z <- copy(dt)

 profvis::profvis({

  data_table_remove_rows(x, gr1 == "1")
  
  optimized_data_table_remove_rows(x, gr1 == "1")
  
  y <- y[!gr1 == "1"]
  
 })


 # Adding Rows -------------------------------------------------------------

 u <- copy(dt)
 v <- copy(dt)
 w <- copy(dt)

 N = 1e3
 newlist <- list(
  gr1 = sample(gr1Set, N, replace = T),
  gr2 = sample(gr2Set, N, replace = T),
  date1 = sample(range, N, replace = T),
  date2 = rep(as.Date(NA),N),
  value = sample(1:10, N, replace = T)
 )

 newlist2 <- list(
  gr1 = sample(gr1Set, N, replace = T),
  gr2 = sample(gr2Set, N, replace = T),
  date1 = sample(range, N, replace = T),
  date2 = rep(as.Date(NA),N),
  value = sample(1:10, N, replace = T)
 )

 newdt <- data.table(
  gr1 = sample(gr1Set, N, replace = T),
  gr2 = sample(gr2Set, N, replace = T),
  date1 = sample(range, N, replace = T),
  date2 = rep(as.Date(NA),N),
  value = sample(1:10, N, replace = T)
 )


 profvis::profvis({
  
  data_table_add_rows(u,newlist)
  
  optimized_data_table_add_rows(v,newlist2)
  
  v <- rbindlist(list(v,newdt))
  
 })
	library(data.table)
	library(fastmatch)

	#' Add rows to a data table in a memory efficient, by-referencesque manner
	#'
	#' This mimics the by-reference functionality `DT[, new_col := value]`, but
	#' for rows instead. The rows in question are assigned at the end of the data
	#' table. If the data table is keyed it is automatically reordered after the
	#' operation. If not this function will preserve order of existing rows, but
	#' will not preserve sortedness.
	#'
	#' This function will take the rows to add from a list of columns or generally
	#' anything that can be named and converted or coerced to data frame.
	#' The list may specify less columns than present in the data table. In this
	#' case the rest is filled with NA. The list may not specify more columns than
	#' present in the data table. Columns are matched by names if the list is named
	#' or by position if not. The list may not have names not present in the data
	#' table.
	#'
	#' Note that this operation is memory efficient as it will add the rows for
	#' one column at a time, only requiring reallocation of single columns at a
	#' time. This function will change the original data table by reference.
	#'
	#' This function will not affect shallow copies of the data table.
	#'
	#' @param .dt A data table
	#' @param value A list (or a data frame). Must have at most as many elements as
	#' there are columns in \param{.dt}. If unnamed this will be applied to
	#' first columns in \param{.dt}, else it will by applied by name. Must
	#' not have names not present in \param{.dt}.
	#' @return \param{.dt} (invisible)
	data_table_add_rows <- function(.dt, value) {
	if (length(value) > ncol(.dt)) {
	rlang::abort(glue::glue("Trying to update data table with {ncol(.dt)
	} columns with {length(value)} columns."))
	}
	if (is.null(names(value))) names(value) <- names(.dt)[seq_len(length(value))]
	value <- as.data.frame(value)
	if (any(!(names(value) %in% names(.dt)))) {
	rlang::abort(glue::glue("Trying to update data table with columns {
	paste(setdiff(names(value), names(.dt)), collapse = ', ')
	} not present in original data table."))
	}
	value[setdiff(names(.dt), names(value))] <- NA

	k <- data.table::key(.dt)

	temp_dt <- data.table::data.table()

	for (col in c(names(.dt))) {
	val <- c(.dt[[col]], value[[col]])
	.dt[, (col) := NULL]
	temp_dt[, (col) := val]
	}

	for (col in c(names(temp_dt))) {
	val <- temp_dt[[col]]
	temp_dt[, (col) := NULL]
	.dt[, (col) := val]
	}

	if (!is.null(k)) data.table::setkeyv(.dt, k)

	.dt
	}

	#' Remove rows from a data table in a memory efficient, by-referencesque manner
	#'
	#' This mimics the by-reference functionality `DT[, new_col := NULL]`, but
	#' for rows instead. This operation preserves order. If the data table is keyed
	#' it will preserve the key.
	#'
	#' This function will determine the rows to delete by passing all additional
	#' arguments to a data.table filter call of the form
	#' \code{DT[, .idx = .I][..., j = .idx]}
	#' Thus we can pass a simple index vector or a condition, or even delete by
	#' using join syntax \code{data_table_remove_rows(DT1, DT2, on = cols)} (or
	#' reversely keep by join using
	#' \code{data_table_remove_rows(DT1, !DT2, on = cols)}
	#'
	#' Note that this operation is memory efficient as it will add the rows for
	#' one column at a time, only requiring reallocation of single columns at a
	#' time. This function will change the original data table by reference.
	#'
	#' This function will not affect shallow copies of the data table.
	#'
	#' @param .dt A data table
	#' @param ... Any arguments passed to `[` for filtering the data.table. Must not
	#' specify `j`.
	#' @return \param{.dt} (invisible)
	data_table_remove_rows <- function(.dt, ...) {
	k <- data.table::key(.dt)

	env <- parent.frame()
	args <- as.list(sys.call()[-1])
	if (!is.null(names(args)) && ".dt" %in% names(args)) args[.dt] <- NULL
	else args <- args[-1]

	if (!is.null(names(args)) && "j" %in% names(args)) {
	rlang::abort("... must not specify j")
	}

	call <- substitute(
	.dt[, .idx := .I][j = .idx],
	env = list(.dt = .dt))

	.nc <- names(call)

	for (i in seq_along(args)) {
	call[[i + 3]] <- args[[i]]
	}

	if (!is.null(names(args))) names(call) <- c(.nc, names(args))
	which <- eval(call, envir = env)
	.dt[, .idx := NULL]

	which <- sort(setdiff(seq_len(nrow(.dt)), which))

	temp_dt <- data.table::data.table()

	for (col in c(names(.dt))) {
	val <- .dt[[col]][which]
	.dt[, (col) := NULL]
	temp_dt[, (col) := val]
	}

	for (col in c(names(temp_dt))) {
	val <- temp_dt[[col]]
	temp_dt[, (col) := NULL]
	.dt[, (col) := val]
	}

	if (!is.null(k)) data.table::setattr(.dt, "sorted", k)

	.dt
	}


	# https://stackoverflow.com/questions/32934933/faster-in-operator
	`%fin%` <- function(x, table) {
	stopifnot(require(fastmatch))
	fmatch(x, table, nomatch = 0L) > 0L
	}



	optimized_data_table_add_rows <- function(.dt, value) {
	if (length(value) > ncol(.dt)) {
	rlang::abort(glue::glue("Trying to update data table with {ncol(.dt)
	} columns with {length(value)} columns."))
	}
	if (is.null(names(value))) names(value) <- names(.dt)[seq_len(length(value))]

	setDT(value)
	if (any(!(names(value) %in% names(.dt)))) {
	rlang::abort(glue::glue("Trying to update data table with columns {
	paste(setdiff(names(value), names(.dt)), collapse = ', ')
	} not present in original data table."))
	}

	missingCols <- setdiff(names(.dt), names(value))
	for (col in missingCols) {
	set(value, j = col,value = NA)
	}

	k <- data.table::key(.dt)

	temp_dt <- data.table::data.table()

	for (col in c(names(.dt))) {
	set(temp_dt, j = col,value = c(.dt[[col]], value[[col]]))
	set(.dt,j = col, value = NULL)
	}

	for (col in c(names(temp_dt))) {
	set(.dt,j = col, value = temp_dt[[col]])
	}

	if (!is.null(k)) data.table::setkeyv(.dt, k)

	.dt
	}


	optimized_data_table_remove_rows <- function(.dt, ...) {
	k <- data.table::key(.dt)

	env <- parent.frame()
	args <- as.list(sys.call()[-1])
	if (!is.null(names(args)) && ".dt" %in% names(args)) args[.dt] <- NULL
	else args <- args[-1]

	if (!is.null(names(args)) && "j" %in% names(args)) {
	rlang::abort("... must not specify j")
	}

	call <- substitute(
	.dt[, .idx := .I][j = .idx],
	env = list(.dt = .dt))

	.nc <- names(call)

	for (i in seq_along(args)) {
	call[[i + 3]] <- args[[i]]
	}

	if (!is.null(names(args))) names(call) <- c(.nc, names(args))
	which <- eval(call, envir = env)

	set(.dt,j = ".idx", value = NULL)
	idx <- seq_len(nrow(.dt))
	which <- sort(idx[!idx %fin% which])

	temp_dt <- data.table::data.table()

	for (col in c(names(.dt))) {
	set(temp_dt, j = col,value = .dt[[col]][which])
	set(.dt,j = col, value = NULL)
	}

	for (col in c(names(temp_dt))) {
	set(.dt,j = col, value = temp_dt[[col]])
	}

	if (!is.null(k)) data.table::setattr(.dt, "sorted", k)

	.dt
	}



	# Generate Data -----------------------------------------------------------

	## Number of rows in synthetic data
	N = 1e7

	## Larger date ranges and mean duration will impact resulting
	## intermediate result sizes significantly
	start <- as.Date("2020-01-01")
	end <- as.Date("2022-12-31")
	range <- seq.Date(from = start,
	to = end,
	by = 1)
	meanDuration <- 60
	sdDuration <- meanDuration/2

	## Cardinality of groups will also impact results
	gr1Set <- as.character(seq_len(100))
	gr2Set <- as.character(seq_len(20))

	dt <- data.table(
	gr1 = sample(gr1Set, N, replace = T),
	gr2 = sample(gr2Set, N, replace = T),
	date1 = sample(range, N, replace = T),
	date2 = as.Date(NA),
	value = sample(1:10, N, replace = T)
	)[order(gr1,gr2,date1)]

	dt[, date2 := date1 + ceiling(pmax(1,rnorm(N, mean = meanDuration,sd = sdDuration)))]


	# Removing Rows -----------------------------------------------------------

	x <- copy(dt)
	y <- copy(dt)
	z <- copy(dt)

	profvis::profvis({

	data_table_remove_rows(x, gr1 == "1")

	optimized_data_table_remove_rows(x, gr1 == "1")

	y <- y[!gr1 == "1"]

	})


	# Adding Rows -------------------------------------------------------------

	u <- copy(dt)
	v <- copy(dt)
	w <- copy(dt)

	N = 1e3
	newlist <- list(
	gr1 = sample(gr1Set, N, replace = T),
	gr2 = sample(gr2Set, N, replace = T),
	date1 = sample(range, N, replace = T),
	date2 = rep(as.Date(NA),N),
	value = sample(1:10, N, replace = T)
	)

	newlist2 <- list(
	gr1 = sample(gr1Set, N, replace = T),
	gr2 = sample(gr2Set, N, replace = T),
	date1 = sample(range, N, replace = T),
	date2 = rep(as.Date(NA),N),
	value = sample(1:10, N, replace = T)
	)

	newdt <- data.table(
	gr1 = sample(gr1Set, N, replace = T),
	gr2 = sample(gr2Set, N, replace = T),
	date1 = sample(range, N, replace = T),
	date2 = rep(as.Date(NA),N),
	value = sample(1:10, N, replace = T)
	)


	profvis::profvis({

	data_table_add_rows(u,newlist)

	optimized_data_table_add_rows(v,newlist2)

	v <- rbindlist(list(v,newdt))

	})