Last active
February 28, 2023 03:35
-
-
Save wch/0e564def155d976c04dd28a876dc04b4 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
--- | |
title: "Applying a function over rows of a data frame" | |
author: "Winston Chang" | |
output: html_document | |
editor_options: | |
chunk_output_type: console | |
--- | |
```{r setup, include=FALSE} | |
knitr::opts_chunk$set(collapse = TRUE, comment = "#>") | |
``` | |
[Source](https://gist.github.com/wch/0e564def155d976c04dd28a876dc04b4) for this document. | |
@dattali [asked](https://twitter.com/daattali/status/761058049859518464), "what's a safe way to iterate over rows of a data frame?" The example was to convert each row into a list and return a list of lists, indexed first by column, then by row. | |
A number of people gave suggestions on Twitter, which I've collected here. I've benchmarked these methods with data of various sizes; scroll down to see a plot of times. | |
```{r message=FALSE} | |
library(purrr) | |
library(dplyr) | |
library(tidyr) | |
# @dattali | |
# Using apply (only safe when all cols are same type) | |
f_apply <- function(df) { | |
apply(df, 1, function(row) as.list(row)) | |
} | |
# @drob | |
# split + lapply | |
f_split_lapply <- function(df) { | |
df <- split(df, seq_len(nrow(df))) | |
lapply(df, function(row) as.list(row)) | |
} | |
# @winston_chang | |
# lapply over row indices | |
f_lapply_row <- function(df) { | |
lapply(seq_len(nrow(df)), function(i) as.list(df[i,,drop=FALSE])) | |
} | |
# @winston_chang | |
# lapply + lapply: Treat data frame like a list, and the slice out lists | |
f_lapply_lapply <- function(df) { | |
cols <- seq_len(length(df)) | |
names(cols) <- names(df) | |
lapply(seq_len(nrow(df)), function(row) { | |
lapply(cols, function(col) { | |
df[[col]][[row]] | |
}) | |
}) | |
} | |
# @winston_chang | |
# lapply + lapply v2: Same as lapply_lapply, but explicitly convert df to a list | |
f_lapply_lapply2 <- function(df) { | |
rows <- seq_len(nrow(df)) | |
cols <- seq_len(length(df)) | |
names(cols) <- names(df) | |
df <- as.list(df) | |
lapply(rows, function(row) { | |
lapply(cols, function(col) { | |
df[[col]][[row]] | |
}) | |
}) | |
} | |
# @winston_chang | |
# nested_for: Same as lapply_lapply2, but use a for loop instead of lapply() | |
f_nested_for <- function(df) { | |
nrows <- nrow(df) | |
ncols <- length(df) | |
row_idxs <- seq_len(nrows) | |
col_idxs <- seq_len(ncols) | |
colnames <- names(df) | |
df <- as.list(df) | |
res <- vector("list", nrows) | |
for (i in row_idxs) { | |
row <- vector("list", ncols) | |
for (j in col_idxs) { | |
row[[j]] <- df[[j]][[i]] | |
} | |
names(row) <- colnames | |
res[[i]] <- row | |
} | |
res | |
} | |
# @ Tomasz Kalinowski | |
# .mapply | |
f_mapply <- function(df) { | |
.mapply(list, unclass(df), NULL) | |
} | |
# @JennyBryan | |
# purrr::pmap | |
f_pmap <- function(df) { | |
pmap(df, list) | |
} | |
# purrr::list_transpose | |
f_list_transpose <- function(df) { | |
list_transpose(as.list(df)) | |
} | |
# purrr::transpose: This is superseded by list_transpose, but the old version is | |
# much faster. | |
f_transpose <- function(df) { | |
transpose(as.list(df)) | |
} | |
``` | |
Benchmark each of them, using data sets with varying numbers of rows: | |
```{r} | |
run_benchmark <- function(nrow) { | |
# Make some data | |
df <- data.frame( | |
x = rnorm(nrow), | |
y = runif(nrow), | |
z = runif(nrow) | |
) | |
res <- list( | |
apply = system.time(f_apply(df)), | |
split_lapply = system.time(f_split_lapply(df)), | |
lapply_row = system.time(f_lapply_row(df)), | |
lapply_lapply = system.time(f_lapply_lapply(df)), | |
lapply_lapply2 = system.time(f_lapply_lapply2(df)), | |
nested_for = system.time(f_nested_for(df)), | |
mapply = system.time(f_mapply(df)), | |
pmap = system.time(f_pmap(df)), | |
list_transpose = system.time(f_list_transpose(df)), | |
transpose = system.time(f_transpose(df)) | |
) | |
# Get elapsed times | |
res <- lapply(res, `[[`, "elapsed") | |
# Add nrow to front | |
res <- c(nrow = nrow, res) | |
res | |
} | |
# Run the benchmarks for various size data | |
all_times <- lapply(1:5, function(n) { | |
run_benchmark(10^n) | |
}) | |
# Convert to data frame | |
times <- lapply(all_times, as.data.frame) | |
times <- do.call(rbind, times) | |
knitr::kable(times) | |
``` | |
## Plot times | |
This plot shows the number of seconds needed to process n rows, for each method. Both the x and y use log scales, so each step along the x scale represents a 10x increase in number of rows, and each step along the y scale represents a 10x increase in time. | |
```{r message=FALSE} | |
library(ggplot2) | |
library(scales) | |
library(ggrepel) | |
# Convert to long format | |
times_long <- gather(times, method, seconds, -nrow) | |
# Set order of methods, for plots | |
times_long$method <- factor(times_long$method, | |
levels = c("apply", "split_lapply", "lapply_row", | |
"lapply_lapply", "lapply_lapply2", "nested_for", "mapply", | |
"pmap", "list_transpose", "transpose") | |
) | |
# Set up a column for labels | |
times_long$end_label <- sprintf("%s (%0.2fs)", times_long$method, times_long$seconds) | |
times_long$end_label[times_long$nrow != max(times_long$nrow)] <- NA | |
log10_breaks <- trans_breaks("log10", function(x) 10 ^ x) | |
log10_mbreaks <- function(x) { | |
limits <- c(floor(log10(x[1])), ceiling(log10(x[2]))) | |
breaks <- 10 ^ seq(limits[1], limits[2]) | |
unlist(lapply(breaks, function(x) x * seq(0.1, 0.9, by = 0.1))) | |
} | |
log10_labels <- trans_format("log10", math_format(10 ^ .x)) | |
# Plot with log-log axes | |
ggplot(times_long, aes(x = nrow, y = seconds, colour = method)) + | |
geom_point(size = 2) + | |
geom_line(linewidth = 1) + | |
geom_label_repel(aes(label = end_label), point.padding = 1, | |
direction = "y", nudge_x = 1.5) + | |
annotation_logticks(sides = "trbl") + | |
guides(colour = "none") + | |
theme_bw() + | |
scale_y_log10( | |
breaks = log10_breaks, labels = log10_labels, minor_breaks = log10_mbreaks | |
) + | |
scale_x_log10( | |
breaks = log10_breaks, labels = log10_labels, minor_breaks = log10_mbreaks | |
) | |
``` |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I just updated it again with
nested_for
. This is roughly the same aslapply_lapply2
, but instead of nestedlapply
calls, it uses nestedfor
loops.