-
-
Save wch/0e564def155d976c04dd28a876dc04b4 to your computer and use it in GitHub Desktop.
--- | |
title: "Applying a function over rows of a data frame" | |
author: "Winston Chang" | |
output: html_document | |
editor_options: | |
chunk_output_type: console | |
--- | |
```{r setup, include=FALSE} | |
knitr::opts_chunk$set(collapse = TRUE, comment = "#>") | |
``` | |
[Source](https://gist.github.com/wch/0e564def155d976c04dd28a876dc04b4) for this document. | |
@daattali [asked](https://twitter.com/daattali/status/761058049859518464), "what's a safe way to iterate over rows of a data frame?" The example was to convert each row into a list and return a list of lists, indexed first by row, then by column.
A number of people gave suggestions on Twitter, which I've collected here. I've benchmarked these methods with data of various sizes; scroll down to see a plot of times. | |
```{r message=FALSE} | |
library(purrr) | |
library(dplyr) | |
library(tidyr) | |
# @dattali | |
# Using apply (only safe when all cols are same type) | |
# Turn each row of `df` into a list using apply() over the row margin.
# apply() coerces a data frame to a matrix before iterating, so all values in
# a row share one type by the time they are converted to a list.
f_apply <- function(df) {
  row_to_list <- function(values) as.list(values)
  apply(df, MARGIN = 1, FUN = row_to_list)
}
# @drob | |
# split + lapply | |
# Split `df` into one single-row data frame per row, then convert each piece
# to a plain list. The result is named by row position ("1", "2", ...).
f_split_lapply <- function(df) {
  row_frames <- split(df, seq_len(nrow(df)))
  lapply(row_frames, as.list)
}
# @winston_chang | |
# lapply over row indices | |
# Walk the row indices of `df`; for each one, subset a one-row data frame and
# convert it to a list. Returns an unnamed list with one element per row.
f_lapply_row <- function(df) {
  extract_row <- function(idx) {
    # drop = FALSE keeps the subset a data frame even when df has one column
    one_row <- df[idx, , drop = FALSE]
    as.list(one_row)
  }
  lapply(seq_len(nrow(df)), extract_row)
}
# @winston_chang | |
# lapply + lapply: Treat data frame like a list, and then slice out lists
# Nested lapply(): the outer call iterates over row indices, the inner call
# over column positions, picking single cells straight out of the data frame.
# The column positions carry the column names, so each row list is named.
f_lapply_lapply <- function(df) {
  col_index <- setNames(seq_along(df), names(df))
  lapply(seq_len(nrow(df)), function(i) {
    lapply(col_index, function(j) df[[j]][[i]])
  })
}
# @winston_chang | |
# lapply + lapply v2: Same as lapply_lapply, but explicitly convert df to a list | |
# Nested lapply() over row indices and column positions, with the data frame
# converted to a plain list of columns first so cells are indexed on a list
# rather than on a data frame.
f_lapply_lapply2 <- function(df) {
  row_index <- seq_len(nrow(df))
  col_index <- setNames(seq_along(df), names(df))
  columns <- as.list(df)
  lapply(row_index, function(i) {
    lapply(col_index, function(j) columns[[j]][[i]])
  })
}
# @winston_chang | |
# nested_for: Same as lapply_lapply2, but use a for loop instead of lapply() | |
# Build the list of row lists with two explicit for loops. Both the result
# and each row list are preallocated; the data frame is converted to a plain
# list of columns before the loops start.
f_nested_for <- function(df) {
  n_row <- nrow(df)
  n_col <- length(df)
  row_positions <- seq_len(n_row)
  col_positions <- seq_len(n_col)
  col_names <- names(df)
  columns <- as.list(df)

  out <- vector("list", n_row)
  for (r in row_positions) {
    record <- vector("list", n_col)
    for (k in col_positions) {
      record[[k]] <- columns[[k]][[r]]
    }
    # Names are attached after filling, once per row
    names(record) <- col_names
    out[[r]] <- record
  }
  out
}
# @ Tomasz Kalinowski | |
# .mapply | |
# Call list() once per row via .mapply(), passing the unclassed data frame
# (a named list of columns) as the parallel arguments.
f_mapply <- function(df) {
  columns <- unclass(df)
  .mapply(FUN = list, dots = columns, MoreArgs = NULL)
}
# @JennyBryan | |
# purrr::pmap | |
# Map list() over the columns of `df` in parallel with purrr::pmap(), giving
# one list per row.
f_pmap <- function(df) {
  pmap(.l = df, .f = list)
}
# purrr::list_transpose | |
# Convert `df` to a plain list of columns and hand it to
# purrr::list_transpose().
f_list_transpose <- function(df) {
  columns <- as.list(df)
  list_transpose(x = columns)
}
# purrr::transpose: This is superseded by list_transpose, but the old version is | |
# much faster. | |
# Convert `df` to a plain list of columns and hand it to purrr::transpose().
f_transpose <- function(df) {
  columns <- as.list(df)
  transpose(.l = columns)
}
``` | |
Benchmark each of them, using data sets with varying numbers of rows: | |
```{r} | |
# Time every row-iteration method once on a freshly generated data frame.
#
# nrow: number of rows in the generated data frame.
#
# Returns a named list: `nrow` first, followed by the elapsed seconds reported
# by system.time() for each method, in the order the methods are listed below.
run_benchmark <- function(nrow) {
  # Three numeric columns of random data
  df <- data.frame(x = rnorm(nrow), y = runif(nrow), z = runif(nrow))

  methods <- list(
    apply = f_apply,
    split_lapply = f_split_lapply,
    lapply_row = f_lapply_row,
    lapply_lapply = f_lapply_lapply,
    lapply_lapply2 = f_lapply_lapply2,
    nested_for = f_nested_for,
    mapply = f_mapply,
    pmap = f_pmap,
    list_transpose = f_list_transpose,
    transpose = f_transpose
  )

  # Keep only the wall-clock ("elapsed") component of each timing
  elapsed <- lapply(methods, function(method) {
    system.time(method(df))[["elapsed"]]
  })

  c(nrow = nrow, elapsed)
}
# Run the benchmarks for various size data | |
# Data sizes are 10^1 through 10^5 rows
all_times <- lapply(10^(1:5), run_benchmark)
# Each result is a named list: turn each into a one-row data frame and stack
times <- do.call(rbind, lapply(all_times, as.data.frame))
knitr::kable(times)
``` | |
## Plot times | |
This plot shows the number of seconds needed to process n rows, for each method. Both the x and y axes use log scales, so each step along the x scale represents a 10x increase in number of rows, and each step along the y scale represents a 10x increase in time.
```{r message=FALSE} | |
library(ggplot2) | |
library(scales) | |
library(ggrepel) | |
# Convert to long format: one row per (nrow, method) pair, with the timing in
# `seconds`. pivot_longer() replaces the superseded tidyr::gather().
times_long <- pivot_longer(
  times,
  cols = -nrow,
  names_to = "method",
  values_to = "seconds"
)
# Set order of methods, for plots
method_levels <- c(
  "apply", "split_lapply", "lapply_row",
  "lapply_lapply", "lapply_lapply2", "nested_for", "mapply",
  "pmap", "list_transpose", "transpose"
)
times_long$method <- factor(times_long$method, levels = method_levels)
# Set up a column for labels. Only the rows for the largest data size keep a
# label, so each method is labelled once, at the right-hand end of its line.
times_long$end_label <- sprintf(
  "%s (%0.2fs)", times_long$method, times_long$seconds
)
times_long$end_label[times_long$nrow != max(times_long$nrow)] <- NA
# Major axis breaks at powers of ten
log10_breaks <- trans_breaks("log10", function(x) 10 ^ x)
# Minor break positions for a log10 axis.
#
# x: numeric vector whose first two elements are the lower and upper limits.
#
# The limits are widened to whole powers of ten; for every power of ten in
# that range the function returns its 0.1, 0.2, ..., 0.9 multiples, ordered
# by power of ten.
log10_mbreaks <- function(x) {
  lower_exp <- floor(log10(x[1]))
  upper_exp <- ceiling(log10(x[2]))
  decades <- 10 ^ seq(lower_exp, upper_exp)
  tenths <- seq(0.1, 0.9, by = 0.1)
  # Pair every decade with every tenth, decade-major
  rep(decades, each = length(tenths)) * rep(tenths, times = length(decades))
}
# Axis labels formatted as powers of ten
log10_labels <- trans_format("log10", math_format(10 ^ .x))
# Plot with log-log axes: one coloured line per method, labelled at its
# right-hand end instead of through a legend.
ggplot(data = times_long, mapping = aes(x = nrow, y = seconds, colour = method)) +
  geom_point(size = 2) +
  geom_line(linewidth = 1) +
  geom_label_repel(
    aes(label = end_label),
    point.padding = 1,
    direction = "y",
    nudge_x = 1.5
  ) +
  annotation_logticks(sides = "trbl") +
  scale_x_log10(
    breaks = log10_breaks,
    labels = log10_labels,
    minor_breaks = log10_mbreaks
  ) +
  scale_y_log10(
    breaks = log10_breaks,
    labels = log10_labels,
    minor_breaks = log10_mbreaks
  ) +
  theme_bw() +
  guides(colour = "none")
``` |
Thanks! It actually takes over twice as long using `row=list(.)` instead of `row=list(row)`, but on the plus side, `row=list(.)` gives the correct result.
According to @hadley, the reason `rowwise` is slower than `pmap` is:
> Because rowwise() is a poorly thought out dumpster fire?
Thanks for doing this. I had an error when replicating. Error in by_row(df, function(row) as.list(row)) : could not find function "by_row"
I added library purrrlyr and the code worked just fine.
@wch probably knows this already, but just for reference -- apparently `pmap` will now automatically do the `as.list` coercion given a data.frame, so the Jenny solution should now be equivalent in performance to the explicit coercion.
Definitely interesting to see how much these patterns have evolved and deprecated over the past few years alone!
I've just updated this with a `lapply_lapply2` function, which is 4x faster than the previous `lapply_lapply` function. It's essentially the same, except it converts `df` from a data frame to a list. This is because `[[.data.frame` is slow compared to using `[[` on a list.
Updated result at: https://rpubs.com/wch/1008771
I just updated it again with `nested_for`. This is roughly the same as `lapply_lapply2`, but instead of nested `lapply` calls, it uses nested `for` loops.
dplyr method should be something like
Won't change run time much, I'm wondering why it's so much slower than purrr.