wch · February 28, 2023 03:35 · wch · Feb 28, 2023
diff --git a/rowwise.Rmd b/rowwise.Rmd
 ---
 title: "Applying a function over rows of a data frame"
 author: "Winston Chang"
 output: html_document
 editor_options: 
  chunk_output_type: console
 ---

 ```{r setup, include=FALSE}
 knitr::opts_chunk$set(collapse = TRUE, comment = "#>")
 ```

 [Source](https://gist.github.com/wch/0e564def155d976c04dd28a876dc04b4) for this document.

 @dattali [asked](https://twitter.com/daattali/status/761058049859518464), "what's a safe way to iterate over rows of a data frame?" The example was to convert each row into a list and return a list of lists, indexed first by column, then by row.

 A number of people gave suggestions on Twitter, which I've collected here. I've benchmarked these methods with data of various sizes; scroll down to see a plot of times.

 ```{r message=FALSE}
 library(purrr)
 library(dplyr)
 library(tidyr)

 # @dattali
 # Using apply (only safe when all cols are same type)
 f_apply <- function(df) {
  apply(df, 1, function(row) as.list(row))  
 }

 # @drob
 # split + lapply
 f_split_lapply <- function(df) {
  df <- split(df, seq_len(nrow(df)))
  lapply(df, function(row) as.list(row))
 }

 # @winston_chang
 # lapply over row indices
 f_lapply_row <- function(df) {
  lapply(seq_len(nrow(df)), function(i) as.list(df[i,,drop=FALSE]))
 }

 # @winston_chang
 # lapply + lapply: Treat data frame like a list, and the slice out lists
 f_lapply_lapply <- function(df) {
  cols <- seq_len(length(df))
  names(cols) <- names(df)

  lapply(seq_len(nrow(df)), function(row) {
    lapply(cols, function(col) {
      df[[col]][[row]]
    })
  })
 }

 # @winston_chang
 # lapply + lapply v2: Same as lapply_lapply, but explicitly convert df to a list
 f_lapply_lapply2 <- function(df) {
  rows <- seq_len(nrow(df))
  cols <- seq_len(length(df))
  names(cols) <- names(df)
  df <- as.list(df)

  lapply(rows, function(row) {
    lapply(cols, function(col) {
      df[[col]][[row]]
    })
  })
 }

 # @winston_chang
 # nested_for: Same as lapply_lapply2, but use a for loop instead of lapply()
 f_nested_for <- function(df) {
  nrows <- nrow(df)
  ncols <- length(df)
  row_idxs <- seq_len(nrows)
  col_idxs <- seq_len(ncols)
  colnames <- names(df)
  df <- as.list(df)
  
  res <- vector("list", nrows)
  
  for (i in row_idxs) {
    row <- vector("list", ncols)
    for (j in col_idxs) {
      row[[j]] <- df[[j]][[i]]
    }
    names(row) <- colnames
    res[[i]] <- row
  }

  res
 }

 # @ Tomasz Kalinowski
 # .mapply
 f_mapply <- function(df) {
  .mapply(list, unclass(df), NULL)
 }


 # @JennyBryan
 # purrr::pmap
 f_pmap <- function(df) {
  pmap(df, list)
 }

 # purrr::list_transpose
 f_list_transpose <- function(df) {
  list_transpose(as.list(df))
 }

 # purrr::transpose: This is superseded by list_transpose, but the old version is
 # much faster.
 f_transpose <- function(df) {
  transpose(as.list(df))
 }
 ```


 Benchmark each of them, using data sets with varying numbers of rows:

 ```{r}
 run_benchmark <- function(nrow) {
  # Make some data
  df <- data.frame(
    x = rnorm(nrow),
    y = runif(nrow),
    z = runif(nrow)
  )
  
  res <- list(
    apply          = system.time(f_apply(df)),
    split_lapply   = system.time(f_split_lapply(df)),
    lapply_row     = system.time(f_lapply_row(df)),
    lapply_lapply  = system.time(f_lapply_lapply(df)),
    lapply_lapply2 = system.time(f_lapply_lapply2(df)),
    nested_for     = system.time(f_nested_for(df)),
    mapply         = system.time(f_mapply(df)),
    pmap           = system.time(f_pmap(df)),
    list_transpose = system.time(f_list_transpose(df)),
    transpose      = system.time(f_transpose(df))
  )
  
  # Get elapsed times
  res <- lapply(res, `[[`, "elapsed")

  # Add nrow to front
  res <- c(nrow = nrow, res)
  res
 }

 # Run the benchmarks for various size data
 all_times <- lapply(1:5, function(n) {
  run_benchmark(10^n)
 })

 # Convert to data frame
 times <- lapply(all_times, as.data.frame)
 times <- do.call(rbind, times)

 knitr::kable(times)
 ```


 ## Plot times

 This plot shows the number of seconds needed to process n rows, for each method. Both the x and y use log scales, so each step along the x scale represents a 10x increase in number of rows, and each step along the y scale represents a 10x increase in time.

 ```{r message=FALSE}
 library(ggplot2)
 library(scales)
 library(ggrepel)

 # Convert to long format
 times_long <- gather(times, method, seconds, -nrow)

 # Set order of methods, for plots
 times_long$method <- factor(times_long$method,
  levels = c("apply", "split_lapply", "lapply_row",
    "lapply_lapply", "lapply_lapply2", "nested_for", "mapply", 
    "pmap", "list_transpose", "transpose")
 )

 # Set up a column for labels
 times_long$end_label <- sprintf("%s (%0.2fs)", times_long$method, times_long$seconds)
 times_long$end_label[times_long$nrow != max(times_long$nrow)] <- NA


 log10_breaks <- trans_breaks("log10", function(x) 10 ^ x)
 log10_mbreaks <- function(x) {
  limits <- c(floor(log10(x[1])), ceiling(log10(x[2])))
  breaks <- 10 ^ seq(limits[1], limits[2])
  unlist(lapply(breaks, function(x) x * seq(0.1, 0.9, by = 0.1)))
 }
 log10_labels <- trans_format("log10", math_format(10 ^ .x))

 # Plot with log-log axes
 ggplot(times_long, aes(x = nrow, y = seconds, colour = method)) +
  geom_point(size = 2) +
  geom_line(linewidth = 1) +
  geom_label_repel(aes(label = end_label), point.padding = 1,
                  direction = "y", nudge_x = 1.5) +
  annotation_logticks(sides = "trbl") +
  guides(colour = "none") +
  theme_bw() +
  scale_y_log10(
    breaks = log10_breaks, labels = log10_labels, minor_breaks = log10_mbreaks
  ) +
  scale_x_log10(
    breaks = log10_breaks, labels = log10_labels, minor_breaks = log10_mbreaks
  )
 ```
	---
	title: "Applying a function over rows of a data frame"
	author: "Winston Chang"
	output: html_document
	editor_options:
	chunk_output_type: console
	---

	```{r setup, include=FALSE}
	knitr::opts_chunk$set(collapse = TRUE, comment = "#>")
	```

	[Source](https://gist.github.com/wch/0e564def155d976c04dd28a876dc04b4) for this document.

	@dattali [asked](https://twitter.com/daattali/status/761058049859518464), "what's a safe way to iterate over rows of a data frame?" The example was to convert each row into a list and return a list of lists, indexed first by column, then by row.

	A number of people gave suggestions on Twitter, which I've collected here. I've benchmarked these methods with data of various sizes; scroll down to see a plot of times.

	```{r message=FALSE}
	library(purrr)
	library(dplyr)
	library(tidyr)

	# @dattali
	# Using apply (only safe when all cols are same type)
	f_apply <- function(df) {
	apply(df, 1, function(row) as.list(row))
	}

	# @drob
	# split + lapply
	f_split_lapply <- function(df) {
	df <- split(df, seq_len(nrow(df)))
	lapply(df, function(row) as.list(row))
	}

	# @winston_chang
	# lapply over row indices
	f_lapply_row <- function(df) {
	lapply(seq_len(nrow(df)), function(i) as.list(df[i,,drop=FALSE]))
	}

	# @winston_chang
	# lapply + lapply: Treat data frame like a list, and the slice out lists
	f_lapply_lapply <- function(df) {
	cols <- seq_len(length(df))
	names(cols) <- names(df)

	lapply(seq_len(nrow(df)), function(row) {
	lapply(cols, function(col) {
	df[[col]][[row]]
	})
	})
	}

	# @winston_chang
	# lapply + lapply v2: Same as lapply_lapply, but explicitly convert df to a list
	f_lapply_lapply2 <- function(df) {
	rows <- seq_len(nrow(df))
	cols <- seq_len(length(df))
	names(cols) <- names(df)
	df <- as.list(df)

	lapply(rows, function(row) {
	lapply(cols, function(col) {
	df[[col]][[row]]
	})
	})
	}

	# @winston_chang
	# nested_for: Same as lapply_lapply2, but use a for loop instead of lapply()
	f_nested_for <- function(df) {
	nrows <- nrow(df)
	ncols <- length(df)
	row_idxs <- seq_len(nrows)
	col_idxs <- seq_len(ncols)
	colnames <- names(df)
	df <- as.list(df)

	res <- vector("list", nrows)

	for (i in row_idxs) {
	row <- vector("list", ncols)
	for (j in col_idxs) {
	row[[j]] <- df[[j]][[i]]
	}
	names(row) <- colnames
	res[[i]] <- row
	}

	res
	}

	# @ Tomasz Kalinowski
	# .mapply
	f_mapply <- function(df) {
	.mapply(list, unclass(df), NULL)
	}


	# @JennyBryan
	# purrr::pmap
	f_pmap <- function(df) {
	pmap(df, list)
	}

	# purrr::list_transpose
	f_list_transpose <- function(df) {
	list_transpose(as.list(df))
	}

	# purrr::transpose: This is superseded by list_transpose, but the old version is
	# much faster.
	f_transpose <- function(df) {
	transpose(as.list(df))
	}
	```


	Benchmark each of them, using data sets with varying numbers of rows:

	```{r}
	run_benchmark <- function(nrow) {
	# Make some data
	df <- data.frame(
	x = rnorm(nrow),
	y = runif(nrow),
	z = runif(nrow)
	)

	res <- list(
	apply = system.time(f_apply(df)),
	split_lapply = system.time(f_split_lapply(df)),
	lapply_row = system.time(f_lapply_row(df)),
	lapply_lapply = system.time(f_lapply_lapply(df)),
	lapply_lapply2 = system.time(f_lapply_lapply2(df)),
	nested_for = system.time(f_nested_for(df)),
	mapply = system.time(f_mapply(df)),
	pmap = system.time(f_pmap(df)),
	list_transpose = system.time(f_list_transpose(df)),
	transpose = system.time(f_transpose(df))
	)

	# Get elapsed times
	res <- lapply(res, `[[`, "elapsed")

	# Add nrow to front
	res <- c(nrow = nrow, res)
	res
	}

	# Run the benchmarks for various size data
	all_times <- lapply(1:5, function(n) {
	run_benchmark(10^n)
	})

	# Convert to data frame
	times <- lapply(all_times, as.data.frame)
	times <- do.call(rbind, times)

	knitr::kable(times)
	```


	## Plot times

	This plot shows the number of seconds needed to process n rows, for each method. Both the x and y use log scales, so each step along the x scale represents a 10x increase in number of rows, and each step along the y scale represents a 10x increase in time.

	```{r message=FALSE}
	library(ggplot2)
	library(scales)
	library(ggrepel)

	# Convert to long format
	times_long <- gather(times, method, seconds, -nrow)

	# Set order of methods, for plots
	times_long$method <- factor(times_long$method,
	levels = c("apply", "split_lapply", "lapply_row",
	"lapply_lapply", "lapply_lapply2", "nested_for", "mapply",
	"pmap", "list_transpose", "transpose")
	)

	# Set up a column for labels
	times_long$end_label <- sprintf("%s (%0.2fs)", times_long$method, times_long$seconds)
	times_long$end_label[times_long$nrow != max(times_long$nrow)] <- NA


	log10_breaks <- trans_breaks("log10", function(x) 10 ^ x)
	log10_mbreaks <- function(x) {
	limits <- c(floor(log10(x[1])), ceiling(log10(x[2])))
	breaks <- 10 ^ seq(limits[1], limits[2])
	unlist(lapply(breaks, function(x) x * seq(0.1, 0.9, by = 0.1)))
	}
	log10_labels <- trans_format("log10", math_format(10 ^ .x))

	# Plot with log-log axes
	ggplot(times_long, aes(x = nrow, y = seconds, colour = method)) +
	geom_point(size = 2) +
	geom_line(linewidth = 1) +
	geom_label_repel(aes(label = end_label), point.padding = 1,
	direction = "y", nudge_x = 1.5) +
	annotation_logticks(sides = "trbl") +
	guides(colour = "none") +
	theme_bw() +
	scale_y_log10(
	breaks = log10_breaks, labels = log10_labels, minor_breaks = log10_mbreaks
	) +
	scale_x_log10(
	breaks = log10_breaks, labels = log10_labels, minor_breaks = log10_mbreaks
	)
	```