jeremy-allen · May 10, 2020 00:32 · jeremy-allen · May 10, 2020
diff --git a/find_index.R b/find_index.R
 library(data.table)
 library(dplyr)
 library(purrr)

 # Build one Date per day from 1900-01-01 through 2900-12-31.
 date <- seq.Date(
   from = as.Date("1900-01-01"),
   to = as.Date("2900-12-31"),
   by = "day"
 )

 # One case count per date, counting up from zero: 0, 1, ..., length(date) - 1.
 # seq_along() states the intent directly instead of leaning on `:` binding
 # tighter than `-`.
 cases <- seq_along(date) - 1

 # Combine both vectors into a data frame with one row per day.
 df <- data.frame(date = date, cases = cases)

 # Locate the first row whose case count has reached 10,000 and return
 # the date stored on that row.
 dt <- data.table::as.data.table(df) # work on a data.table version of df
 dt[which.max(cases >= 10000), date]
 # This only works because at least one value in that column satisfies
 # the condition.

 # HOWEVER

 # When no element is TRUE, which.max() still returns 1, so the lookup
 # hands back our first date. That is not what we want: there are no
 # days with 400,000 or more cases, so we expect NA.
 dt[which.max(cases >= 400000), date]

 # Let's test many methods. Plus, we want to speed test them at the
 # end, so I'm putting each method inside a function because it's easier
 # to add them as functions in the speed test.

 # Baseline built on which.max(): convert df to a data.table and return the
 # date at the position which.max() reports for `cases >= 400000`.
 dt_which_max_method <- function() {
   cases_dt <- as.data.table(df)
   cases_dt[, {
     first_hit <- which.max(cases >= 400000)
     date[first_hit]
   }]
 }

 # match(true, x) will return NA when it fails, which
 # is what we want so that we don't get a date returned
 # when there are no days with 400,000 or more cases
 dt_match_true_method <- function() {
  dt <- as.data.table(df)
  dt[, date[match(TRUE, cases >= 100000)]]
 } 

 # Collect every position where the condition holds and keep the first one;
 # with no hits that first element is NA, so the date lookup is NA as well.
 dt_which_first_method <- function() {
   cases_dt <- as.data.table(df)
   cases_dt[, {
     hit_rows <- which(cases >= 400000)
     date[hit_rows[1L]]
   }]
 }

 # Base R's Position() reports the index of the first element accepted by
 # the predicate; it also returns NA when nothing qualifies.
 dt_position_method <- function() {
   at_least_400k <- function(n_cases) n_cases >= 400000
   cases_dt <- as.data.table(df)
   cases_dt[, date[Position(at_least_400k, cases)]]
 }

 #---- And now with a tibble and the purrr detect_index function

 # purrr::detect_index() supplies the row number handed to slice(); when no
 # row qualifies this returns 'Date of length 0' rather than NA.
 tidyverse_method <- function() {
   cases_tbl <- tibble::as_tibble(df)
   first_hit <- slice(
     cases_tbl,
     purrr::detect_index(cases, function(n_cases) n_cases >= 400000)
   )
   pull(first_hit, date)
 }

 # Tidyverse pipeline that uses base match() to locate the first row where
 # cases >= 400000 and pulls the date from that row.
 # The threshold is 400000, the same condition the other methods test, so
 # the benchmark compares like with like.
 # NOTE(review): match() gives NA when nothing qualifies; confirm how the
 # installed dplyr::slice() treats an NA index (expected: no rows).
 tidyverse_base_method <- function() {
   tb <- tibble::as_tibble(df)
   tb %>%
     slice(match(TRUE, cases >= 400000)) %>%
     pull(date)
 }

 # ---- Benchmark: time 500 runs of each method ----

 microbenchmark::microbenchmark(
   times = 500L,
   dt_which_max_method(),
   dt_match_true_method(),
   dt_which_first_method(),
   dt_position_method(),
   tidyverse_method(),
   tidyverse_base_method()
 )
	library(data.table)
	library(dplyr)
	library(purrr)

	# Build one Date per day from 1900-01-01 through 2900-12-31.
	date <- seq.Date(
	  from = as.Date("1900-01-01"),
	  to = as.Date("2900-12-31"),
	  by = "day"
	)

	# One case count per date, counting up from zero: 0, 1, ..., length(date) - 1.
	# seq_along() states the intent directly instead of leaning on `:` binding
	# tighter than `-`.
	cases <- seq_along(date) - 1

	# Combine both vectors into a data frame with one row per day.
	df <- data.frame(date = date, cases = cases)

	# Locate the first row whose case count has reached 10,000 and return
	# the date stored on that row.
	dt <- data.table::as.data.table(df) # work on a data.table version of df
	dt[which.max(cases >= 10000), date]
	# This only works because at least one value in that column satisfies
	# the condition.

	# HOWEVER

	# When no element is TRUE, which.max() still returns 1, so the lookup
	# hands back our first date. That is not what we want: there are no
	# days with 400,000 or more cases, so we expect NA.
	dt[which.max(cases >= 400000), date]

	# Let's test many methods. Plus, we want to speed test them at the
	# end, so I'm putting each method inside a function because it's easier
	# to add them as functions in the speed test.

	# Baseline built on which.max(): convert df to a data.table and return the
	# date at the position which.max() reports for `cases >= 400000`.
	dt_which_max_method <- function() {
	  cases_dt <- as.data.table(df)
	  cases_dt[, {
	    first_hit <- which.max(cases >= 400000)
	    date[first_hit]
	  }]
	}

	# match(true, x) will return NA when it fails, which
	# is what we want so that we don't get a date returned
	# when there are no days with 400,000 or more cases
	dt_match_true_method <- function() {
	dt <- as.data.table(df)
	dt[, date[match(TRUE, cases >= 100000)]]
	}

	# Collect every position where the condition holds and keep the first one;
	# with no hits that first element is NA, so the date lookup is NA as well.
	dt_which_first_method <- function() {
	  cases_dt <- as.data.table(df)
	  cases_dt[, {
	    hit_rows <- which(cases >= 400000)
	    date[hit_rows[1L]]
	  }]
	}

	# Base R's Position() reports the index of the first element accepted by
	# the predicate; it also returns NA when nothing qualifies.
	dt_position_method <- function() {
	  at_least_400k <- function(n_cases) n_cases >= 400000
	  cases_dt <- as.data.table(df)
	  cases_dt[, date[Position(at_least_400k, cases)]]
	}

	#---- And now with a tibble and the purrr detect_index function

	# purrr::detect_index() supplies the row number handed to slice(); when no
	# row qualifies this returns 'Date of length 0' rather than NA.
	tidyverse_method <- function() {
	  cases_tbl <- tibble::as_tibble(df)
	  first_hit <- slice(
	    cases_tbl,
	    purrr::detect_index(cases, function(n_cases) n_cases >= 400000)
	  )
	  pull(first_hit, date)
	}

	# Tidyverse pipeline that uses base match() to locate the first row where
	# cases >= 400000 and pulls the date from that row.
	# The threshold is 400000, the same condition the other methods test, so
	# the benchmark compares like with like.
	# NOTE(review): match() gives NA when nothing qualifies; confirm how the
	# installed dplyr::slice() treats an NA index (expected: no rows).
	tidyverse_base_method <- function() {
	  tb <- tibble::as_tibble(df)
	  tb %>%
	    slice(match(TRUE, cases >= 400000)) %>%
	    pull(date)
	}

	# ---- Benchmark: time 500 runs of each method ----

	microbenchmark::microbenchmark(
	  times = 500L,
	  dt_which_max_method(),
	  dt_match_true_method(),
	  dt_which_first_method(),
	  dt_position_method(),
	  tidyverse_method(),
	  tidyverse_base_method()
	)