Last active
May 10, 2020 00:32
-
-
Save jeremy-allen/fbc87bed7931dc095850e10ef739b7db to your computer and use it in GitHub Desktop.
Get the position number of the first instance of a thing in one column, and use that number to pick something from another column
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(data.table) | |
library(dplyr) | |
library(purrr) | |
# lots of dates | |
date = seq.Date(from = as.Date("1900-01-01"), | |
to = as.Date("2900-12-31"), | |
by = "day") | |
# lots of cases | |
cases = c(1:length(date)-1) | |
# make a dataframe | |
df <- data.frame(date = date, cases = cases) | |
# Get the position of the first instance of 10,000 in the cases col, | |
# and use that number to index the date col, returning the first date | |
# on which 10,000 cases occurred. | |
dt <- as.data.table(df) # convert to data.table first | |
dt[, date[which.max(cases >= 10000)]] | |
# this only works because 10000 is a vlaue that can be found in that column. | |
# HOWEVER | |
# which.max returns 1 when it fails, thus indexing | |
# our first date, which we do not want because there | |
# are no days with 400,000 or more cases. We expect NA. | |
dt[, date[which.max(cases >= 400000)]] | |
# Let's test many methods. Plus, we want to speed test them at the | |
# end, so I'm putting each method inside a function because it's easier | |
# to add them as functions in the speed test. | |
dt_which_max_method <- function() { | |
dt <- as.data.table(df) | |
dt[, date[which.max(cases >= 400000)]] | |
} | |
# match(true, x) will return NA when it fails, which | |
# is what we want so that we don't get a date returned | |
# when there are no days with 400,000 or more cases | |
dt_match_true_method <- function() { | |
dt <- as.data.table(df) | |
dt[, date[match(TRUE, cases >= 100000)]] | |
} | |
# test them all and return the first one, also returns NA | |
dt_which_first_method <- function() { | |
dt <- as.data.table(df) | |
dt[, date[which(cases >= 400000)[1L]]] | |
} | |
# use base R's Position function, also returns NA | |
dt_position_method <- function() { | |
dt <- as.data.table(df) | |
dt[, date[Position(function(x) x >= 400000, cases)]] | |
} | |
#---- And now with a tibble and the purrr detect_index function | |
# returns 'Date of length 0' | |
tidyverse_method <- function() { | |
tb <- tibble::as_tibble(df) | |
tb %>% | |
slice(purrr::detect_index(cases, ~.x >= 400000)) %>% | |
pull(date) | |
} | |
# tidyverse mixed with the base match function | |
tidyverse_base_method <- function() { | |
tb <- tibble::as_tibble(df) | |
tb %>% | |
slice(match(TRUE, cases >= 100000)) %>% | |
pull(date) | |
} | |
#--- Speed test them each 500 times | |
microbenchmark::microbenchmark( | |
dt_which_max_method(), | |
dt_match_true_method(), | |
dt_which_first_method(), | |
dt_position_method(), | |
tidyverse_method(), | |
tidyverse_base_method(), | |
times = 500L | |
) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment

Important clarification from Hadley on the use of purrr::detect_index()