Skip to content

Instantly share code, notes, and snippets.

@mikmart
Last active February 5, 2019 10:19
Show Gist options
  • Select an option

  • Save mikmart/0ffa090bd2a6c137fc04a53e9d7f6d95 to your computer and use it in GitHub Desktop.

Select an option

Save mikmart/0ffa090bd2a6c137fc04a53e9d7f6d95 to your computer and use it in GitHub Desktop.
57th take at merging overlapping intervals in a data frame
library(data.table)

merge_overlaps <- function(tbl, start, end, max_gap = 0) {
  grps <- dplyr::group_vars(tbl)
  
  s <- rlang::enexpr(start)
  e <- rlang::enexpr(end)
  
  s_nm <- rlang::as_label(s)
  e_nm <- rlang::as_label(e)
  
  # make a copy to not pollute tbl in the calling env
  tbl <- as.data.table(tbl)
  setkeyv(tbl, c(grps, s_nm, e_nm))
  
  tbl[, pe := shift(eval(e), fill = eval(s)[1L]) + max_gap, keyby = grps]
  tbl[, g := 1L + cumsum(eval(s) > cummax(as.numeric(pe))), keyby = grps]
  res <- tbl[, .(s = min(eval(s)), e = max(eval(e))), by = c(grps, "g")]
  
  setnames(res, c("g", "s", "e"), c("seq", s_nm, e_nm))
  dplyr::group_by_at(setDF(res), grps)
}

library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:data.table':
#> 
#>     between, first, last
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union

df <- tribble(
  ~id, ~start,       ~end,
    1, "2019-01-01", "2019-02-01",
    1, "2019-01-31", "2019-02-28",
    1, "2019-03-01", "2019-03-31",
    2, "2019-01-10", "2019-01-20",
    2, "2019-01-01", "2019-02-01"
) %>% mutate_if(is.character, lubridate::as_date)

df %>% 
  merge_overlaps(start, end)
#> # A tibble: 2 x 3
#>     seq start      end       
#>   <int> <date>     <date>    
#> 1     1 2019-01-01 2019-02-28
#> 2     2 2019-03-01 2019-03-31

df %>% 
  group_by(id) %>% 
  merge_overlaps(start, end)
#> # A tibble: 3 x 4
#> # Groups:   id [2]
#>      id   seq start      end       
#>   <dbl> <int> <date>     <date>    
#> 1     1     1 2019-01-01 2019-02-28
#> 2     1     2 2019-03-01 2019-03-31
#> 3     2     1 2019-01-01 2019-02-01

df %>% 
  group_by(id) %>% 
  merge_overlaps(start, end, max_gap = 1)
#> # A tibble: 2 x 4
#> # Groups:   id [2]
#>      id   seq start      end       
#>   <dbl> <int> <date>     <date>    
#> 1     1     1 2019-01-01 2019-03-31
#> 2     2     1 2019-01-01 2019-02-01

Created on 2019-02-05 by the reprex package (v0.2.1.9000)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment