library(data.table)
merge_overlaps <- function(tbl, start, end, max_gap = 0) {
grps <- dplyr::group_vars(tbl)
s <- rlang::enexpr(start)
e <- rlang::enexpr(end)
s_nm <- rlang::as_label(s)
e_nm <- rlang::as_label(e)
# make a copy to not pollute tbl in the calling env
tbl <- as.data.table(tbl)
setkeyv(tbl, c(grps, s_nm, e_nm))
tbl[, pe := shift(eval(e), fill = eval(s)[1L]) + max_gap, keyby = grps]
tbl[, g := 1L + cumsum(eval(s) > cummax(as.numeric(pe))), keyby = grps]
res <- tbl[, .(s = min(eval(s)), e = max(eval(e))), by = c(grps, "g")]
setnames(res, c("g", "s", "e"), c("seq", s_nm, e_nm))
dplyr::group_by_at(setDF(res), grps)
}
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:data.table':
#>
#> between, first, last
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
df <- tribble(
~id, ~start, ~end,
1, "2019-01-01", "2019-02-01",
1, "2019-01-31", "2019-02-28",
1, "2019-03-01", "2019-03-31",
2, "2019-01-10", "2019-01-20",
2, "2019-01-01", "2019-02-01"
) %>% mutate_if(is.character, lubridate::as_date)
df %>%
merge_overlaps(start, end)
#> # A tibble: 2 x 3
#> seq start end
#> <int> <date> <date>
#> 1 1 2019-01-01 2019-02-28
#> 2 2 2019-03-01 2019-03-31
df %>%
group_by(id) %>%
merge_overlaps(start, end)
#> # A tibble: 3 x 4
#> # Groups: id [2]
#> id seq start end
#> <dbl> <int> <date> <date>
#> 1 1 1 2019-01-01 2019-02-28
#> 2 1 2 2019-03-01 2019-03-31
#> 3 2 1 2019-01-01 2019-02-01
df %>%
group_by(id) %>%
merge_overlaps(start, end, max_gap = 1)
#> # A tibble: 2 x 4
#> # Groups: id [2]
#> id seq start end
#> <dbl> <int> <date> <date>
#> 1 1 1 2019-01-01 2019-03-31
#> 2 2 1 2019-01-01 2019-02-01Created on 2019-02-05 by the reprex package (v0.2.1.9000)