Introduction

In this gist, we replicate two simulations by (Liu et al. 2013), and show how the R-package densityratio can be used for change-point detection in time-series data. To do so, we first load the required packages, and set the future environment to do parallel processing.

library(furrr)
#> Loading required package: future
library(densityratio)
plan(multisession(workers = 18))

Time-series data generation

We generate $T = 4999$ samples according to the following model $$y(t) = 0.6(t-1) - 0.5y(t-2) + \epsilon_t.$$ In the first simulation, the residuals are distributed as a normal $\mathcal{N}(\mu_t, \sigma = 1.5)$ distribution, where $\mu_t$ reflects the changepoints, and is specified as

such that the changepoints become more pronounced as time progresses. In the second simulation, the residuals are distributed as a normal $\mathcal{N}(\mu = 0, \sigma_t)$ distribution, where the residual standard deviation is varied as

such that again the changepoints become more pronounced as time progresses.

set.seed(1)

y1 <- y2 <- numeric(4999)
mu <- numeric(50)

for (n in 2:50) mu[n] <- mu[n-1] + n/16

for (t in 3:4999) {
  mu_t <- mu[t%/%100 + 1]
  y1[t] <- 0.6 * y1[t-1] - 0.5 * y1[t-2] + rnorm(1, mu_t, 1.5)
}

sigma <- numeric(50)

for (n in 1:50) {
  if (n %% 2 == 0) {
    sigma[n] <- log(exp(1) + n/4)
  } else {
    sigma[n] <- 1
  }
}

for (t in 3:4999) {
  sigma_t <- sigma[t%/%100 + 1]  
  y2[t] <- 0.6 * y2[t-1] - 0.5 * y2[t-2] + rnorm(1, 0, sigma_t)
}

plot(4000:4999, y1[4000:4999], type = "l", col = "blue")
abline(v = c(40:50*100))

plot(4000:4999, y2[4000:4999], type = "l", col = "blue")
abline(v = c(40:50*100))

If we plot the distribution of the data sets, we can indeed see a shift in the distribution at the specified timepoints.

Estimating the change-point using `ulsif()`

The next step is to estimate the density ratio between all neighbouring data sets. That is, we estimated the density ratio between the points $(y_{t-n+1}, y_{t-n+2}, \dots, y_t)$ and the points $(y_{t+1}, y_{t+2}, \dots, y_{t+n})$, such that we compare two data sets of size $n$. Moreover, we consider $k=5$ lags, such that for timepoint $t$, we also consider the values of $y$ at the timepoints $t-1, t-1, \dots, t - k$.

k <- 5
n <- 50

Y1t <- purrr::map(
  1:length(y1),
  ~y1[.x:min(.x+k - 1, length(y1))]
)

curly_Y1t <- purrr::map(
  3500:4499,
  function(t) {
    dat1 <- Y1t[(t-n):(t-1)]
    dat2 <- Y1t[(t):(t+n-1)]
    names(dat1) <- names(dat2) <- paste0("X", 1:n)
    list(data.frame(dat1), data.frame(dat2))
  }
)

Y2t <- purrr::map(
  1:length(y2),
  ~y2[max(1, .x-k):.x]
)

curly_Y2t <- purrr::map(
  3500:4499,
  function(t) {
    dat1 <- Y2t[(t-n+1):t]
    dat2 <- Y2t[(t+1):(t+n)]
    names(dat1) <- names(dat2) <- paste0("X", 1:n)
    list(data.frame(dat1), data.frame(dat2))
  }
)



fits1 <- furrr::future_map_dbl(
  1:(length(curly_Y1t) - 1), 
  function(t) {
    dat <- unlist(curly_Y1t[t], recursive = FALSE)
    dat1 <- t(dat[[1]])
    dat2 <- t(dat[[2]])
    fit <- ulsif(
      dat1,
      dat2,
      sigma_quantile = c(0.3, 0.4, 0.5, 0.6, 0.7),
      lambda = 10^{-3:1},
      ncenters = 50
    )
    fit_rec <- ulsif(
      dat2,
      dat1,
      sigma_quantile = c(0.3, 0.4, 0.5, 0.6, 0.7),
      lambda = 10^{-3:1},
      ncenters = 50
    )
    PE <- summary(fit)$PE + summary(fit_rec)$PE
    PE
  }, .progress = TRUE, .options = furrr_options(seed = TRUE))

fits2 <- furrr::future_map_dbl(
  1:(length(curly_Y2t) - 1), 
  function(t) {
    dat <- unlist(curly_Y2t[t], recursive = FALSE)
    dat1 <- t(dat[[1]])
    dat2 <- t(dat[[2]])
    fit <- ulsif(
      dat1,
      dat2,
      sigma_quantile = c(0.3, 0.4, 0.5, 0.6, 0.7),
      lambda = 10^{-3:1},
      ncenters = 50
    )
    fit_rec <- ulsif(
      dat2,
      dat1,
      sigma_quantile = c(0.3, 0.4, 0.5, 0.6, 0.7),
      lambda = 10^{-3:1},
      ncenters = 50
    )
    PE <- summary(fit)$PE + summary(fit_rec)$PE
    PE
  }, .progress = TRUE, .options = furrr_options(seed = TRUE))

plot(3501:4499, fits1)
abline(v = c(36:44*100))

plot(3501:4499, fits2)
abline(v = c(36:44*100))

^{Created on 2023-10-30 with reprex v2.0.2}

References

Liu, Song, Makoto Yamada, Nigel Collier, and Masashi Sugiyama. 2013. “Change-Point Detection in Time-Series Data by Relative Density-Ratio Estimation.” Neural Networks 43 (January): 72–83. https://doi.org/10.1016/j.neunet.2013.01.012.

thomvolker/cpd_ulsif.md

Introduction

Time-series data generation

Estimating the change-point using ulsif()

References

Estimating the change-point using `ulsif()`