// ED-PELT algorithm
// Copyright (c) 2019 Andrey Akinshin
// Licensed under The MIT License https://opensource.org/licenses/MIT
using System;
using System.Collections.Generic;
using System.Linq;
/// <summary>
/// The ED-PELT algorithm for changepoint detection.
///
/// <remarks>
/// The implementation is based on the following papers:
/// <list type="bullet">
/// <item>
/// <b>[Haynes2017]</b> Haynes, Kaylea, Paul Fearnhead, and Idris A. Eckley.
/// "A computationally efficient nonparametric approach for changepoint detection."
/// Statistics and Computing 27, no. 5 (2017): 1293-1305.
/// https://doi.org/10.1007/s11222-016-9687-5
/// </item>
/// <item>
/// <b>[Killick2012]</b> Killick, Rebecca, Paul Fearnhead, and Idris A. Eckley.
/// "Optimal detection of changepoints with a linear computational cost."
/// Journal of the American Statistical Association 107, no. 500 (2012): 1590-1598.
/// https://arxiv.org/pdf/1101.1438.pdf
/// </item>
/// </list>
/// </remarks>
/// </summary>
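/// <example>
/// A minimal usage sketch (based on the example from the method documentation below):
/// <code>
/// var data = new double[] { 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2 };
/// int[] changePoints = EdPeltChangePointDetector.Instance.GetChangePointIndexes(data);
/// // changePoints is { 5, 11 }: each index marks the last element of a detected segment
/// </code>
/// </example>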
public class EdPeltChangePointDetector
{
public static readonly EdPeltChangePointDetector Instance = new EdPeltChangePointDetector();
/// <summary>
/// For a given array of `double` values, detects locations of changepoints that
/// split the original series of values into "statistically homogeneous" segments.
/// Such points correspond to moments when statistical properties of the distribution are changing.
///
/// This method makes no assumptions about the underlying distribution (it's a nonparametric approach)
/// and has O(N*log(N)) algorithmic complexity.
/// </summary>
/// <param name="data">An array of double values</param>
/// <param name="minDistance">Minimum distance between changepoints</param>
/// <returns>
/// Returns an `int[]` array with 0-based indexes of changepoints.
/// Changepoints correspond to the end of the detected segments.
/// For example, changepoints for { 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2 } are { 5, 11 }.
/// </returns>
public int[] GetChangePointIndexes(double[] data, int minDistance = 1)
{
// We will use `n` as the number of elements in the `data` array
int n = data.Length;
// Checking corner cases
if (n <= 2)
return new int[0];
if (minDistance < 1 || minDistance > n)
throw new ArgumentOutOfRangeException(
nameof(minDistance), $"{nameof(minDistance)} = {minDistance} should be in range from 1 to data.Length");
// The penalty which we add to the final cost for each additional changepoint
// Here we use the Modified Bayesian Information Criterion
double penalty = 3 * Math.Log(n);
// `k` is the number of quantiles that we use to approximate an integral during the segment cost evaluation
// We use `k=Ceiling(4*log(n))` as suggested in the Section 4.3 "Choice of K in ED-PELT" in [Haynes2017]
// `k` can't be greater than `n`, so we should always use the `Min` function here (important for n <= 8)
int k = Math.Min(n, (int) Math.Ceiling(4 * Math.Log(n)));
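// For example, for n = 1000: penalty = 3 * ln(1000) ≈ 20.72 and k = Ceiling(4 * ln(1000)) = Ceiling(27.63) = 28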
// We precalculate partial sums for the empirical CDF; this allows fast evaluation of the segment cost
var partialSums = GetPartialSums(data, k);
// Since we use the same values of `partialSums`, `k`, `n` all the time,
// we introduce a shortcut `Cost(tau1, tau2)` for segment cost evaluation.
// Hereinafter, we use `tau` to name variables that are changepoint candidates.
double Cost(int tau1, int tau2) => GetSegmentCost(partialSums, tau1, tau2, k, n);
// We will use dynamic programming to find the best solution; `bestCost` is the cost array.
// `bestCost[i]` is the cost for subarray `data[0..i-1]`.
// It's a 1-based array (`data[0]`..`data[n-1]` correspond to `bestCost[1]`..`bestCost[n]`)
var bestCost = new double[n + 1];
bestCost[0] = -penalty;
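// `bestCost[0] = -penalty` compensates the `+ penalty` term that is added below for every segment,
// so the first segment (which always exists) is not charged as a changepoint.
// The next loop initializes prefixes that are too short to contain a changepoint (a single segment).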
for (int currentTau = minDistance; currentTau < 2 * minDistance; currentTau++)
bestCost[currentTau] = Cost(0, currentTau);
// `previousChangePointIndex` is an array of references to previous changepoints. If the current segment ends at
// the position `i`, the previous segment ends at the position `previousChangePointIndex[i]`. It's a 1-based
// array (`data[0]`..`data[n-1]` correspond to the `previousChangePointIndex[1]`..`previousChangePointIndex[n]`)
var previousChangePointIndex = new int[n + 1];
// We use PELT (Pruned Exact Linear Time) approach which means that instead of enumerating all possible previous
// tau values, we use a whitelist of "good" tau values that can be used in the optimal solution. If we are 100%
// sure that some of the tau values will not help us to form the optimal solution, such values should be
// removed. See [Killick2012] for details.
var previousTaus = new List<int>(n + 1) { 0, minDistance };
var costForPreviousTau = new List<double>(n + 1);
// Following the dynamic programming approach, we enumerate all tau positions. For each `currentTau`, we pretend
// that it's the end of the last segment and try to find the end of the previous segment.
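// In formula form, the recurrence is: bestCost[t] = min over tau of (bestCost[tau] + Cost(tau, t) + penalty)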
for (int currentTau = 2 * minDistance; currentTau < n + 1; currentTau++)
{
// For each previous tau, we should calculate the cost of taking this tau as the end of the previous
// segment. This cost equals the cost for the `previousTau` plus the cost of the new segment (from
// `previousTau` to `currentTau`) plus the penalty for the new changepoint.
costForPreviousTau.Clear();
foreach (int previousTau in previousTaus)
costForPreviousTau.Add(bestCost[previousTau] + Cost(previousTau, currentTau) + penalty);
// Now we should choose the tau that provides the minimum possible cost.
int bestPreviousTauIndex = WhichMin(costForPreviousTau);
bestCost[currentTau] = costForPreviousTau[bestPreviousTauIndex];
previousChangePointIndex[currentTau] = previousTaus[bestPreviousTauIndex];
// Prune phase: we remove "useless" tau values that will not help to achieve the minimum cost in the future
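// A previousTau gets pruned when bestCost[previousTau] + Cost(previousTau, currentTau) >= bestCost[currentTau];
// this matches the PELT pruning condition from [Killick2012] with the constant K = 0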
double currentBestCost = bestCost[currentTau];
int newPreviousTausSize = 0;
for (int i = 0; i < previousTaus.Count; i++)
if (costForPreviousTau[i] < currentBestCost + penalty)
previousTaus[newPreviousTausSize++] = previousTaus[i];
previousTaus.RemoveRange(newPreviousTausSize, previousTaus.Count - newPreviousTausSize);
// We add a new tau value that is exactly `minDistance` away from the next `currentTau` value
previousTaus.Add(currentTau - (minDistance - 1));
}
// Here we collect the result list of changepoint indexes `changePointIndexes` using `previousChangePointIndex`
var changePointIndexes = new List<int>();
int currentIndex = previousChangePointIndex[n]; // The index of the end of the last segment is `n`
while (currentIndex != 0)
{
changePointIndexes.Add(currentIndex - 1); // 1-based indexes should be transformed to 0-based indexes
currentIndex = previousChangePointIndex[currentIndex];
}
changePointIndexes.Reverse(); // The result changepoints should be sorted in ascending order.
return changePointIndexes.ToArray();
}
/// <summary>
/// Partial sums for empirical CDF (formula (2.1) from Section 2.1 "Model" in [Haynes2017])
/// <code>
/// partialSums[i, tau] = (count(data[j] &lt; t) * 2 + count(data[j] == t) * 1) for j=0..tau-1
/// where t is the i-th quantile value (see Section 3.1 "Discrete approximation" in [Haynes2017] for details)
/// </code>
/// <remarks>
/// <list type="bullet">
/// <item>
/// We use doubled sum values in order to use <c>int[,]</c> instead of <c>double[,]</c> (it provides noticeable
/// performance boost). Thus, multipliers for <c>count(data[j] &lt; t)</c> and <c>count(data[j] == t)</c> are
/// 2 and 1 instead of 1 and 0.5 from the [Haynes2017].
/// </item>
/// <item>
/// Note that these quantiles are not uniformly distributed: tails of the <c>data</c> distribution contain more
/// quantile values than the center of the distribution.
/// </item>
/// </list>
/// </remarks>
/// </summary>
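/// <example>
/// A small worked illustration (constructed here for clarity, not taken from [Haynes2017]):
/// for <c>data = { 1, 2, 3 }</c> and <c>k = 3</c>, the middle quantile value is <c>t = 2</c>,
/// and its row of partial sums grows as <c>{ 0, 2, 3, 3 }</c>:
/// <c>data[0] = 1</c> adds 2 (less than <c>t</c>), <c>data[1] = 2</c> adds 1 (equal to <c>t</c>),
/// <c>data[2] = 3</c> adds 0 (greater than <c>t</c>).
/// </example>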
private static int[,] GetPartialSums(double[] data, int k)
{
int n = data.Length;
var partialSums = new int[k, n + 1];
var sortedData = data.OrderBy(it => it).ToArray();
for (int i = 0; i < k; i++)
{
double z = -1 + (2 * i + 1.0) / k; // Values from (-1+1/k) to (1-1/k) with step = 2/k
double p = 1.0 / (1 + Math.Pow(2 * n - 1, -z)); // Values strictly between 0.0 and 1.0
double t = sortedData[(int) Math.Truncate((n - 1) * p)]; // Quantile value, formula (2.1) in [Haynes2017]
for (int tau = 1; tau <= n; tau++)
{
partialSums[i, tau] = partialSums[i, tau - 1];
if (data[tau - 1] < t)
partialSums[i, tau] += 2; // We use doubled value (2) instead of original 1.0
if (data[tau - 1] == t)
partialSums[i, tau] += 1; // We use doubled value (1) instead of original 0.5
}
}
return partialSums;
}
/// <summary>
/// Calculates the cost of the (tau1; tau2] segment.
/// </summary>
private static double GetSegmentCost(int[,] partialSums, int tau1, int tau2, int k, int n)
{
double sum = 0;
for (int i = 0; i < k; i++)
{
// actualSum is (count(data[j] < t) * 2 + count(data[j] == t) * 1) for j=tau1..tau2-1
int actualSum = partialSums[i, tau2] - partialSums[i, tau1];
// We skip these two cases (which correspond to fit = 0 or fit = 1) because Math.Log would
// return -Infinity there, turning the product into NaN
if (actualSum != 0 && actualSum != (tau2 - tau1) * 2)
{
// Empirical CDF $\hat{F}_i(t)$ (Section 2.1 "Model" in [Haynes2017])
double fit = actualSum * 0.5 / (tau2 - tau1);
// Segment cost $\mathcal{L}_{np}$ (Section 2.2 "Nonparametric maximum likelihood" in [Haynes2017])
double lnp = (tau2 - tau1) * (fit * Math.Log(fit) + (1 - fit) * Math.Log(1 - fit));
sum += lnp;
}
}
double c = -Math.Log(2 * n - 1); // Constant from Lemma 3.1 in [Haynes2017]
return 2.0 * c / k * sum; // See Section 3.1 "Discrete approximation" in [Haynes2017]
}
/// <summary>
/// Returns the index of the minimum element.
/// If there are several minimum elements in the given list, the index of the first one is returned.
/// </summary>
private static int WhichMin(IList<double> values)
{
if (values.Count == 0)
throw new InvalidOperationException("The list should contain at least one element");
double minValue = values[0];
int minIndex = 0;
for (int i = 1; i < values.Count; i++)
if (values[i] < minValue)
{
minValue = values[i];
minIndex = i;
}
return minIndex;
}
}
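
// Below is a minimal, hypothetical usage demo (not part of the original gist): it generates
// three segments drawn from shifted normal distributions via the Box-Muller transform and
// prints the detected changepoint indexes. The class name `EdPeltDemo` is illustrative only.
public static class EdPeltDemo
{
    public static void Main()
    {
        var random = new Random(42);
        // Box-Muller transform: converts two uniform samples into one normal sample
        double Normal(double mean)
        {
            double u1 = 1.0 - random.NextDouble(); // in (0; 1], so Math.Log(u1) is finite
            double u2 = random.NextDouble();
            return mean + Math.Sqrt(-2.0 * Math.Log(u1)) * Math.Sin(2.0 * Math.PI * u2);
        }

        // 300 points: means 0, 5, -5 on the index ranges [0..99], [100..199], [200..299]
        var data = new double[300];
        for (int i = 0; i < data.Length; i++)
            data[i] = Normal(i < 100 ? 0 : i < 200 ? 5 : -5);

        int[] changePoints = EdPeltChangePointDetector.Instance.GetChangePointIndexes(data, minDistance: 10);
        // With such well-separated means, the output should be close to "99, 199"
        Console.WriteLine(string.Join(", ", changePoints));
    }
}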