Skip to content

Instantly share code, notes, and snippets.

@lukegre
Created May 19, 2017 12:48
Show Gist options
  • Save lukegre/82739f81764f0bef393345ef0c92eee7 to your computer and use it in GitHub Desktop.
Save lukegre/82739f81764f0bef393345ef0c92eee7 to your computer and use it in GitHub Desktop.
Fill gaps in a time series using Gaussian Processes with the option of adding realistically scaled noise
#!/usr/bin/env python
"""
A short script that is used to fill gaps in a time series (gap_filler_with_noise).
The method uses gaussian processes (aka Kriging) to get the trend (calls gaussian_smoother).
I recommend that you play around with the theta0 value to find the correct scale for the trend.
Noise is added to the estimated trend by assessing the noise around the trend.
Diagnostic plots can be created with this script.
"""
import numpy as np
from matplotlib import pyplot as plt
from sklearn import gaussian_process as gp
from scipy.stats import norm
import pandas as pd
def main():
pass
def gaussian_smoother(arr, theta0=2e2, nugget=0.002, verbose=False):
"""
Finds a smoother that uses Gaussian Processes to estimate
the curve. You may need to adjust the nugget and theta0 to
fit the data to your liking.
INPUT: numpy array that may contain nans
[theta0] is the "smoothing factor". Making this number
smaller will make it smoother.
More specifically theta0 determines the size
of the Gaussian in the covariance matrix.
[nugget] is the noise to add if exact interpolation
OUTPUT: The smoothed array with filled gaps
"""
nans = np.isnan(arr)
x = np.arange(arr.size)[:, np.newaxis]
x_train = x[~nans]
y_train = arr[~nans]
estim = gp.GaussianProcess(nugget=nugget, theta0=theta0)
estim.fit(x_train, y_train)
x_hat = x
y_hat = estim.predict(x_hat)
if verbose:
plt.figure(figsize=[10, 6])
plt.plot(arr, c='k', lw=1, alpha=0.6, label='input')
plt.plot(y_hat, c='b', lw=1, label='smoothed')
plt.legend(loc=0)
plt.show()
return y_hat
def fill_gap_with_noise(arr, theta0=1e2, verbose=True):
nans = np.isnan(arr)
y_hat = gaussian_smoother(arr, theta0)
y_dif = arr - y_hat
y_lims = np.percentile(y_dif[~nans], [5, 95])
y_tmp = y_dif[~nans]
i = (y_tmp < y_lims[0]) | (y_tmp > y_lims[1])
y_tmp[i] = np.nan
y_tmp = y_tmp[~np.isnan(y_tmp)]
a, b = norm.fit(y_tmp)
noise = np.random.normal(a, b, y_hat.size)
y_modelled_with_noise = y_hat + noise
y_filled = arr.copy()
y_filled[nans] = y_modelled_with_noise[nans]
if verbose:
plt.figure(figsize=[12, 12])
ax1 = plt.subplot2grid((2, 2), (0, 0), colspan=2)
ax2 = plt.subplot2grid((2, 2), (1, 0))
ax3 = plt.subplot2grid((2, 2), (1, 1))
ax1.set_title('Time series overlaid with smoothed '
'function and noise added in the gaps')
x = np.arange(arr.size)
y = y_filled
y[~nans] = np.nan
ax1.plot(x, arr, c='k', lw=2, alpha=0.4, label='input')
ax1.plot(x, y_hat, c='k', lw=0.7, label='smoothed')
ax1.plot(x, y, c='r', lw=1.5, label='filled noise')
ax1.set_xlim(0, arr.size)
ax1.legend(loc=0)
ax2.set_title('Difference between smoothed and input')
ax2.axhline(0, color='k')
ax2.plot(y_dif, 'ok', ms=2, alpha=0.5)
ax2.set_xlim(0, arr.size)
ax3.set_title('Difference plotted as histogram fitted with Gaussian')
y, x = np.histogram(y_dif[~nans], bins=50)
x = np.c_[x[:-1], x[1:]].mean(1)
ax3.bar(x, y, color='gray', lw=0, alpha=0.6, label='histogram')
ax3 = ax3.twinx()
xmin, xmax = ax3.get_xlim()
x = np.linspace(xmin, xmax, 250)
y = norm.pdf(x, a, b)
ax3.plot(x, y, lw=1.5, c='r', label='Gaussian PDF')
ax3.legend()
plt.tight_layout()
plt.show()
return y_filled
if __name__ == '__main__':
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment