Created
May 19, 2017 12:48
-
-
Save lukegre/82739f81764f0bef393345ef0c92eee7 to your computer and use it in GitHub Desktop.
Fill gaps in a time series using Gaussian Processes with the option of adding realistically scaled noise
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
""" | |
A short script that is used to fill gaps in a time series (gap_filler_with_noise). | |
The method uses gaussian processes (aka Kriging) to get the trend (calls gaussian_smoother). | |
I recommend that you play around with the theta0 value to find the correct scale for the trend. | |
Noise is added to the estimated trend by assessing the noise around the trend. | |
Diagnostic plots can be created with this script. | |
""" | |
import numpy as np | |
from matplotlib import pyplot as plt | |
from sklearn import gaussian_process as gp | |
from scipy.stats import norm | |
import pandas as pd | |
def main(): | |
pass | |
def gaussian_smoother(arr, theta0=2e2, nugget=0.002, verbose=False): | |
""" | |
Finds a smoother that uses Gaussian Processes to estimate | |
the curve. You may need to adjust the nugget and theta0 to | |
fit the data to your liking. | |
INPUT: numpy array that may contain nans | |
[theta0] is the "smoothing factor". Making this number | |
smaller will make it smoother. | |
More specifically theta0 determines the size | |
of the Gaussian in the covariance matrix. | |
[nugget] is the noise to add if exact interpolation | |
OUTPUT: The smoothed array with filled gaps | |
""" | |
nans = np.isnan(arr) | |
x = np.arange(arr.size)[:, np.newaxis] | |
x_train = x[~nans] | |
y_train = arr[~nans] | |
estim = gp.GaussianProcess(nugget=nugget, theta0=theta0) | |
estim.fit(x_train, y_train) | |
x_hat = x | |
y_hat = estim.predict(x_hat) | |
if verbose: | |
plt.figure(figsize=[10, 6]) | |
plt.plot(arr, c='k', lw=1, alpha=0.6, label='input') | |
plt.plot(y_hat, c='b', lw=1, label='smoothed') | |
plt.legend(loc=0) | |
plt.show() | |
return y_hat | |
def fill_gap_with_noise(arr, theta0=1e2, verbose=True): | |
nans = np.isnan(arr) | |
y_hat = gaussian_smoother(arr, theta0) | |
y_dif = arr - y_hat | |
y_lims = np.percentile(y_dif[~nans], [5, 95]) | |
y_tmp = y_dif[~nans] | |
i = (y_tmp < y_lims[0]) | (y_tmp > y_lims[1]) | |
y_tmp[i] = np.nan | |
y_tmp = y_tmp[~np.isnan(y_tmp)] | |
a, b = norm.fit(y_tmp) | |
noise = np.random.normal(a, b, y_hat.size) | |
y_modelled_with_noise = y_hat + noise | |
y_filled = arr.copy() | |
y_filled[nans] = y_modelled_with_noise[nans] | |
if verbose: | |
plt.figure(figsize=[12, 12]) | |
ax1 = plt.subplot2grid((2, 2), (0, 0), colspan=2) | |
ax2 = plt.subplot2grid((2, 2), (1, 0)) | |
ax3 = plt.subplot2grid((2, 2), (1, 1)) | |
ax1.set_title('Time series overlaid with smoothed ' | |
'function and noise added in the gaps') | |
x = np.arange(arr.size) | |
y = y_filled | |
y[~nans] = np.nan | |
ax1.plot(x, arr, c='k', lw=2, alpha=0.4, label='input') | |
ax1.plot(x, y_hat, c='k', lw=0.7, label='smoothed') | |
ax1.plot(x, y, c='r', lw=1.5, label='filled noise') | |
ax1.set_xlim(0, arr.size) | |
ax1.legend(loc=0) | |
ax2.set_title('Difference between smoothed and input') | |
ax2.axhline(0, color='k') | |
ax2.plot(y_dif, 'ok', ms=2, alpha=0.5) | |
ax2.set_xlim(0, arr.size) | |
ax3.set_title('Difference plotted as histogram fitted with Gaussian') | |
y, x = np.histogram(y_dif[~nans], bins=50) | |
x = np.c_[x[:-1], x[1:]].mean(1) | |
ax3.bar(x, y, color='gray', lw=0, alpha=0.6, label='histogram') | |
ax3 = ax3.twinx() | |
xmin, xmax = ax3.get_xlim() | |
x = np.linspace(xmin, xmax, 250) | |
y = norm.pdf(x, a, b) | |
ax3.plot(x, y, lw=1.5, c='r', label='Gaussian PDF') | |
ax3.legend() | |
plt.tight_layout() | |
plt.show() | |
return y_filled | |
if __name__ == '__main__': | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment