lukegre · May 19, 2017 12:48
diff --git a/time_series_GP_gap_filler.py b/time_series_GP_gap_filler.py
 #!/usr/bin/env python

 """
 A short script that is used to fill gaps in a time series (gap_filler_with_noise). 
 The method uses gaussian processes (aka Kriging) to get the trend (calls gaussian_smoother). 
 I recommend that you play around with the theta0 value to find the correct scale for the trend. 

 Noise is added to the estimated trend by assessing the noise around the trend. 

 Diagnostic plots can be created with this script. 
 """

 import numpy as np
 from matplotlib import pyplot as plt
 from sklearn import gaussian_process as gp
 from scipy.stats import norm
 import pandas as pd


 def main():
  pass


 def gaussian_smoother(arr, theta0=2e2, nugget=0.002, verbose=False):
    """
    Finds a smoother that uses Gaussian Processes to estimate
    the curve. You may need to adjust the nugget and theta0 to
    fit the data to your liking.

    INPUT:  numpy array that may contain nans
            [theta0] is the "smoothing factor". Making this number
                     smaller will make it smoother.
                     More specifically theta0 determines the size
                     of the Gaussian in the covariance matrix.
            [nugget] is the noise to add if exact interpolation
    OUTPUT: The smoothed array with filled gaps
    """

    nans = np.isnan(arr)

    x = np.arange(arr.size)[:, np.newaxis]

    x_train = x[~nans]
    y_train = arr[~nans]

    estim = gp.GaussianProcess(nugget=nugget, theta0=theta0)
    estim.fit(x_train, y_train)

    x_hat = x
    y_hat = estim.predict(x_hat)

    if verbose:
        plt.figure(figsize=[10, 6])
        plt.plot(arr, c='k', lw=1, alpha=0.6, label='input')
        plt.plot(y_hat, c='b', lw=1, label='smoothed')
        plt.legend(loc=0)
        plt.show()

    return y_hat


 def fill_gap_with_noise(arr, theta0=1e2, verbose=True):

    nans = np.isnan(arr)
    y_hat = gaussian_smoother(arr, theta0)

    y_dif = arr - y_hat

    y_lims = np.percentile(y_dif[~nans], [5, 95])
    y_tmp = y_dif[~nans]
    i = (y_tmp < y_lims[0]) | (y_tmp > y_lims[1])
    y_tmp[i] = np.nan
    y_tmp = y_tmp[~np.isnan(y_tmp)]
    a, b = norm.fit(y_tmp)
    noise = np.random.normal(a, b, y_hat.size)

    y_modelled_with_noise = y_hat + noise

    y_filled = arr.copy()
    y_filled[nans] = y_modelled_with_noise[nans]

    if verbose:
        plt.figure(figsize=[12, 12])
        ax1 = plt.subplot2grid((2, 2), (0, 0), colspan=2)
        ax2 = plt.subplot2grid((2, 2), (1, 0))
        ax3 = plt.subplot2grid((2, 2), (1, 1))

        ax1.set_title('Time series overlaid with smoothed '
                      'function and noise added in the gaps')
        x = np.arange(arr.size)
        y = y_filled
        y[~nans] = np.nan
        ax1.plot(x, arr,   c='k', lw=2, alpha=0.4, label='input')
        ax1.plot(x, y_hat, c='k', lw=0.7, label='smoothed')
        ax1.plot(x, y, c='r', lw=1.5, label='filled noise')
        ax1.set_xlim(0, arr.size)
        ax1.legend(loc=0)

        ax2.set_title('Difference between smoothed and input')
        ax2.axhline(0, color='k')
        ax2.plot(y_dif, 'ok', ms=2, alpha=0.5)
        ax2.set_xlim(0, arr.size)

        ax3.set_title('Difference plotted as histogram fitted with Gaussian')
        y, x = np.histogram(y_dif[~nans], bins=50)
        x = np.c_[x[:-1], x[1:]].mean(1)
        ax3.bar(x, y, color='gray', lw=0, alpha=0.6, label='histogram')
        ax3 = ax3.twinx()
        xmin, xmax = ax3.get_xlim()
        x = np.linspace(xmin, xmax, 250)
        y = norm.pdf(x, a, b)
        ax3.plot(x, y, lw=1.5, c='r', label='Gaussian PDF')
        ax3.legend()

        plt.tight_layout()

        plt.show()

    return y_filled


 if __name__ == '__main__':
    main()
	#!/usr/bin/env python

	"""
	A short script that is used to fill gaps in a time series (gap_filler_with_noise).
	The method uses gaussian processes (aka Kriging) to get the trend (calls gaussian_smoother).
	I recommend that you play around with the theta0 value to find the correct scale for the trend.

	Noise is added to the estimated trend by assessing the noise around the trend.

	Diagnostic plots can be created with this script.
	"""

	import numpy as np
	from matplotlib import pyplot as plt
	from sklearn import gaussian_process as gp
	from scipy.stats import norm
	import pandas as pd


	def main():
	pass


	def gaussian_smoother(arr, theta0=2e2, nugget=0.002, verbose=False):
	"""
	Finds a smoother that uses Gaussian Processes to estimate
	the curve. You may need to adjust the nugget and theta0 to
	fit the data to your liking.

	INPUT: numpy array that may contain nans
	[theta0] is the "smoothing factor". Making this number
	smaller will make it smoother.
	More specifically theta0 determines the size
	of the Gaussian in the covariance matrix.
	[nugget] is the noise to add if exact interpolation
	OUTPUT: The smoothed array with filled gaps
	"""

	nans = np.isnan(arr)

	x = np.arange(arr.size)[:, np.newaxis]

	x_train = x[~nans]
	y_train = arr[~nans]

	estim = gp.GaussianProcess(nugget=nugget, theta0=theta0)
	estim.fit(x_train, y_train)

	x_hat = x
	y_hat = estim.predict(x_hat)

	if verbose:
	plt.figure(figsize=[10, 6])
	plt.plot(arr, c='k', lw=1, alpha=0.6, label='input')
	plt.plot(y_hat, c='b', lw=1, label='smoothed')
	plt.legend(loc=0)
	plt.show()

	return y_hat


	def fill_gap_with_noise(arr, theta0=1e2, verbose=True):

	nans = np.isnan(arr)
	y_hat = gaussian_smoother(arr, theta0)

	y_dif = arr - y_hat

	y_lims = np.percentile(y_dif[~nans], [5, 95])
	y_tmp = y_dif[~nans]
	i = (y_tmp < y_lims[0]) \| (y_tmp > y_lims[1])
	y_tmp[i] = np.nan
	y_tmp = y_tmp[~np.isnan(y_tmp)]
	a, b = norm.fit(y_tmp)
	noise = np.random.normal(a, b, y_hat.size)

	y_modelled_with_noise = y_hat + noise

	y_filled = arr.copy()
	y_filled[nans] = y_modelled_with_noise[nans]

	if verbose:
	plt.figure(figsize=[12, 12])
	ax1 = plt.subplot2grid((2, 2), (0, 0), colspan=2)
	ax2 = plt.subplot2grid((2, 2), (1, 0))
	ax3 = plt.subplot2grid((2, 2), (1, 1))

	ax1.set_title('Time series overlaid with smoothed '
	'function and noise added in the gaps')
	x = np.arange(arr.size)
	y = y_filled
	y[~nans] = np.nan
	ax1.plot(x, arr, c='k', lw=2, alpha=0.4, label='input')
	ax1.plot(x, y_hat, c='k', lw=0.7, label='smoothed')
	ax1.plot(x, y, c='r', lw=1.5, label='filled noise')
	ax1.set_xlim(0, arr.size)
	ax1.legend(loc=0)

	ax2.set_title('Difference between smoothed and input')
	ax2.axhline(0, color='k')
	ax2.plot(y_dif, 'ok', ms=2, alpha=0.5)
	ax2.set_xlim(0, arr.size)

	ax3.set_title('Difference plotted as histogram fitted with Gaussian')
	y, x = np.histogram(y_dif[~nans], bins=50)
	x = np.c_[x[:-1], x[1:]].mean(1)
	ax3.bar(x, y, color='gray', lw=0, alpha=0.6, label='histogram')
	ax3 = ax3.twinx()
	xmin, xmax = ax3.get_xlim()
	x = np.linspace(xmin, xmax, 250)
	y = norm.pdf(x, a, b)
	ax3.plot(x, y, lw=1.5, c='r', label='Gaussian PDF')
	ax3.legend()

	plt.tight_layout()

	plt.show()

	return y_filled


	if __name__ == '__main__':
	main()