Skip to content

Instantly share code, notes, and snippets.

@PierreBdR
Last active August 29, 2015 14:05
Show Gist options
  • Save PierreBdR/90e7f6dd72de13dcf9ee to your computer and use it in GitHub Desktop.
Save PierreBdR/90e7f6dd72de13dcf9ee to your computer and use it in GitHub Desktop.
Speed and accuracy test for KDE -- statsmodels vs. pyqt_fit
"""
This module is based on the existing gist of @Padarn here:
https://gist.github.com/Padarn/72d3c0492b7608c4bd7d
"""
from __future__ import division, print_function
import numpy as np
import statsmodels.api as sm
import statsmodels.nonparametric.bandwidths as bandwidths
from statsmodels.sandbox.nonparametric import kernels
from statsmodels.nonparametric.kdetools import (forrt, revrt, silverman_transform, counts)
from statsmodels.nonparametric.linbin import fast_linbin, fast_linbin_weights
import time
from pyqt_fit import kde, kde_methods
from pyqt_fit import __version__ as pyqt_version
pyqt_version = tuple(int(i) for i in pyqt_version.split('.'))
assert pyqt_version >= (1,3,3) # Make sure it's recent enough
kernel_switch = dict(gau=kernels.Gaussian, epa=kernels.Epanechnikov,
uni=kernels.Uniform, tri=kernels.Triangular,
biw=kernels.Biweight, triw=kernels.Triweight,
cos=kernels.Cosine, cos2=kernels.Cosine2)
##### 3 VERSIONS TO TEST ########
## VERSION 1 - newest version
def kdensityfft_v1(X, kernel="gau", bw="normal_reference", weights=None, gridsize=None,
adjust=1, cut=3, retgrid=True):
# Not convinced this is neccessary
X = np.asarray(X)
# Get kernel object corresponding to selection
kern = kernel_switch[kernel]()
# This kernel selection should be moved outside of this function.
# bw should be required as as float to this function.
try:
bw = float(bw)
except:
# will cross-val fit this pattern?
bw = bandwidths.select_bandwidth(X, bw, kern)
bw *= adjust
nobs = float(len(X)) # after trim
# step 1 Make grid and discretize the data
if gridsize is None:
# not convinced this is correct
gridsize = np.max((nobs, 512.))
# round to next power of 2
gridsize = 2 ** np.ceil(np.log2(gridsize))
#print("Grid size = {0}".format(gridsize))
a = np.min(X) - cut * bw
b = np.max(X) + cut * bw
grid, delta = np.linspace(a, b, gridsize, retstep=True)
RANGE = b - a
# Calculate the scaled bin counts with linear-binning
if weights is None:
binned = fast_linbin(X, a, b, gridsize)
# handle weighted observations
else:
# ensure weights is a numpy array
weights = np.asarray(weights)
if len(weights) != len(X):
msg = "The length of the weights must be the same as the given X."
raise ValueError(msg)
q = weights.mean()
weights = weights / q
#weights = np.ones(nobs)
binned = fast_linbin_weights(X, weights, a, b, gridsize)
# step 2 compute weights
M = gridsize
if kern.domain is None:
L = M - 1
else:
tau = kern.domain[1] # assumes support is symmetric.
L = min(np.floor(tau * bw * (M - 1) / RANGE), M - 1)
l = np.arange(0, L + 1)
kappa = kern((b - a) * l / (bw * (M - 1)))
kappa = 1.0 / (nobs * bw) * kappa
# step 3 create padded arrays for fourier transform
P = 2 ** np.ceil(np.log2(gridsize + L))
c = np.concatenate([binned, np.zeros(P - M)])
k = np.concatenate([kappa, np.zeros(P - 2 * L - 1), kappa[::-1][:-1]])
# step 4 convolve using fourier transform
z = np.fft.rfft(c) * np.fft.rfft(k)
f = np.fft.irfft(z)[:M]
if retgrid:
return f, grid, bw
else:
return f, bw
## VERSION 2 - old version
def kdensityfft_v2(X, kernel="gau", bw="scott", weights=None, gridsize=None,
adjust=1, cut=3, retgrid=True):
X = np.asarray(X)
try:
bw = float(bw)
except:
bw = bandwidths.select_bandwidth(X, bw, kernel) # will cross-val fit this pattern?
bw *= adjust
nobs = float(len(X)) # after trim
# 1 Make grid and discretize the data
if gridsize == None:
gridsize = np.max((nobs,512.))
gridsize = 2**np.ceil(np.log2(gridsize)) # round to next power of 2
#print("Grid size = {0}".format(gridsize))
a = np.min(X)-cut*bw
b = np.max(X)+cut*bw
grid,delta = np.linspace(a,b,gridsize,retstep=True)
RANGE = b-a
#TODO: Fix this?
# This is the Silverman binning function, but I believe it's buggy (SS)
# weighting according to Silverman
# count = counts(X,grid)
# binned = np.zeros_like(grid) #xi_{k} in Silverman
# j = 0
# for k in range(int(gridsize-1)):
# if count[k]>0: # there are points of X in the grid here
# Xingrid = X[j:j+count[k]] # get all these points
# # get weights at grid[k],grid[k+1]
# binned[k] += np.sum(grid[k+1]-Xingrid)
# binned[k+1] += np.sum(Xingrid-grid[k])
# j += count[k]
# binned /= (nobs)*delta**2 # normalize binned to sum to 1/delta
#NOTE: THE ABOVE IS WRONG, JUST TRY WITH LINEAR BINNING
##TEMPFIX: If X is dtype=long, this fails - so cast to double.
if X.dtype==int:
X = X.astype(float)
binned = fast_linbin(X,a,b,gridsize)/(delta*nobs)
# step 2 compute FFT of the weights, using Munro (1976) FFT convention
y = forrt(binned)
# step 3 and 4 for optimal bw compute zstar and the density estimate f
# don't have to redo the above if just changing bw, ie., for cross val
#NOTE: silverman_transform is the closed form solution of the FFT of the
#gaussian kernel. Not yet sure how to generalize it.
zstar = silverman_transform(bw, gridsize, RANGE)*y # 3.49 in Silverman
# 3.50 w Gaussian kernel
f = revrt(zstar)
if retgrid:
return f, grid, bw
else:
return f, bw
## VERSION 3 - pyqt_fit
def kdensityfft_v3(X, bw, cyclic=True):
X = np.asarray(X)
k = kde.KDE1D(X, bandwidth=bw)
k.fit()
#k.lower = X.min() - 3*k.bandwidth
#k.upper = X.max() + 3*k.bandwidth
if cyclic:
k.method = kde_methods.cyclic
else:
k.method = kde_methods.reflection
nobs = float(len(k.xdata))
k.fit()
xs, ys = k.grid(nobs, cut=3)
#print("Grid size = {0}".format(len(xs)))
return xs, ys, k.bandwidth
#----------------------------
n = 1e7
X = np.random.randn(n)
kern = kernel_switch["gau"]()
bw = bandwidths.select_bandwidth(X, 'normal_reference', kern)
#------- Loop over 3 times ------
nloop = 3
start_time = time.time()
for i in range(nloop):
y1, x1, bw1 = kdensityfft_v1(X, bw=bw)
v1average = (time.time()-start_time)/nloop
print("n=",n,"average time v1:", v1average, "seconds")
start_time = time.time()
for i in range(nloop):
y2, x2, bw2 = kdensityfft_v2(X, bw=bw)
v2average = (time.time()-start_time)/nloop
print("n=",n,"average time v2:", v2average, "seconds")
start_time = time.time()
for i in range(nloop):
x3, y3, bw3 = kdensityfft_v3(X, bw=bw, cyclic=True)
v3average = (time.time()-start_time)/nloop
print("n=",n,"average time v3 (fft):", v3average, "seconds")
start_time = time.time()
for i in range(nloop):
x4, y4, bw4 = kdensityfft_v3(X, bw=bw, cyclic=False)
v4average = (time.time()-start_time)/nloop
print("n=",n,"average time v4 (dct):", v4average, "seconds")
print('bw1 = {0} -- bw2 = {1} -- bw3 = {2}'.format(bw1, bw2, bw3))
print('x -> [{0} ; {1}]'.format(x2.min(), x2.max()))
iy1 = np.interp(x2, x1, y1)
dy = (iy1 - y2)
print('Absolute error iy1 -- y2 ~ {}'.format(np.dot(dy, dy)))
dy /= abs(y2)
print('Relative error iy1 -- y2 ~ {}\n'.format(np.dot(dy, dy)))
iy3 = np.interp(x2, x3, y3)
dy = (iy3 - y2)
print('Absolute error iy3 -- y2 ~ {}'.format(np.dot(dy, dy)))
dy /= abs(y2)
print('Relative error iy3 -- y2 ~ {}\n'.format(np.dot(dy, dy)))
iy4 = np.interp(x2, x4, y4)
dy = (iy4 - y2)
print('Absolute error iy4 -- y2 ~ {}'.format(np.dot(dy, dy)))
dy /= abs(y2)
print('Relative error iy4 -- y2 ~ {}\n'.format(np.dot(dy, dy)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment