Last active
August 29, 2015 14:05
-
-
Save PierreBdR/90e7f6dd72de13dcf9ee to your computer and use it in GitHub Desktop.
Speed and accuracy test for KDE -- statsmodels vs. pyqt_fit
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
This module is based on the existing gist of @Padarn here: | |
https://gist.github.com/Padarn/72d3c0492b7608c4bd7d | |
""" | |
from __future__ import division, print_function | |
import numpy as np | |
import statsmodels.api as sm | |
import statsmodels.nonparametric.bandwidths as bandwidths | |
from statsmodels.sandbox.nonparametric import kernels | |
from statsmodels.nonparametric.kdetools import (forrt, revrt, silverman_transform, counts) | |
from statsmodels.nonparametric.linbin import fast_linbin, fast_linbin_weights | |
import time | |
from pyqt_fit import kde, kde_methods | |
from pyqt_fit import __version__ as pyqt_version | |
pyqt_version = tuple(int(i) for i in pyqt_version.split('.')) | |
assert pyqt_version >= (1,3,3) # Make sure it's recent enough | |
kernel_switch = dict(gau=kernels.Gaussian, epa=kernels.Epanechnikov, | |
uni=kernels.Uniform, tri=kernels.Triangular, | |
biw=kernels.Biweight, triw=kernels.Triweight, | |
cos=kernels.Cosine, cos2=kernels.Cosine2) | |
##### 3 VERSIONS TO TEST ######## | |
## VERSION 1 - newest version | |
def kdensityfft_v1(X, kernel="gau", bw="normal_reference", weights=None, gridsize=None, | |
adjust=1, cut=3, retgrid=True): | |
# Not convinced this is neccessary | |
X = np.asarray(X) | |
# Get kernel object corresponding to selection | |
kern = kernel_switch[kernel]() | |
# This kernel selection should be moved outside of this function. | |
# bw should be required as as float to this function. | |
try: | |
bw = float(bw) | |
except: | |
# will cross-val fit this pattern? | |
bw = bandwidths.select_bandwidth(X, bw, kern) | |
bw *= adjust | |
nobs = float(len(X)) # after trim | |
# step 1 Make grid and discretize the data | |
if gridsize is None: | |
# not convinced this is correct | |
gridsize = np.max((nobs, 512.)) | |
# round to next power of 2 | |
gridsize = 2 ** np.ceil(np.log2(gridsize)) | |
#print("Grid size = {0}".format(gridsize)) | |
a = np.min(X) - cut * bw | |
b = np.max(X) + cut * bw | |
grid, delta = np.linspace(a, b, gridsize, retstep=True) | |
RANGE = b - a | |
# Calculate the scaled bin counts with linear-binning | |
if weights is None: | |
binned = fast_linbin(X, a, b, gridsize) | |
# handle weighted observations | |
else: | |
# ensure weights is a numpy array | |
weights = np.asarray(weights) | |
if len(weights) != len(X): | |
msg = "The length of the weights must be the same as the given X." | |
raise ValueError(msg) | |
q = weights.mean() | |
weights = weights / q | |
#weights = np.ones(nobs) | |
binned = fast_linbin_weights(X, weights, a, b, gridsize) | |
# step 2 compute weights | |
M = gridsize | |
if kern.domain is None: | |
L = M - 1 | |
else: | |
tau = kern.domain[1] # assumes support is symmetric. | |
L = min(np.floor(tau * bw * (M - 1) / RANGE), M - 1) | |
l = np.arange(0, L + 1) | |
kappa = kern((b - a) * l / (bw * (M - 1))) | |
kappa = 1.0 / (nobs * bw) * kappa | |
# step 3 create padded arrays for fourier transform | |
P = 2 ** np.ceil(np.log2(gridsize + L)) | |
c = np.concatenate([binned, np.zeros(P - M)]) | |
k = np.concatenate([kappa, np.zeros(P - 2 * L - 1), kappa[::-1][:-1]]) | |
# step 4 convolve using fourier transform | |
z = np.fft.rfft(c) * np.fft.rfft(k) | |
f = np.fft.irfft(z)[:M] | |
if retgrid: | |
return f, grid, bw | |
else: | |
return f, bw | |
## VERSION 2 - old version | |
def kdensityfft_v2(X, kernel="gau", bw="scott", weights=None, gridsize=None, | |
adjust=1, cut=3, retgrid=True): | |
X = np.asarray(X) | |
try: | |
bw = float(bw) | |
except: | |
bw = bandwidths.select_bandwidth(X, bw, kernel) # will cross-val fit this pattern? | |
bw *= adjust | |
nobs = float(len(X)) # after trim | |
# 1 Make grid and discretize the data | |
if gridsize == None: | |
gridsize = np.max((nobs,512.)) | |
gridsize = 2**np.ceil(np.log2(gridsize)) # round to next power of 2 | |
#print("Grid size = {0}".format(gridsize)) | |
a = np.min(X)-cut*bw | |
b = np.max(X)+cut*bw | |
grid,delta = np.linspace(a,b,gridsize,retstep=True) | |
RANGE = b-a | |
#TODO: Fix this? | |
# This is the Silverman binning function, but I believe it's buggy (SS) | |
# weighting according to Silverman | |
# count = counts(X,grid) | |
# binned = np.zeros_like(grid) #xi_{k} in Silverman | |
# j = 0 | |
# for k in range(int(gridsize-1)): | |
# if count[k]>0: # there are points of X in the grid here | |
# Xingrid = X[j:j+count[k]] # get all these points | |
# # get weights at grid[k],grid[k+1] | |
# binned[k] += np.sum(grid[k+1]-Xingrid) | |
# binned[k+1] += np.sum(Xingrid-grid[k]) | |
# j += count[k] | |
# binned /= (nobs)*delta**2 # normalize binned to sum to 1/delta | |
#NOTE: THE ABOVE IS WRONG, JUST TRY WITH LINEAR BINNING | |
##TEMPFIX: If X is dtype=long, this fails - so cast to double. | |
if X.dtype==int: | |
X = X.astype(float) | |
binned = fast_linbin(X,a,b,gridsize)/(delta*nobs) | |
# step 2 compute FFT of the weights, using Munro (1976) FFT convention | |
y = forrt(binned) | |
# step 3 and 4 for optimal bw compute zstar and the density estimate f | |
# don't have to redo the above if just changing bw, ie., for cross val | |
#NOTE: silverman_transform is the closed form solution of the FFT of the | |
#gaussian kernel. Not yet sure how to generalize it. | |
zstar = silverman_transform(bw, gridsize, RANGE)*y # 3.49 in Silverman | |
# 3.50 w Gaussian kernel | |
f = revrt(zstar) | |
if retgrid: | |
return f, grid, bw | |
else: | |
return f, bw | |
## VERSION 3 - pyqt_fit | |
def kdensityfft_v3(X, bw, cyclic=True): | |
X = np.asarray(X) | |
k = kde.KDE1D(X, bandwidth=bw) | |
k.fit() | |
#k.lower = X.min() - 3*k.bandwidth | |
#k.upper = X.max() + 3*k.bandwidth | |
if cyclic: | |
k.method = kde_methods.cyclic | |
else: | |
k.method = kde_methods.reflection | |
nobs = float(len(k.xdata)) | |
k.fit() | |
xs, ys = k.grid(nobs, cut=3) | |
#print("Grid size = {0}".format(len(xs))) | |
return xs, ys, k.bandwidth | |
#---------------------------- | |
n = 1e7 | |
X = np.random.randn(n) | |
kern = kernel_switch["gau"]() | |
bw = bandwidths.select_bandwidth(X, 'normal_reference', kern) | |
#------- Loop over 3 times ------ | |
nloop = 3 | |
start_time = time.time() | |
for i in range(nloop): | |
y1, x1, bw1 = kdensityfft_v1(X, bw=bw) | |
v1average = (time.time()-start_time)/nloop | |
print("n=",n,"average time v1:", v1average, "seconds") | |
start_time = time.time() | |
for i in range(nloop): | |
y2, x2, bw2 = kdensityfft_v2(X, bw=bw) | |
v2average = (time.time()-start_time)/nloop | |
print("n=",n,"average time v2:", v2average, "seconds") | |
start_time = time.time() | |
for i in range(nloop): | |
x3, y3, bw3 = kdensityfft_v3(X, bw=bw, cyclic=True) | |
v3average = (time.time()-start_time)/nloop | |
print("n=",n,"average time v3 (fft):", v3average, "seconds") | |
start_time = time.time() | |
for i in range(nloop): | |
x4, y4, bw4 = kdensityfft_v3(X, bw=bw, cyclic=False) | |
v4average = (time.time()-start_time)/nloop | |
print("n=",n,"average time v4 (dct):", v4average, "seconds") | |
print('bw1 = {0} -- bw2 = {1} -- bw3 = {2}'.format(bw1, bw2, bw3)) | |
print('x -> [{0} ; {1}]'.format(x2.min(), x2.max())) | |
iy1 = np.interp(x2, x1, y1) | |
dy = (iy1 - y2) | |
print('Absolute error iy1 -- y2 ~ {}'.format(np.dot(dy, dy))) | |
dy /= abs(y2) | |
print('Relative error iy1 -- y2 ~ {}\n'.format(np.dot(dy, dy))) | |
iy3 = np.interp(x2, x3, y3) | |
dy = (iy3 - y2) | |
print('Absolute error iy3 -- y2 ~ {}'.format(np.dot(dy, dy))) | |
dy /= abs(y2) | |
print('Relative error iy3 -- y2 ~ {}\n'.format(np.dot(dy, dy))) | |
iy4 = np.interp(x2, x4, y4) | |
dy = (iy4 - y2) | |
print('Absolute error iy4 -- y2 ~ {}'.format(np.dot(dy, dy))) | |
dy /= abs(y2) | |
print('Relative error iy4 -- y2 ~ {}\n'.format(np.dot(dy, dy))) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment