Created
April 1, 2016 04:35
-
-
Save andyfaff/9a7930f3d93a99f478cdadae4d1bb3b2 to your computer and use it in GitHub Desktop.
scipy.stats sampling from arbitrary distribution
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import scipy.interpolate as interpolate | |
from scipy.integrate import simps | |
from scipy.stats import rv_continuous | |
class arbitrary(rv_continuous): | |
def __init__(self, hist, bin_edges, xtol=1e-14, seed=None): | |
_hist = np.array(hist, float) | |
_bin_edges = np.asfarray(bin_edges) | |
if _bin_edges.size - 1 != hist.size: | |
raise ValueError("For a histogram the bins_edges array must be 1" | |
" larger than data.") | |
# need to normalise to density | |
# perhaps should use different integration method. If there are | |
# only a couple of samples then simple rectangular integration | |
# may be more applicable, etc. | |
bin_centres = (_bin_edges[1:] + _bin_edges[:-1]) * 0.5 | |
total_area = simps(_hist, bin_centres) | |
_hist /= total_area | |
self._hist, self._bin_edges = _hist, _bin_edges | |
cum_values = np.zeros(bin_edges.shape) | |
cum_values[1:] = np.cumsum(_hist * np.diff(_bin_edges)) | |
self._cum_values = cum_values | |
self._inv_cdf = interpolate.interp1d(cum_values, _bin_edges) | |
super(arbitrary, self).__init__(a=self._bin_edges[0], | |
b=self._bin_edges[-1], | |
xtol=xtol, seed=seed) | |
def _parse_args_rvs(*args, **kwds): | |
return args, 0, 1, kwds['size'] | |
def _parse_args(*args, **kwds): | |
return args, 0, 1 | |
def _parse_args_stats(*args, **kwds): | |
return args, 0, 1, kwds['moments'] | |
ns = {'_parse_args_rvs': _parse_args_rvs, | |
'_parse_args': _parse_args, | |
'_parse_args_stats': _parse_args_stats} | |
for name, func in ns.items(): | |
setattr(self, name, func) | |
def _pdf(self, x): | |
# find out where x is in _bin_edges | |
loc = np.searchsorted(self._bin_edges, x) | |
if loc == 0 or loc == self._bin_edges.size: | |
# x wasn't in the support of the data | |
return 0 | |
else: | |
# x is somewhere in the histogram | |
return self._hist[loc - 1] | |
def _cdf(self, x): | |
return self._cum_values(x) | |
def _ppf(self, x): | |
return self._inv_cdf(x) | |
def fit(self, data, *args, **kwds): | |
raise NotImplementedError("fit method not supported for arbitrary" | |
" distribution") | |
def fit_loc_scale(self, data, *args): | |
raise NotImplementedError("fit_loc_scale not supported for arbitrary" | |
" distribution") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
A couple of quick comments on technicalities: I think you don't need to redo constructing
_parse...
functions, it'll happen automatically in the superclass constructor. The in-bounds check is also done in the superclass (RV_continuous.PDF method) as long as you have a and b attributes defined.W.r.t. interpolation, are you certain you want linear interpolation and not e.g. monotone splines (Pchip or akima)?