Created
May 3, 2018 16:34
-
-
Save benman1/f314b85ab4d45c34d55bab7e756d00a5 to your computer and use it in GitHub Desktop.
estimate distribution with scipy
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# found at https://stackoverflow.com/questions/6620471/fitting-empirical-distribution-to-theoretical-ones-with-scipy-python | |
%matplotlib inline | |
import warnings | |
import numpy as np | |
import pandas as pd | |
import scipy.stats as st | |
import statsmodels as sm | |
import matplotlib | |
import matplotlib.pyplot as plt | |
matplotlib.rcParams['figure.figsize'] = (16.0, 12.0) | |
matplotlib.style.use('ggplot') | |
# Create models from data | |
def best_fit_distribution(data, bins=200, ax=None): | |
"""Model data by finding best fit distribution to data""" | |
# Get histogram of original data | |
y, x = np.histogram(data, bins=bins, density=True) | |
x = (x + np.roll(x, -1))[:-1] / 2.0 | |
# Distributions to check | |
DISTRIBUTIONS = [ | |
st.alpha,st.anglit,st.arcsine,st.beta,st.betaprime,st.bradford,st.burr,st.cauchy,st.chi,st.chi2,st.cosine, | |
st.dgamma,st.dweibull,st.erlang,st.expon,st.exponnorm,st.exponweib,st.exponpow,st.f,st.fatiguelife,st.fisk, | |
st.foldcauchy,st.foldnorm,st.frechet_r,st.frechet_l,st.genlogistic,st.genpareto,st.gennorm,st.genexpon, | |
st.genextreme,st.gausshyper,st.gamma,st.gengamma,st.genhalflogistic,st.gilbrat,st.gompertz,st.gumbel_r, | |
st.gumbel_l,st.halfcauchy,st.halflogistic,st.halfnorm,st.halfgennorm,st.hypsecant,st.invgamma,st.invgauss, | |
st.invweibull,st.johnsonsb,st.johnsonsu,st.ksone,st.kstwobign,st.laplace,st.levy,st.levy_l,st.levy_stable, | |
st.logistic,st.loggamma,st.loglaplace,st.lognorm,st.lomax,st.maxwell,st.mielke,st.nakagami,st.ncx2,st.ncf, | |
st.nct,st.norm,st.pareto,st.pearson3,st.powerlaw,st.powerlognorm,st.powernorm,st.rdist,st.reciprocal, | |
st.rayleigh,st.rice,st.recipinvgauss,st.semicircular,st.t,st.triang,st.truncexpon,st.truncnorm,st.tukeylambda, | |
st.uniform,st.vonmises,st.vonmises_line,st.wald,st.weibull_min,st.weibull_max,st.wrapcauchy | |
] | |
# Best holders | |
best_distribution = st.norm | |
best_params = (0.0, 1.0) | |
best_sse = np.inf | |
# Estimate distribution parameters from data | |
for distribution in DISTRIBUTIONS: | |
# Try to fit the distribution | |
try: | |
# Ignore warnings from data that can't be fit | |
with warnings.catch_warnings(): | |
warnings.filterwarnings('ignore') | |
# fit dist to data | |
params = distribution.fit(data) | |
# Separate parts of parameters | |
arg = params[:-2] | |
loc = params[-2] | |
scale = params[-1] | |
# Calculate fitted PDF and error with fit in distribution | |
pdf = distribution.pdf(x, loc=loc, scale=scale, *arg) | |
sse = np.sum(np.power(y - pdf, 2.0)) | |
# if axis pass in add to plot | |
try: | |
if ax: | |
pd.Series(pdf, x).plot(ax=ax) | |
end | |
except Exception: | |
pass | |
# identify if this distribution is better | |
if best_sse > sse > 0: | |
best_distribution = distribution | |
best_params = params | |
best_sse = sse | |
except Exception: | |
pass | |
return (best_distribution.name, best_params) | |
def make_pdf(dist, params, size=10000): | |
"""Generate distributions's Propbability Distribution Function """ | |
# Separate parts of parameters | |
arg = params[:-2] | |
loc = params[-2] | |
scale = params[-1] | |
# Get sane start and end points of distribution | |
start = dist.ppf(0.01, *arg, loc=loc, scale=scale) if arg else dist.ppf(0.01, loc=loc, scale=scale) | |
end = dist.ppf(0.99, *arg, loc=loc, scale=scale) if arg else dist.ppf(0.99, loc=loc, scale=scale) | |
# Build PDF and turn into pandas Series | |
x = np.linspace(start, end, size) | |
y = dist.pdf(x, loc=loc, scale=scale, *arg) | |
pdf = pd.Series(y, x) | |
return pdf | |
# Load data from statsmodels datasets | |
data = pd.Series(sm.datasets.elnino.load_pandas().data.set_index('YEAR').values.ravel()) | |
# Plot for comparison | |
plt.figure(figsize=(12,8)) | |
ax = data.plot(kind='hist', bins=50, normed=True, alpha=0.5, color=plt.rcParams['axes.color_cycle'][1]) | |
# Save plot limits | |
dataYLim = ax.get_ylim() | |
# Find best fit distribution | |
best_fit_name, best_fir_paramms = best_fit_distribution(data, 200, ax) | |
best_dist = getattr(st, best_fit_name) | |
# Update plots | |
ax.set_ylim(dataYLim) | |
ax.set_title(u'El Niño sea temp.\n All Fitted Distributions') | |
ax.set_xlabel(u'Temp (°C)') | |
ax.set_ylabel('Frequency') | |
# Make PDF | |
pdf = make_pdf(best_dist, best_fir_paramms) | |
# Display | |
plt.figure(figsize=(12,8)) | |
ax = pdf.plot(lw=2, label='PDF', legend=True) | |
data.plot(kind='hist', bins=50, normed=True, alpha=0.5, label='Data', legend=True, ax=ax) | |
param_names = (best_dist.shapes + ', loc, scale').split(', ') if best_dist.shapes else ['loc', 'scale'] | |
param_str = ', '.join(['{}={:0.2f}'.format(k,v) for k,v in zip(param_names, best_fir_paramms)]) | |
dist_str = '{}({})'.format(best_fit_name, param_str) | |
ax.set_title(u'El Niño sea temp. with best fit distribution \n' + dist_str) | |
ax.set_xlabel(u'Temp. (°C)') | |
ax.set_ylabel('Frequency') |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment