-
-
Save brentp/089c7d6d69d78d26437f to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*- | |
u""" | |
Beta regression for modeling rates and proportions. | |
References | |
---------- | |
Grün, Bettina, Ioannis Kosmidis, and Achim Zeileis. Extended beta regression | |
in R: Shaken, stirred, mixed, and partitioned. No. 2011-22. Working Papers in | |
Economics and Statistics, 2011. | |
Smithson, Michael, and Jay Verkuilen. "A better lemon squeezer? | |
Maximum-likelihood regression with beta-distributed dependent variables." | |
Psychological methods 11.1 (2006): 54. | |
""" | |
import numpy as np | |
import pandas as pd | |
import statsmodels.api as sm | |
from scipy.special import gammaln as lgamma | |
from statsmodels.base.model import GenericLikelihoodModel | |
from statsmodels.genmod.families import Binomial | |
# this is only need while #2024 is open. | |
class Logit(sm.families.links.Logit): | |
"""Logit tranform that won't overflow with large numbers.""" | |
def inverse(self, z): | |
return 1 / (1. + np.exp(-z)) | |
_init_example = """ | |
Beta regression with default of logit-link for exog and log-link | |
for precision. | |
>>> mod = Beta(endog, exog) | |
>>> rslt = mod.fit() | |
>>> print rslt.summary() | |
We can also specify a formula and a specific structure and use the | |
identity-link for phi. | |
>>> from sm.families.links import identity | |
>>> Z = patsy.dmatrix('~ temp', dat, return_type='dataframe') | |
>>> mod = Beta.from_formula('iyield ~ C(batch, Treatment(10)) + temp', | |
... dat, Z=Z, link_phi=identity()) | |
In the case of proportion-data, we may think that the precision depends on | |
the number of measurements. E.g for sequence data, on the number of | |
sequence reads covering a site: | |
>>> Z = patsy.dmatrix('~ coverage', df) | |
>>> mod = Beta.from_formula('methylation ~ disease + age + gender + coverage', df, Z) | |
>>> rslt = mod.fit() | |
""" | |
class Beta(GenericLikelihoodModel): | |
"""Beta Regression. | |
This implementation uses `phi` as a precision parameter equal to | |
`a + b` from the Beta parameters. | |
""" | |
def __init__(self, endog, exog, Z=None, link=Logit(), | |
link_phi=sm.families.links.Log(), **kwds): | |
""" | |
Parameters | |
---------- | |
endog : array-like | |
1d array of endogenous values (i.e. responses, outcomes, | |
dependent variables, or 'Y' values). | |
exog : array-like | |
2d array of exogeneous values (i.e. covariates, predictors, | |
independent variables, regressors, or 'X' values). A nobs x k | |
array where `nobs` is the number of observations and `k` is | |
the number of regressors. An intercept is not included by | |
default and should be added by the user. See | |
`statsmodels.tools.add_constant`. | |
Z : array-like | |
2d array of variables for the precision phi. | |
link : link | |
Any link in sm.families.links for `exog` | |
link_phi : link | |
Any link in sm.families.links for `Z` | |
Examples | |
-------- | |
{example} | |
See Also | |
-------- | |
:ref:`links` | |
""".format(example=_init_example) | |
assert np.all((0 < endog) & (endog < 1)) | |
if Z is None: | |
extra_names = ['phi'] | |
Z = np.ones((len(endog), 1), dtype='f') | |
else: | |
extra_names = ['precision-%s' % zc for zc in \ | |
(Z.columns if hasattr(Z, 'columns') else range(1, Z.shape[1] + 1))] | |
kwds['extra_params_names'] = extra_names | |
super(Beta, self).__init__(endog, exog, **kwds) | |
self.link = link | |
self.link_phi = link_phi | |
self.Z = Z | |
assert len(self.Z) == len(self.endog) | |
def nloglikeobs(self, params): | |
""" | |
Negative log-likelihood. | |
Parameters | |
---------- | |
params : np.ndarray | |
Parameter estimates | |
""" | |
return -self._ll_br(self.endog, self.exog, self.Z, params) | |
def fit(self, start_params=None, maxiter=100000, maxfun=5000, disp=False, | |
method='bfgs', **kwds): | |
""" | |
Fit the model. | |
Parameters | |
---------- | |
start_params : array-like | |
A vector of starting values for the regression | |
coefficients. If None, a default is chosen. | |
maxiter : integer | |
The maximum number of iterations | |
disp : bool | |
Show convergence stats. | |
method : str | |
The optimization method to use. | |
""" | |
if start_params is None: | |
start_params = sm.GLM(self.endog, self.exog, family=Binomial() | |
).fit(disp=False).params | |
start_params = np.append(start_params, [0.5] * self.Z.shape[1]) | |
return super(Beta, self).fit(start_params=start_params, | |
maxiter=maxiter, maxfun=maxfun, | |
method=method, disp=disp, **kwds) | |
def _ll_br(self, y, X, Z, params): | |
nz = self.Z.shape[1] | |
Xparams = params[:-nz] | |
Zparams = params[-nz:] | |
mu = self.link.inverse(np.dot(X, Xparams)) | |
phi = self.link_phi.inverse(np.dot(Z, Zparams)) | |
# TODO: derive a and b and constrain to > 0? | |
if np.any(phi <= np.finfo(float).eps): return np.array(-np.inf) | |
ll = lgamma(phi) - lgamma(mu * phi) - lgamma((1 - mu) * phi) \ | |
+ (mu * phi - 1) * np.log(y) + (((1 - mu) * phi) - 1) \ | |
* np.log(1 - y) | |
return ll | |
if __name__ == "__main__": | |
import patsy | |
dat = pd.read_table('gasoline.txt') | |
Z = patsy.dmatrix('~ temp', dat, return_type='dataframe') | |
# using other precison params with | |
m = Beta.from_formula('iyield ~ C(batch, Treatment(10)) + temp', dat, | |
Z=Z, link_phi=sm.families.links.identity()) | |
print m.fit().summary() | |
fex = pd.read_csv('foodexpenditure.csv') | |
m = Beta.from_formula(' I(food/income) ~ income + persons', fex) | |
print m.fit().summary() | |
#print GLM.from_formula('iyield ~ C(batch) + temp', dat, family=Binomial()).fit().summary() | |
dev = pd.read_csv('methylation-test.csv') | |
Z = patsy.dmatrix('~ age', dev, return_type='dataframe') | |
m = Beta.from_formula('methylation ~ gender + CpG', dev, | |
Z=Z, | |
link_phi=sm.families.links.identity()) | |
print m.fit().summary() |
library(betareg) | |
data("GasolineYield", package = "betareg") | |
data("FoodExpenditure", package = "betareg") | |
#fe_beta = betareg(I(food/income) ~ income + persons, data = FoodExpenditure) | |
#print(summary(fe_beta)$coefficients) | |
#m = betareg(yield ~ batch + temp, data = GasolineYield) | |
#print(summary(m)) | |
#gy2 <- betareg(yield ~ batch + temp | temp, data = GasolineYield, link.phi="identity") | |
#print(summary(gy2)) | |
meth = read.csv('methylation-test.csv') | |
m = betareg(methylation ~ gender + CpG | age, meth) | |
print(summary(m)) | |
food | income | persons | |
---|---|---|---|
15.998 | 62.476 | 1 | |
16.652 | 82.304 | 5 | |
21.741 | 74.679 | 3 | |
7.431 | 39.151 | 3 | |
10.481 | 64.724 | 5 | |
13.548 | 36.786 | 3 | |
23.256 | 83.052 | 4 | |
17.976 | 86.935 | 1 | |
14.161 | 88.233 | 2 | |
8.825 | 38.695 | 2 | |
14.184 | 73.831 | 7 | |
19.604 | 77.122 | 3 | |
13.728 | 45.519 | 2 | |
21.141 | 82.251 | 2 | |
17.446 | 59.862 | 3 | |
9.629 | 26.563 | 3 | |
14.005 | 61.818 | 2 | |
9.16 | 29.682 | 1 | |
18.831 | 50.825 | 5 | |
7.641 | 71.062 | 4 | |
13.882 | 41.99 | 4 | |
9.67 | 37.324 | 3 | |
21.604 | 86.352 | 5 | |
10.866 | 45.506 | 2 | |
28.98 | 69.929 | 6 | |
10.882 | 61.041 | 2 | |
18.561 | 82.469 | 1 | |
11.629 | 44.208 | 2 | |
18.067 | 49.467 | 5 | |
14.539 | 25.905 | 5 | |
19.192 | 79.178 | 5 | |
25.918 | 75.811 | 3 | |
28.833 | 82.718 | 6 | |
15.869 | 48.311 | 4 | |
14.91 | 42.494 | 5 | |
9.55 | 40.573 | 4 | |
23.066 | 44.872 | 6 | |
14.751 | 27.167 | 7 |
iyield gravity pressure temp10 temp batch | |
0.122 50.8 8.6 190 205 1 | |
0.223 50.8 8.6 190 275 1 | |
0.347 50.8 8.6 190 345 1 | |
0.457 50.8 8.6 190 407 1 | |
0.08 40.8 3.5 210 218 2 | |
0.131 40.8 3.5 210 273 2 | |
0.266 40.8 3.5 210 347 2 | |
0.074 40 6.1 217 212 3 | |
0.182 40 6.1 217 272 3 | |
0.304 40 6.1 217 340 3 | |
0.069 38.4 6.1 220 235 4 | |
0.152 38.4 6.1 220 300 4 | |
0.26 38.4 6.1 220 365 4 | |
0.336 38.4 6.1 220 410 4 | |
0.144 40.3 4.8 231 307 5 | |
0.268 40.3 4.8 231 367 5 | |
0.349 40.3 4.8 231 395 5 | |
0.1 32.2 5.2 236 267 6 | |
0.248 32.2 5.2 236 360 6 | |
0.317 32.2 5.2 236 402 6 | |
0.028 41.3 1.8 267 235 7 | |
0.064 41.3 1.8 267 275 7 | |
0.161 41.3 1.8 267 358 7 | |
0.278 41.3 1.8 267 416 7 | |
0.05 38.1 1.2 274 285 8 | |
0.176 38.1 1.2 274 365 8 | |
0.321 38.1 1.2 274 444 8 | |
0.14 32.2 2.4 284 351 9 | |
0.232 32.2 2.4 284 424 9 | |
0.085 31.8 0.2 316 365 10 | |
0.147 31.8 0.2 316 379 10 | |
0.18 31.8 0.2 316 428 10 |
Unnamed: 0 | gender | age | Basename | ID | id | plate | CpG | methylation | ||
---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | F | 58.231 | 6042324088_R04C01 | age58.231_F | id_0 | 6042324088 | CpG_0 | 0.815 | |
1 | 1 | M | 52.632 | 6042324088_R06C01 | age52.632_M | id_1 | 6042324088 | CpG_0 | 0.803 | |
2 | 2 | M | 64.679 | 6042324088_R01C01 | age64.679_M | id_2 | 6042324088 | CpG_0 | 0.803 | |
3 | 3 | F | 55.299 | 6042324088_R04C02 | age55.299_F | id_3 | 6042324088 | CpG_0 | 0.808 | |
4 | 4 | M | 56.019 | 6042324088_R02C01 | age56.019_M | id_4 | 6042324088 | CpG_0 | 0.8550000000000001 | |
5 | 5 | M | 62.021 | 6042324088_R01C02 | age62.021_M | id_5 | 6042324088 | CpG_0 | 0.8129999999999998 | |
6 | 6 | F | 52.298 | 6042324088_R06C02 | age52.298_F | id_6 | 6042324088 | CpG_0 | 0.816 | |
7 | 7 | F | 39.71 | 6042324088_R03C01 | age39.71_F | id_7 | 6042324088 | CpG_0 | 0.827 | |
8 | 8 | F | 57.492 | 6042324088_R05C02 | age57.492_F | id_8 | 6042324088 | CpG_0 | 0.829 | |
9 | 9 | F | 57.623999999999995 | 6042324088_R05C01 | age57.624_F | id_9 | 6042324088 | CpG_0 | 0.7760000000000001 | |
10 | 10 | F | 40.486999999999995 | 6042324088_R03C02 | age40.487_F | id_10 | 6042324088 | CpG_0 | 0.7859999999999999 | |
11 | 11 | M | 53.662 | 6042324088_R02C02 | age53.662_M | id_11 | 6042324088 | CpG_0 | 0.822 | |
12 | 12 | F | 58.231 | 6042324088_R04C01 | age58.231_F | id_0 | 6042324088 | CpG_1 | 0.891 | |
13 | 13 | M | 52.632 | 6042324088_R06C01 | age52.632_M | id_1 | 6042324088 | CpG_1 | 0.894 | |
14 | 14 | M | 64.679 | 6042324088_R01C01 | age64.679_M | id_2 | 6042324088 | CpG_1 | 0.894 | |
15 | 15 | F | 55.299 | 6042324088_R04C02 | age55.299_F | id_3 | 6042324088 | CpG_1 | 0.869 | |
16 | 16 | M | 56.019 | 6042324088_R02C01 | age56.019_M | id_4 | 6042324088 | CpG_1 | 0.914 | |
17 | 17 | M | 62.021 | 6042324088_R01C02 | age62.021_M | id_5 | 6042324088 | CpG_1 | 0.889 | |
18 | 18 | F | 52.298 | 6042324088_R06C02 | age52.298_F | id_6 | 6042324088 | CpG_1 | 0.8850000000000001 | |
19 | 19 | F | 39.71 | 6042324088_R03C01 | age39.71_F | id_7 | 6042324088 | CpG_1 | 0.898 | |
20 | 20 | F | 57.492 | 6042324088_R05C02 | age57.492_F | id_8 | 6042324088 | CpG_1 | 0.896 | |
21 | 21 | F | 57.623999999999995 | 6042324088_R05C01 | age57.624_F | id_9 | 6042324088 | CpG_1 | 0.86 | |
22 | 22 | F | 40.486999999999995 | 6042324088_R03C02 | age40.487_F | id_10 | 6042324088 | CpG_1 | 0.887 | |
23 | 23 | M | 53.662 | 6042324088_R02C02 | age53.662_M | id_11 | 6042324088 | CpG_1 | 0.8800000000000001 | |
24 | 24 | F | 58.231 | 6042324088_R04C01 | age58.231_F | id_0 | 6042324088 | CpG_2 | 0.936 | |
25 | 25 | M | 52.632 | 6042324088_R06C01 | age52.632_M | id_1 | 6042324088 | CpG_2 | 0.9129999999999999 | |
26 | 26 | M | 64.679 | 6042324088_R01C01 | age64.679_M | id_2 | 6042324088 | CpG_2 | 0.9000000000000001 | |
27 | 27 | F | 55.299 | 6042324088_R04C02 | age55.299_F | id_3 | 6042324088 | CpG_2 | 0.9119999999999999 | |
28 | 28 | M | 56.019 | 6042324088_R02C01 | age56.019_M | id_4 | 6042324088 | CpG_2 | 0.9349999999999999 | |
29 | 29 | M | 62.021 | 6042324088_R01C02 | age62.021_M | id_5 | 6042324088 | CpG_2 | 0.9280000000000002 | |
30 | 30 | F | 52.298 | 6042324088_R06C02 | age52.298_F | id_6 | 6042324088 | CpG_2 | 0.9150000000000001 | |
31 | 31 | F | 39.71 | 6042324088_R03C01 | age39.71_F | id_7 | 6042324088 | CpG_2 | 0.9160000000000001 | |
32 | 32 | F | 57.492 | 6042324088_R05C02 | age57.492_F | id_8 | 6042324088 | CpG_2 | 0.929 | |
33 | 33 | F | 57.623999999999995 | 6042324088_R05C01 | age57.624_F | id_9 | 6042324088 | CpG_2 | 0.92 | |
34 | 34 | F | 40.486999999999995 | 6042324088_R03C02 | age40.487_F | id_10 | 6042324088 | CpG_2 | 0.9160000000000001 | |
35 | 35 | M | 53.662 | 6042324088_R02C02 | age53.662_M | id_11 | 6042324088 | CpG_2 | 0.926 |
# in R: | |
import io | |
import statsmodels.api as sm | |
import pandas as pd | |
import patsy | |
from betareg import Beta | |
import numpy as np | |
# betareg(I(food/income) ~ income + persons, data = FoodExpenditure) | |
_income_estimates_mean = u"""\ | |
varname Estimate StdError zvalue Pr(>|z|) | |
(Intercept) -0.62254806 0.223853539 -2.781051 5.418326e-03 | |
income -0.01229884 0.003035585 -4.051556 5.087819e-05 | |
persons 0.11846210 0.035340667 3.352005 8.022853e-04""" | |
_income_estimates_precision = u"""\ | |
varname Estimate StdError zvalue Pr(>|z|) | |
(phi) 35.60975 8.079598 4.407366 1.046351e-05 | |
""" | |
_methylation_estimates_mean = u"""\ | |
varname Estimate StdError zvalue Pr(>|z|) | |
(Intercept) 1.44224 0.03401 42.404 2e-16 | |
genderM 0.06986 0.04359 1.603 0.109 | |
CpGCpG_1 0.60735 0.04834 12.563 2e-16 | |
CpGCpG_2 0.97355 0.05311 18.331 2e-16""" | |
_methylation_estimates_precision = u"""\ | |
varname Estimate StdError zvalue Pr(>|z|) | |
(Intercept) 8.22829 1.79098 4.594 4.34e-06 | |
age -0.03471 0.03276 -1.059 0.289""" | |
expected_income_mean = pd.read_table(io.StringIO(_income_estimates_mean), sep="\s+") | |
expected_income_precision = pd.read_table(io.StringIO(_income_estimates_precision), sep="\s+") | |
expected_methylation_mean = pd.read_table(io.StringIO(_methylation_estimates_mean), sep="\s+") | |
expected_methylation_precision = pd.read_table(io.StringIO(_methylation_estimates_precision), sep="\s+") | |
income = pd.read_csv('foodexpenditure.csv') | |
methylation = pd.read_csv('methylation-test.csv') | |
def check_same(a, b, eps, name): | |
assert np.allclose(a, b, rtol=0.01, atol=eps), \ | |
("different from expected", name, list(a), list(b)) | |
def test_income_coefficients(): | |
model = "I(food/income) ~ income + persons" | |
mod = Beta.from_formula(model, income) | |
rslt = mod.fit() | |
yield check_same, rslt.params[:-1], expected_income_mean['Estimate'], 1e-3, "estimates" | |
yield check_same, rslt.tvalues[:-1], expected_income_mean['zvalue'], 0.1, "z-scores" | |
yield check_same, rslt.pvalues[:-1], expected_income_mean['Pr(>|z|)'], 1e-3, "p-values" | |
def test_income_precision(): | |
model = "I(food/income) ~ income + persons" | |
mod = Beta.from_formula(model, income) | |
rslt = mod.fit() | |
# note that we have to exp the phi results for now. | |
yield check_same, np.exp(rslt.params[-1:]), expected_income_precision['Estimate'], 1e-3, "estimate" | |
#yield check_same, rslt.tvalues[-1:], expected_income_precision['zvalue'], 0.1, "z-score" | |
yield check_same, rslt.pvalues[-1:], expected_income_precision['Pr(>|z|)'], 1e-3, "p-values" | |
def test_methylation_coefficients(): | |
model = "methylation ~ gender + CpG" | |
Z = patsy.dmatrix("~ age", methylation) | |
mod = Beta.from_formula(model, methylation, Z=Z, link_phi=sm.families.links.identity()) | |
rslt = mod.fit() | |
yield check_same, rslt.params[:-2], expected_methylation_mean['Estimate'], 1e-2, "estimates" | |
yield check_same, rslt.tvalues[:-2], expected_methylation_mean['zvalue'], 0.1, "z-scores" | |
yield check_same, rslt.pvalues[:-2], expected_methylation_mean['Pr(>|z|)'], 1e-2, "p-values" | |
def test_methylation_precision(): | |
model = "methylation ~ gender + CpG" | |
Z = patsy.dmatrix("~ age", methylation) | |
mod = Beta.from_formula(model, methylation, Z=Z, link_phi=sm.families.links.identity()) | |
rslt = mod.fit() | |
#yield check_same, sm.families.links.logit()(rslt.params[-2:]), expected_methylation_precision['Estimate'], 1e-3, "estimate" | |
#yield check_same, rslt.tvalues[-2:], expected_methylation_precision['zvalue'], 0.1, "z-score" | |
@dburnett, I can't explain the hessian error. However, since I've recently been working with beta regression, I figured yourself and others may like more info on it (Keep in mind, I am not an expert on this topic):
-
Coefficient interpretations seems both straight forward for some, and confusing for others. Here's the best resource I have found on this.
-
Some suggest beta problems can be simplified into binomial or logistic problems and handled from there (e.g. unpacking data from percentage values into successes and failures). I see problems with this, but others seem to not. In some situations, data may allow the distribution to be treated as normal. Given the right beta parameters, the distribution can be close to normal and/or transformed to the same. In my short experiences, beta parameters may differ within subsets of the predictor(s), making this process difficult.
-
Beta regression cannot handle zeroes or ones in the outcome variable. Thus, any data containing zeroes for the outcome must be removed, and obviously, imputing a very small value such as 0.000001 can create major issues. If the data contains a lot of zeroes or ones, it may be considered an inflated beta distribution. Here's a link to an article neatly describing this.
-
This code seems to handle precision models oddly on default. Different combinations of variables can be in both the logit and precision models, and will change the results. Working in r with betareg, I have derived far more reasonable estimates for known data I have. I am not 100% sure why the outcome variable is being put into the precision model, separate from the intercept, predicting itself.
Beta regression is a newer topic and most appear to try to circumvent it, so I hope this helps you or anyone else looking.
This is a very useful code. It behooves best if you send a pull request to statsmodels module.
there's a pull request (not sure if it's open or closed now) on the statsmodels repo where someone else did much more work on this.
there's a pull request (not sure if it's open or closed now) on the statsmodels repo where someone else did much more work on this.
@brentp - thank you for your work on this!
I searched open and closed PRs on the statsmodels repo for "beta regression" and couldn't find the PR you mentioned. I only found #2030 and #4238.
Do you remember if code in that PR was substantially more robust than yours? If yes, would you by any chance remember what it was about, so that I can try other search queries?
thank you for your work.
if I had generated data follow beta distribution to crate beta regression mode how do it in R?
can I used the author estmation
As it stands it doesn't seem to work out the box - throws up an error that suggests the hessian inversion fails for the first example:
C:\Python27\lib\site-packages\statsmodels\tools\numdiff.py:159: RuntimeWarning: invalid value encountered in double_scalars
f(*((x-ei,)+args), **kwargs))/(2 * epsilon[k])
C:\Python27\lib\site-packages\statsmodels\base\model.py:473: HessianInversionWarning: Inverting hessian failed, no bse or cov_params available
'available', HessianInversionWarning)
C:\Python27\lib\site-packages\statsmodels\base\model.py:496: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
"Check mle_retvals", ConvergenceWarning)
Traceback (most recent call last):
File ".\betareg.py", line 181, in
print m.fit().summary()
File "C:\Python27\lib\site-packages\statsmodels\base\model.py", line 2095, in summary
use_t=False)
File "C:\Python27\lib\site-packages\statsmodels\iolib\summary.py", line 861, in add_table_params
use_t=use_t)
File "C:\Python27\lib\site-packages\statsmodels\iolib\summary.py", line 445, in summary_params
std_err = results.bse
File "C:\Python27\lib\site-packages\statsmodels\tools\decorators.py", line 97, in get
_cachedval = self.fget(obj)
File "C:\Python27\lib\site-packages\statsmodels\base\model.py", line 1029, in bse
return np.sqrt(np.diag(self.cov_params()))
File "C:\Python27\lib\site-packages\statsmodels\base\model.py", line 1104, in cov_params
raise ValueError('need covariance of parameters for computing '
ValueError: need covariance of parameters for computing (unnormalized) covariances
The other two examples work fine though, the foodexpenditure.csv and the methylation-test.csv inputs.
Perhaps I'm missing a step due to my unfamiliarity with Beta Regression but is there any simple way of understanding the relationship between the coefficient supplied by summary and the raw inputs? Reading around it doesn't seem there is, but given you wrote this I was hoping you might be able to shed some light.