-
-
Save yoavram/4d16b86d334925639f03da1f172cc4d0 to your computer and use it in GitHub Desktop.
beta regression in statsmodels
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
u""" | |
Beta regression for modeling rates and proportions. | |
References | |
---------- | |
Grün, Bettina, Ioannis Kosmidis, and Achim Zeileis. Extended beta regression | |
in R: Shaken, stirred, mixed, and partitioned. No. 2011-22. Working Papers in | |
Economics and Statistics, 2011. | |
Smithson, Michael, and Jay Verkuilen. "A better lemon squeezer? | |
Maximum-likelihood regression with beta-distributed dependent variables." | |
Psychological methods 11.1 (2006): 54. | |
""" | |
import numpy as np | |
import pandas as pd | |
import statsmodels.api as sm | |
from scipy.special import gammaln as lgamma | |
from statsmodels.base.model import GenericLikelihoodModel | |
from statsmodels.genmod.families import Binomial | |
# this is only need while #2024 is open. | |
class Logit(sm.families.links.Logit): | |
"""Logit tranform that won't overflow with large numbers.""" | |
def inverse(self, z): | |
return 1 / (1. + np.exp(-z)) | |
_init_example = """ | |
Beta regression with default of logit-link for exog and log-link | |
for precision. | |
>>> mod = Beta(endog, exog) | |
>>> rslt = mod.fit() | |
>>> print rslt.summary() | |
We can also specify a formula and a specific structure and use the | |
identity-link for phi. | |
>>> from sm.families.links import identity | |
>>> Z = patsy.dmatrix('~ temp', dat, return_type='dataframe') | |
>>> mod = Beta.from_formula('iyield ~ C(batch, Treatment(10)) + temp', | |
... dat, Z=Z, link_phi=identity()) | |
In the case of proportion-data, we may think that the precision depends on | |
the number of measurements. E.g for sequence data, on the number of | |
sequence reads covering a site: | |
>>> Z = patsy.dmatrix('~ coverage', df) | |
>>> mod = Beta.from_formula('methylation ~ disease + age + gender + coverage', df, Z) | |
>>> rslt = mod.fit() | |
""" | |
class Beta(GenericLikelihoodModel): | |
"""Beta Regression. | |
This implementation uses `phi` as a precision parameter equal to | |
`a + b` from the Beta parameters. | |
""" | |
def __init__(self, endog, exog, Z=None, link=Logit(), | |
link_phi=sm.families.links.Log(), **kwds): | |
""" | |
Parameters | |
---------- | |
endog : array-like | |
1d array of endogenous values (i.e. responses, outcomes, | |
dependent variables, or 'Y' values). | |
exog : array-like | |
2d array of exogeneous values (i.e. covariates, predictors, | |
independent variables, regressors, or 'X' values). A nobs x k | |
array where `nobs` is the number of observations and `k` is | |
the number of regressors. An intercept is not included by | |
default and should be added by the user. See | |
`statsmodels.tools.add_constant`. | |
Z : array-like | |
2d array of variables for the precision phi. | |
link : link | |
Any link in sm.families.links for `exog` | |
link_phi : link | |
Any link in sm.families.links for `Z` | |
Examples | |
-------- | |
{example} | |
See Also | |
-------- | |
:ref:`links` | |
""".format(example=_init_example) | |
assert np.all((0 < endog) & (endog < 1)) | |
if Z is None: | |
extra_names = ['phi'] | |
Z = np.ones((len(endog), 1), dtype='f') | |
else: | |
extra_names = ['precision-%s' % zc for zc in \ | |
(Z.columns if hasattr(Z, 'columns') else range(1, Z.shape[1] + 1))] | |
kwds['extra_params_names'] = extra_names | |
super(Beta, self).__init__(endog, exog, **kwds) | |
self.link = link | |
self.link_phi = link_phi | |
self.Z = Z | |
assert len(self.Z) == len(self.endog) | |
def nloglikeobs(self, params): | |
""" | |
Negative log-likelihood. | |
Parameters | |
---------- | |
params : np.ndarray | |
Parameter estimates | |
""" | |
return -self._ll_br(self.endog, self.exog, self.Z, params) | |
def fit(self, start_params=None, maxiter=100000, maxfun=5000, disp=False, | |
method='bfgs', **kwds): | |
""" | |
Fit the model. | |
Parameters | |
---------- | |
start_params : array-like | |
A vector of starting values for the regression | |
coefficients. If None, a default is chosen. | |
maxiter : integer | |
The maximum number of iterations | |
disp : bool | |
Show convergence stats. | |
method : str | |
The optimization method to use. | |
""" | |
if start_params is None: | |
start_params = sm.GLM(self.endog, self.exog, family=Binomial() | |
).fit(disp=False).params | |
start_params = np.append(start_params, [0.5] * self.Z.shape[1]) | |
return super(Beta, self).fit(start_params=start_params, | |
maxiter=maxiter, maxfun=maxfun, | |
method=method, disp=disp, **kwds) | |
def _ll_br(self, y, X, Z, params): | |
nz = self.Z.shape[1] | |
Xparams = params[:-nz] | |
Zparams = params[-nz:] | |
mu = self.link.inverse(np.dot(X, Xparams)) | |
phi = self.link_phi.inverse(np.dot(Z, Zparams)) | |
# TODO: derive a and b and constrain to > 0? | |
if np.any(phi <= np.finfo(float).eps): return np.array(-np.inf) | |
ll = lgamma(phi) - lgamma(mu * phi) - lgamma((1 - mu) * phi) \ | |
+ (mu * phi - 1) * np.log(y) + (((1 - mu) * phi) - 1) \ | |
* np.log(1 - y) | |
return ll | |
if __name__ == "__main__": | |
import patsy | |
dat = pd.read_table('gasoline.txt') | |
Z = patsy.dmatrix('~ temp', dat, return_type='dataframe') | |
# using other precison params with | |
m = Beta.from_formula('iyield ~ C(batch, Treatment(10)) + temp', dat, | |
Z=Z, link_phi=sm.families.links.identity()) | |
print m.fit().summary() | |
fex = pd.read_csv('foodexpenditure.csv') | |
m = Beta.from_formula(' I(food/income) ~ income + persons', fex) | |
print m.fit().summary() | |
#print GLM.from_formula('iyield ~ C(batch) + temp', dat, family=Binomial()).fit().summary() | |
dev = pd.read_csv('methylation-test.csv') | |
Z = patsy.dmatrix('~ age', dev, return_type='dataframe') | |
m = Beta.from_formula('methylation ~ gender + CpG', dev, | |
Z=Z, | |
link_phi=sm.families.links.identity()) | |
print m.fit().summary() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(betareg) | |
data("GasolineYield", package = "betareg") | |
data("FoodExpenditure", package = "betareg") | |
#fe_beta = betareg(I(food/income) ~ income + persons, data = FoodExpenditure) | |
#print(summary(fe_beta)$coefficients) | |
#m = betareg(yield ~ batch + temp, data = GasolineYield) | |
#print(summary(m)) | |
#gy2 <- betareg(yield ~ batch + temp | temp, data = GasolineYield, link.phi="identity") | |
#print(summary(gy2)) | |
meth = read.csv('methylation-test.csv') | |
m = betareg(methylation ~ gender + CpG | age, meth) | |
print(summary(m)) | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
food | income | persons | |
---|---|---|---|
15.998 | 62.476 | 1 | |
16.652 | 82.304 | 5 | |
21.741 | 74.679 | 3 | |
7.431 | 39.151 | 3 | |
10.481 | 64.724 | 5 | |
13.548 | 36.786 | 3 | |
23.256 | 83.052 | 4 | |
17.976 | 86.935 | 1 | |
14.161 | 88.233 | 2 | |
8.825 | 38.695 | 2 | |
14.184 | 73.831 | 7 | |
19.604 | 77.122 | 3 | |
13.728 | 45.519 | 2 | |
21.141 | 82.251 | 2 | |
17.446 | 59.862 | 3 | |
9.629 | 26.563 | 3 | |
14.005 | 61.818 | 2 | |
9.16 | 29.682 | 1 | |
18.831 | 50.825 | 5 | |
7.641 | 71.062 | 4 | |
13.882 | 41.99 | 4 | |
9.67 | 37.324 | 3 | |
21.604 | 86.352 | 5 | |
10.866 | 45.506 | 2 | |
28.98 | 69.929 | 6 | |
10.882 | 61.041 | 2 | |
18.561 | 82.469 | 1 | |
11.629 | 44.208 | 2 | |
18.067 | 49.467 | 5 | |
14.539 | 25.905 | 5 | |
19.192 | 79.178 | 5 | |
25.918 | 75.811 | 3 | |
28.833 | 82.718 | 6 | |
15.869 | 48.311 | 4 | |
14.91 | 42.494 | 5 | |
9.55 | 40.573 | 4 | |
23.066 | 44.872 | 6 | |
14.751 | 27.167 | 7 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
iyield gravity pressure temp10 temp batch | |
0.122 50.8 8.6 190 205 1 | |
0.223 50.8 8.6 190 275 1 | |
0.347 50.8 8.6 190 345 1 | |
0.457 50.8 8.6 190 407 1 | |
0.08 40.8 3.5 210 218 2 | |
0.131 40.8 3.5 210 273 2 | |
0.266 40.8 3.5 210 347 2 | |
0.074 40 6.1 217 212 3 | |
0.182 40 6.1 217 272 3 | |
0.304 40 6.1 217 340 3 | |
0.069 38.4 6.1 220 235 4 | |
0.152 38.4 6.1 220 300 4 | |
0.26 38.4 6.1 220 365 4 | |
0.336 38.4 6.1 220 410 4 | |
0.144 40.3 4.8 231 307 5 | |
0.268 40.3 4.8 231 367 5 | |
0.349 40.3 4.8 231 395 5 | |
0.1 32.2 5.2 236 267 6 | |
0.248 32.2 5.2 236 360 6 | |
0.317 32.2 5.2 236 402 6 | |
0.028 41.3 1.8 267 235 7 | |
0.064 41.3 1.8 267 275 7 | |
0.161 41.3 1.8 267 358 7 | |
0.278 41.3 1.8 267 416 7 | |
0.05 38.1 1.2 274 285 8 | |
0.176 38.1 1.2 274 365 8 | |
0.321 38.1 1.2 274 444 8 | |
0.14 32.2 2.4 284 351 9 | |
0.232 32.2 2.4 284 424 9 | |
0.085 31.8 0.2 316 365 10 | |
0.147 31.8 0.2 316 379 10 | |
0.18 31.8 0.2 316 428 10 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Unnamed: 0 | gender | age | Basename | ID | id | plate | CpG | methylation | ||
---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | F | 58.231 | 6042324088_R04C01 | age58.231_F | id_0 | 6042324088 | CpG_0 | 0.815 | |
1 | 1 | M | 52.632 | 6042324088_R06C01 | age52.632_M | id_1 | 6042324088 | CpG_0 | 0.803 | |
2 | 2 | M | 64.679 | 6042324088_R01C01 | age64.679_M | id_2 | 6042324088 | CpG_0 | 0.803 | |
3 | 3 | F | 55.299 | 6042324088_R04C02 | age55.299_F | id_3 | 6042324088 | CpG_0 | 0.808 | |
4 | 4 | M | 56.019 | 6042324088_R02C01 | age56.019_M | id_4 | 6042324088 | CpG_0 | 0.8550000000000001 | |
5 | 5 | M | 62.021 | 6042324088_R01C02 | age62.021_M | id_5 | 6042324088 | CpG_0 | 0.8129999999999998 | |
6 | 6 | F | 52.298 | 6042324088_R06C02 | age52.298_F | id_6 | 6042324088 | CpG_0 | 0.816 | |
7 | 7 | F | 39.71 | 6042324088_R03C01 | age39.71_F | id_7 | 6042324088 | CpG_0 | 0.827 | |
8 | 8 | F | 57.492 | 6042324088_R05C02 | age57.492_F | id_8 | 6042324088 | CpG_0 | 0.829 | |
9 | 9 | F | 57.623999999999995 | 6042324088_R05C01 | age57.624_F | id_9 | 6042324088 | CpG_0 | 0.7760000000000001 | |
10 | 10 | F | 40.486999999999995 | 6042324088_R03C02 | age40.487_F | id_10 | 6042324088 | CpG_0 | 0.7859999999999999 | |
11 | 11 | M | 53.662 | 6042324088_R02C02 | age53.662_M | id_11 | 6042324088 | CpG_0 | 0.822 | |
12 | 12 | F | 58.231 | 6042324088_R04C01 | age58.231_F | id_0 | 6042324088 | CpG_1 | 0.891 | |
13 | 13 | M | 52.632 | 6042324088_R06C01 | age52.632_M | id_1 | 6042324088 | CpG_1 | 0.894 | |
14 | 14 | M | 64.679 | 6042324088_R01C01 | age64.679_M | id_2 | 6042324088 | CpG_1 | 0.894 | |
15 | 15 | F | 55.299 | 6042324088_R04C02 | age55.299_F | id_3 | 6042324088 | CpG_1 | 0.869 | |
16 | 16 | M | 56.019 | 6042324088_R02C01 | age56.019_M | id_4 | 6042324088 | CpG_1 | 0.914 | |
17 | 17 | M | 62.021 | 6042324088_R01C02 | age62.021_M | id_5 | 6042324088 | CpG_1 | 0.889 | |
18 | 18 | F | 52.298 | 6042324088_R06C02 | age52.298_F | id_6 | 6042324088 | CpG_1 | 0.8850000000000001 | |
19 | 19 | F | 39.71 | 6042324088_R03C01 | age39.71_F | id_7 | 6042324088 | CpG_1 | 0.898 | |
20 | 20 | F | 57.492 | 6042324088_R05C02 | age57.492_F | id_8 | 6042324088 | CpG_1 | 0.896 | |
21 | 21 | F | 57.623999999999995 | 6042324088_R05C01 | age57.624_F | id_9 | 6042324088 | CpG_1 | 0.86 | |
22 | 22 | F | 40.486999999999995 | 6042324088_R03C02 | age40.487_F | id_10 | 6042324088 | CpG_1 | 0.887 | |
23 | 23 | M | 53.662 | 6042324088_R02C02 | age53.662_M | id_11 | 6042324088 | CpG_1 | 0.8800000000000001 | |
24 | 24 | F | 58.231 | 6042324088_R04C01 | age58.231_F | id_0 | 6042324088 | CpG_2 | 0.936 | |
25 | 25 | M | 52.632 | 6042324088_R06C01 | age52.632_M | id_1 | 6042324088 | CpG_2 | 0.9129999999999999 | |
26 | 26 | M | 64.679 | 6042324088_R01C01 | age64.679_M | id_2 | 6042324088 | CpG_2 | 0.9000000000000001 | |
27 | 27 | F | 55.299 | 6042324088_R04C02 | age55.299_F | id_3 | 6042324088 | CpG_2 | 0.9119999999999999 | |
28 | 28 | M | 56.019 | 6042324088_R02C01 | age56.019_M | id_4 | 6042324088 | CpG_2 | 0.9349999999999999 | |
29 | 29 | M | 62.021 | 6042324088_R01C02 | age62.021_M | id_5 | 6042324088 | CpG_2 | 0.9280000000000002 | |
30 | 30 | F | 52.298 | 6042324088_R06C02 | age52.298_F | id_6 | 6042324088 | CpG_2 | 0.9150000000000001 | |
31 | 31 | F | 39.71 | 6042324088_R03C01 | age39.71_F | id_7 | 6042324088 | CpG_2 | 0.9160000000000001 | |
32 | 32 | F | 57.492 | 6042324088_R05C02 | age57.492_F | id_8 | 6042324088 | CpG_2 | 0.929 | |
33 | 33 | F | 57.623999999999995 | 6042324088_R05C01 | age57.624_F | id_9 | 6042324088 | CpG_2 | 0.92 | |
34 | 34 | F | 40.486999999999995 | 6042324088_R03C02 | age40.487_F | id_10 | 6042324088 | CpG_2 | 0.9160000000000001 | |
35 | 35 | M | 53.662 | 6042324088_R02C02 | age53.662_M | id_11 | 6042324088 | CpG_2 | 0.926 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# in R: | |
import io | |
import statsmodels.api as sm | |
import pandas as pd | |
import patsy | |
from betareg import Beta | |
import numpy as np | |
# betareg(I(food/income) ~ income + persons, data = FoodExpenditure) | |
_income_estimates_mean = u"""\ | |
varname Estimate StdError zvalue Pr(>|z|) | |
(Intercept) -0.62254806 0.223853539 -2.781051 5.418326e-03 | |
income -0.01229884 0.003035585 -4.051556 5.087819e-05 | |
persons 0.11846210 0.035340667 3.352005 8.022853e-04""" | |
_income_estimates_precision = u"""\ | |
varname Estimate StdError zvalue Pr(>|z|) | |
(phi) 35.60975 8.079598 4.407366 1.046351e-05 | |
""" | |
_methylation_estimates_mean = u"""\ | |
varname Estimate StdError zvalue Pr(>|z|) | |
(Intercept) 1.44224 0.03401 42.404 2e-16 | |
genderM 0.06986 0.04359 1.603 0.109 | |
CpGCpG_1 0.60735 0.04834 12.563 2e-16 | |
CpGCpG_2 0.97355 0.05311 18.331 2e-16""" | |
_methylation_estimates_precision = u"""\ | |
varname Estimate StdError zvalue Pr(>|z|) | |
(Intercept) 8.22829 1.79098 4.594 4.34e-06 | |
age -0.03471 0.03276 -1.059 0.289""" | |
expected_income_mean = pd.read_table(io.StringIO(_income_estimates_mean), sep="\s+") | |
expected_income_precision = pd.read_table(io.StringIO(_income_estimates_precision), sep="\s+") | |
expected_methylation_mean = pd.read_table(io.StringIO(_methylation_estimates_mean), sep="\s+") | |
expected_methylation_precision = pd.read_table(io.StringIO(_methylation_estimates_precision), sep="\s+") | |
income = pd.read_csv('foodexpenditure.csv') | |
methylation = pd.read_csv('methylation-test.csv') | |
def check_same(a, b, eps, name): | |
assert np.allclose(a, b, rtol=0.01, atol=eps), \ | |
("different from expected", name, list(a), list(b)) | |
def test_income_coefficients(): | |
model = "I(food/income) ~ income + persons" | |
mod = Beta.from_formula(model, income) | |
rslt = mod.fit() | |
yield check_same, rslt.params[:-1], expected_income_mean['Estimate'], 1e-3, "estimates" | |
yield check_same, rslt.tvalues[:-1], expected_income_mean['zvalue'], 0.1, "z-scores" | |
yield check_same, rslt.pvalues[:-1], expected_income_mean['Pr(>|z|)'], 1e-3, "p-values" | |
def test_income_precision(): | |
model = "I(food/income) ~ income + persons" | |
mod = Beta.from_formula(model, income) | |
rslt = mod.fit() | |
# note that we have to exp the phi results for now. | |
yield check_same, np.exp(rslt.params[-1:]), expected_income_precision['Estimate'], 1e-3, "estimate" | |
#yield check_same, rslt.tvalues[-1:], expected_income_precision['zvalue'], 0.1, "z-score" | |
yield check_same, rslt.pvalues[-1:], expected_income_precision['Pr(>|z|)'], 1e-3, "p-values" | |
def test_methylation_coefficients(): | |
model = "methylation ~ gender + CpG" | |
Z = patsy.dmatrix("~ age", methylation) | |
mod = Beta.from_formula(model, methylation, Z=Z, link_phi=sm.families.links.identity()) | |
rslt = mod.fit() | |
yield check_same, rslt.params[:-2], expected_methylation_mean['Estimate'], 1e-2, "estimates" | |
yield check_same, rslt.tvalues[:-2], expected_methylation_mean['zvalue'], 0.1, "z-scores" | |
yield check_same, rslt.pvalues[:-2], expected_methylation_mean['Pr(>|z|)'], 1e-2, "p-values" | |
def test_methylation_precision(): | |
model = "methylation ~ gender + CpG" | |
Z = patsy.dmatrix("~ age", methylation) | |
mod = Beta.from_formula(model, methylation, Z=Z, link_phi=sm.families.links.identity()) | |
rslt = mod.fit() | |
#yield check_same, sm.families.links.logit()(rslt.params[-2:]), expected_methylation_precision['Estimate'], 1e-3, "estimate" | |
#yield check_same, rslt.tvalues[-2:], expected_methylation_precision['zvalue'], 0.1, "z-score" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment