brentp · September 17, 2024 14:48 · dburdett · Apr 13, 2018 · saccosj · Jun 16, 2018
diff --git a/betareg.py b/betareg.py
 # -*- coding: utf-8 -*-

 u"""
 Beta regression for modeling rates and proportions.

 References
 ----------
 Grün, Bettina, Ioannis Kosmidis, and Achim Zeileis. Extended beta regression
 in R: Shaken, stirred, mixed, and partitioned. No. 2011-22. Working Papers in
 Economics and Statistics, 2011.

 Smithson, Michael, and Jay Verkuilen. "A better lemon squeezer?
 Maximum-likelihood regression with beta-distributed dependent variables."
 Psychological methods 11.1 (2006): 54.
 """
 import numpy as np
 import pandas as pd
 import statsmodels.api as sm

 from scipy.special import gammaln as lgamma

 from statsmodels.base.model import GenericLikelihoodModel
 from statsmodels.genmod.families import Binomial

 # this is only need while #2024 is open.
 class Logit(sm.families.links.Logit):

    """Logit tranform that won't overflow with large numbers."""

    def inverse(self, z):
        return 1 / (1. + np.exp(-z))

 _init_example = """

    Beta regression with default of logit-link for exog and log-link
    for precision.

    >>> mod = Beta(endog, exog)
    >>> rslt = mod.fit()
    >>> print rslt.summary()

    We can also specify a formula and a specific structure and use the
    identity-link for phi.

    >>> from sm.families.links import identity
    >>> Z = patsy.dmatrix('~ temp', dat, return_type='dataframe')
    >>> mod = Beta.from_formula('iyield ~ C(batch, Treatment(10)) + temp',
    ...                         dat, Z=Z, link_phi=identity())

    In the case of proportion-data, we may think that the precision depends on
    the number of measurements. E.g for sequence data, on the number of
    sequence reads covering a site:

    >>> Z = patsy.dmatrix('~ coverage', df)
    >>> mod = Beta.from_formula('methylation ~ disease + age + gender + coverage', df, Z)
    >>> rslt = mod.fit()

 """

 class Beta(GenericLikelihoodModel):

    """Beta Regression.

    This implementation uses `phi` as a precision parameter equal to
    `a + b` from the Beta parameters.
    """

    def __init__(self, endog, exog, Z=None, link=Logit(),
            link_phi=sm.families.links.Log(), **kwds):
        """
        Parameters
        ----------
        endog : array-like
            1d array of endogenous values (i.e. responses, outcomes,
            dependent variables, or 'Y' values).
        exog : array-like
            2d array of exogeneous values (i.e. covariates, predictors,
            independent variables, regressors, or 'X' values). A nobs x k
            array where `nobs` is the number of observations and `k` is
            the number of regressors. An intercept is not included by
            default and should be added by the user. See
            `statsmodels.tools.add_constant`.
        Z : array-like
            2d array of variables for the precision phi.
        link : link
            Any link in sm.families.links for `exog`
        link_phi : link
            Any link in sm.families.links for `Z`

        Examples
        --------
        {example}

        See Also
        --------
        :ref:`links`

        """.format(example=_init_example)
        assert np.all((0 < endog) & (endog < 1))
        if Z is None:
            extra_names = ['phi']
            Z = np.ones((len(endog), 1), dtype='f')
        else:
            extra_names = ['precision-%s' % zc for zc in \
                        (Z.columns if hasattr(Z, 'columns') else range(1, Z.shape[1] + 1))]
        kwds['extra_params_names'] = extra_names

        super(Beta, self).__init__(endog, exog, **kwds)
        self.link = link
        self.link_phi = link_phi
        
        self.Z = Z
        assert len(self.Z) == len(self.endog)

    def nloglikeobs(self, params):
        """
        Negative log-likelihood.

        Parameters
        ----------

        params : np.ndarray
            Parameter estimates
        """
        return -self._ll_br(self.endog, self.exog, self.Z, params)

    def fit(self, start_params=None, maxiter=100000, maxfun=5000, disp=False,
            method='bfgs', **kwds):
        """
        Fit the model.

        Parameters
        ----------
        start_params : array-like
            A vector of starting values for the regression
            coefficients.  If None, a default is chosen.
        maxiter : integer
            The maximum number of iterations
        disp : bool
            Show convergence stats.
        method : str
            The optimization method to use.
        """

        if start_params is None:
            start_params = sm.GLM(self.endog, self.exog, family=Binomial()
                                 ).fit(disp=False).params
            start_params = np.append(start_params, [0.5] * self.Z.shape[1])

        return super(Beta, self).fit(start_params=start_params,
                                        maxiter=maxiter, maxfun=maxfun,
                                        method=method, disp=disp, **kwds)

    def _ll_br(self, y, X, Z, params):
        nz = self.Z.shape[1]

        Xparams = params[:-nz]
        Zparams = params[-nz:]

        mu = self.link.inverse(np.dot(X, Xparams))
        phi = self.link_phi.inverse(np.dot(Z, Zparams))
        # TODO: derive a and b and constrain to > 0?

        if np.any(phi <= np.finfo(float).eps): return np.array(-np.inf)

        ll = lgamma(phi) - lgamma(mu * phi) - lgamma((1 - mu) * phi) \
                + (mu * phi - 1) * np.log(y) + (((1 - mu) * phi) - 1) \
                * np.log(1 - y)

        return ll

 if __name__ == "__main__":

    import patsy
    dat = pd.read_table('gasoline.txt')
    Z = patsy.dmatrix('~ temp', dat, return_type='dataframe')
    # using other precison params with
    m = Beta.from_formula('iyield ~ C(batch, Treatment(10)) + temp', dat,
            Z=Z, link_phi=sm.families.links.identity())
    print m.fit().summary()

    fex = pd.read_csv('foodexpenditure.csv')
    m = Beta.from_formula(' I(food/income) ~ income + persons', fex)
    print m.fit().summary()
    #print GLM.from_formula('iyield ~ C(batch) + temp', dat, family=Binomial()).fit().summary()

    dev = pd.read_csv('methylation-test.csv')
    Z = patsy.dmatrix('~ age', dev, return_type='dataframe')
    m = Beta.from_formula('methylation ~ gender + CpG', dev,
            Z=Z,
            link_phi=sm.families.links.identity())
    print m.fit().summary()
diff --git a/ex.R b/ex.R
 library(betareg)

 data("GasolineYield", package = "betareg")
 data("FoodExpenditure", package = "betareg")

 #fe_beta = betareg(I(food/income) ~ income + persons, data = FoodExpenditure)
 #print(summary(fe_beta)$coefficients)


 #m = betareg(yield ~ batch + temp, data = GasolineYield)
 #print(summary(m))



 #gy2 <- betareg(yield ~ batch + temp | temp, data = GasolineYield, link.phi="identity")
 #print(summary(gy2))


 meth = read.csv('methylation-test.csv')
 m =  betareg(methylation ~ gender + CpG | age, meth)
 print(summary(m))

diff --git a/foodexpenditure.csv b/foodexpenditure.csv
diff --git a/gasoline.txt b/gasoline.txt
 iyield	gravity	pressure	temp10	temp	batch
 0.122	50.8	8.6	190	205	1
 0.223	50.8	8.6	190	275	1
 0.347	50.8	8.6	190	345	1
 0.457	50.8	8.6	190	407	1
 0.08	40.8	3.5	210	218	2
 0.131	40.8	3.5	210	273	2
 0.266	40.8	3.5	210	347	2
 0.074	40	6.1	217	212	3
 0.182	40	6.1	217	272	3
 0.304	40	6.1	217	340	3
 0.069	38.4	6.1	220	235	4
 0.152	38.4	6.1	220	300	4
 0.26	38.4	6.1	220	365	4
 0.336	38.4	6.1	220	410	4
 0.144	40.3	4.8	231	307	5
 0.268	40.3	4.8	231	367	5
 0.349	40.3	4.8	231	395	5
 0.1	32.2	5.2	236	267	6
 0.248	32.2	5.2	236	360	6
 0.317	32.2	5.2	236	402	6
 0.028	41.3	1.8	267	235	7
 0.064	41.3	1.8	267	275	7
 0.161	41.3	1.8	267	358	7
 0.278	41.3	1.8	267	416	7
 0.05	38.1	1.2	274	285	8
 0.176	38.1	1.2	274	365	8
 0.321	38.1	1.2	274	444	8
 0.14	32.2	2.4	284	351	9
 0.232	32.2	2.4	284	424	9
 0.085	31.8	0.2	316	365	10
 0.147	31.8	0.2	316	379	10
 0.18	31.8	0.2	316	428	10
diff --git a/methylation-test.csv b/methylation-test.csv
diff --git a/test_beta.py b/test_beta.py

 # in R:
 import io
 import statsmodels.api as sm
 import pandas as pd
 import patsy
 from betareg import Beta
 import numpy as np

 # betareg(I(food/income) ~ income + persons, data = FoodExpenditure)
 _income_estimates_mean = u"""\
 varname        Estimate  StdError   zvalue     Pr(>|z|)
 (Intercept) -0.62254806 0.223853539 -2.781051 5.418326e-03
 income      -0.01229884 0.003035585 -4.051556 5.087819e-05
 persons      0.11846210 0.035340667  3.352005 8.022853e-04"""

 _income_estimates_precision = u"""\
 varname  Estimate StdError  zvalue     Pr(>|z|)
 (phi) 35.60975   8.079598 4.407366 1.046351e-05
 """

 _methylation_estimates_mean = u"""\
 varname      Estimate StdError zvalue Pr(>|z|)    
 (Intercept)  1.44224    0.03401  42.404   2e-16
 genderM      0.06986    0.04359   1.603    0.109    
 CpGCpG_1     0.60735    0.04834  12.563   2e-16
 CpGCpG_2     0.97355    0.05311  18.331   2e-16"""

 _methylation_estimates_precision = u"""\
 varname Estimate StdError zvalue Pr(>|z|)    
 (Intercept)  8.22829    1.79098   4.594 4.34e-06
 age         -0.03471    0.03276  -1.059    0.289"""


 expected_income_mean = pd.read_table(io.StringIO(_income_estimates_mean), sep="\s+")
 expected_income_precision = pd.read_table(io.StringIO(_income_estimates_precision), sep="\s+")

 expected_methylation_mean = pd.read_table(io.StringIO(_methylation_estimates_mean), sep="\s+")
 expected_methylation_precision = pd.read_table(io.StringIO(_methylation_estimates_precision), sep="\s+")

 income = pd.read_csv('foodexpenditure.csv')
 methylation = pd.read_csv('methylation-test.csv')

 def check_same(a, b, eps, name):
    assert np.allclose(a, b, rtol=0.01, atol=eps), \
            ("different from expected", name, list(a), list(b))


 def test_income_coefficients():
    model = "I(food/income) ~ income + persons"

    mod = Beta.from_formula(model, income)
    rslt = mod.fit()
    yield check_same, rslt.params[:-1], expected_income_mean['Estimate'], 1e-3, "estimates"
    yield check_same, rslt.tvalues[:-1], expected_income_mean['zvalue'], 0.1, "z-scores"
    yield check_same, rslt.pvalues[:-1], expected_income_mean['Pr(>|z|)'], 1e-3, "p-values"


 def test_income_precision():
    model = "I(food/income) ~ income + persons"

    mod = Beta.from_formula(model, income)
    rslt = mod.fit()
    # note that we have to exp the phi results for now.
    yield check_same, np.exp(rslt.params[-1:]), expected_income_precision['Estimate'], 1e-3, "estimate"
    #yield check_same, rslt.tvalues[-1:], expected_income_precision['zvalue'], 0.1, "z-score"
    yield check_same, rslt.pvalues[-1:], expected_income_precision['Pr(>|z|)'], 1e-3, "p-values"


 def test_methylation_coefficients():
    model = "methylation ~ gender + CpG"
    Z = patsy.dmatrix("~ age", methylation)

    mod = Beta.from_formula(model, methylation, Z=Z, link_phi=sm.families.links.identity())
    rslt = mod.fit()
    yield check_same, rslt.params[:-2], expected_methylation_mean['Estimate'], 1e-2, "estimates"
    yield check_same, rslt.tvalues[:-2], expected_methylation_mean['zvalue'], 0.1, "z-scores"
    yield check_same, rslt.pvalues[:-2], expected_methylation_mean['Pr(>|z|)'], 1e-2, "p-values"

 def test_methylation_precision():
    model = "methylation ~ gender + CpG"
    Z = patsy.dmatrix("~ age", methylation)

    mod = Beta.from_formula(model, methylation, Z=Z, link_phi=sm.families.links.identity())
    rslt = mod.fit()
    #yield check_same, sm.families.links.logit()(rslt.params[-2:]), expected_methylation_precision['Estimate'], 1e-3, "estimate"
    #yield check_same, rslt.tvalues[-2:], expected_methylation_precision['zvalue'], 0.1, "z-score"
	# -- coding: utf-8 --

	u"""
	Beta regression for modeling rates and proportions.

	References
	----------
	Grün, Bettina, Ioannis Kosmidis, and Achim Zeileis. Extended beta regression
	in R: Shaken, stirred, mixed, and partitioned. No. 2011-22. Working Papers in
	Economics and Statistics, 2011.

	Smithson, Michael, and Jay Verkuilen. "A better lemon squeezer?
	Maximum-likelihood regression with beta-distributed dependent variables."
	Psychological methods 11.1 (2006): 54.
	"""
	import numpy as np
	import pandas as pd
	import statsmodels.api as sm

	from scipy.special import gammaln as lgamma

	from statsmodels.base.model import GenericLikelihoodModel
	from statsmodels.genmod.families import Binomial

	# this is only need while #2024 is open.
	class Logit(sm.families.links.Logit):

	"""Logit tranform that won't overflow with large numbers."""

	def inverse(self, z):
	return 1 / (1. + np.exp(-z))

	_init_example = """

	Beta regression with default of logit-link for exog and log-link
	for precision.

	>>> mod = Beta(endog, exog)
	>>> rslt = mod.fit()
	>>> print rslt.summary()

	We can also specify a formula and a specific structure and use the
	identity-link for phi.

	>>> from sm.families.links import identity
	>>> Z = patsy.dmatrix('~ temp', dat, return_type='dataframe')
	>>> mod = Beta.from_formula('iyield ~ C(batch, Treatment(10)) + temp',
	... dat, Z=Z, link_phi=identity())

	In the case of proportion-data, we may think that the precision depends on
	the number of measurements. E.g for sequence data, on the number of
	sequence reads covering a site:

	>>> Z = patsy.dmatrix('~ coverage', df)
	>>> mod = Beta.from_formula('methylation ~ disease + age + gender + coverage', df, Z)
	>>> rslt = mod.fit()

	"""

	class Beta(GenericLikelihoodModel):

	"""Beta Regression.

	This implementation uses `phi` as a precision parameter equal to
	`a + b` from the Beta parameters.
	"""

	def __init__(self, endog, exog, Z=None, link=Logit(),
	link_phi=sm.families.links.Log(), **kwds):
	"""
	Parameters
	----------
	endog : array-like
	1d array of endogenous values (i.e. responses, outcomes,
	dependent variables, or 'Y' values).
	exog : array-like
	2d array of exogeneous values (i.e. covariates, predictors,
	independent variables, regressors, or 'X' values). A nobs x k
	array where `nobs` is the number of observations and `k` is
	the number of regressors. An intercept is not included by
	default and should be added by the user. See
	`statsmodels.tools.add_constant`.
	Z : array-like
	2d array of variables for the precision phi.
	link : link
	Any link in sm.families.links for `exog`
	link_phi : link
	Any link in sm.families.links for `Z`

	Examples
	--------
	{example}

	See Also
	--------
	:ref:`links`

	""".format(example=_init_example)
	assert np.all((0 < endog) & (endog < 1))
	if Z is None:
	extra_names = ['phi']
	Z = np.ones((len(endog), 1), dtype='f')
	else:
	extra_names = ['precision-%s' % zc for zc in \
	(Z.columns if hasattr(Z, 'columns') else range(1, Z.shape[1] + 1))]
	kwds['extra_params_names'] = extra_names

	super(Beta, self).__init__(endog, exog, **kwds)
	self.link = link
	self.link_phi = link_phi

	self.Z = Z
	assert len(self.Z) == len(self.endog)

	def nloglikeobs(self, params):
	"""
	Negative log-likelihood.

	Parameters
	----------

	params : np.ndarray
	Parameter estimates
	"""
	return -self._ll_br(self.endog, self.exog, self.Z, params)

	def fit(self, start_params=None, maxiter=100000, maxfun=5000, disp=False,
	method='bfgs', **kwds):
	"""
	Fit the model.

	Parameters
	----------
	start_params : array-like
	A vector of starting values for the regression
	coefficients. If None, a default is chosen.
	maxiter : integer
	The maximum number of iterations
	disp : bool
	Show convergence stats.
	method : str
	The optimization method to use.
	"""

	if start_params is None:
	start_params = sm.GLM(self.endog, self.exog, family=Binomial()
	).fit(disp=False).params
	start_params = np.append(start_params, [0.5] * self.Z.shape[1])

	return super(Beta, self).fit(start_params=start_params,
	maxiter=maxiter, maxfun=maxfun,
	method=method, disp=disp, **kwds)

	def _ll_br(self, y, X, Z, params):
	nz = self.Z.shape[1]

	Xparams = params[:-nz]
	Zparams = params[-nz:]

	mu = self.link.inverse(np.dot(X, Xparams))
	phi = self.link_phi.inverse(np.dot(Z, Zparams))
	# TODO: derive a and b and constrain to > 0?

	if np.any(phi <= np.finfo(float).eps): return np.array(-np.inf)

	ll = lgamma(phi) - lgamma(mu * phi) - lgamma((1 - mu) * phi) \
	+ (mu * phi - 1) * np.log(y) + (((1 - mu) * phi) - 1) \
	* np.log(1 - y)

	return ll

	if __name__ == "__main__":

	import patsy
	dat = pd.read_table('gasoline.txt')
	Z = patsy.dmatrix('~ temp', dat, return_type='dataframe')
	# using other precison params with
	m = Beta.from_formula('iyield ~ C(batch, Treatment(10)) + temp', dat,
	Z=Z, link_phi=sm.families.links.identity())
	print m.fit().summary()

	fex = pd.read_csv('foodexpenditure.csv')
	m = Beta.from_formula(' I(food/income) ~ income + persons', fex)
	print m.fit().summary()
	#print GLM.from_formula('iyield ~ C(batch) + temp', dat, family=Binomial()).fit().summary()

	dev = pd.read_csv('methylation-test.csv')
	Z = patsy.dmatrix('~ age', dev, return_type='dataframe')
	m = Beta.from_formula('methylation ~ gender + CpG', dev,
	Z=Z,
	link_phi=sm.families.links.identity())
	print m.fit().summary()
	library(betareg)

	data("GasolineYield", package = "betareg")
	data("FoodExpenditure", package = "betareg")

	#fe_beta = betareg(I(food/income) ~ income + persons, data = FoodExpenditure)
	#print(summary(fe_beta)$coefficients)


	#m = betareg(yield ~ batch + temp, data = GasolineYield)
	#print(summary(m))



	#gy2 <- betareg(yield ~ batch + temp \| temp, data = GasolineYield, link.phi="identity")
	#print(summary(gy2))


	meth = read.csv('methylation-test.csv')
	m = betareg(methylation ~ gender + CpG \| age, meth)
	print(summary(m))
food	income	persons
15.998	62.476	1
16.652	82.304	5
21.741	74.679	3
7.431	39.151	3
10.481	64.724	5
13.548	36.786	3
23.256	83.052	4
17.976	86.935	1
14.161	88.233	2
8.825	38.695	2
14.184	73.831	7
19.604	77.122	3
13.728	45.519	2
21.141	82.251	2
17.446	59.862	3
9.629	26.563	3
14.005	61.818	2
9.16	29.682	1
18.831	50.825	5
7.641	71.062	4
13.882	41.99	4
9.67	37.324	3
21.604	86.352	5
10.866	45.506	2
28.98	69.929	6
10.882	61.041	2
18.561	82.469	1
11.629	44.208	2
18.067	49.467	5
14.539	25.905	5
19.192	79.178	5
25.918	75.811	3
28.833	82.718	6
15.869	48.311	4
14.91	42.494	5
9.55	40.573	4
23.066	44.872	6
14.751	27.167	7
	iyield gravity pressure temp10 temp batch
	0.122 50.8 8.6 190 205 1
	0.223 50.8 8.6 190 275 1
	0.347 50.8 8.6 190 345 1
	0.457 50.8 8.6 190 407 1
	0.08 40.8 3.5 210 218 2
	0.131 40.8 3.5 210 273 2
	0.266 40.8 3.5 210 347 2
	0.074 40 6.1 217 212 3
	0.182 40 6.1 217 272 3
	0.304 40 6.1 217 340 3
	0.069 38.4 6.1 220 235 4
	0.152 38.4 6.1 220 300 4
	0.26 38.4 6.1 220 365 4
	0.336 38.4 6.1 220 410 4
	0.144 40.3 4.8 231 307 5
	0.268 40.3 4.8 231 367 5
	0.349 40.3 4.8 231 395 5
	0.1 32.2 5.2 236 267 6
	0.248 32.2 5.2 236 360 6
	0.317 32.2 5.2 236 402 6
	0.028 41.3 1.8 267 235 7
	0.064 41.3 1.8 267 275 7
	0.161 41.3 1.8 267 358 7
	0.278 41.3 1.8 267 416 7
	0.05 38.1 1.2 274 285 8
	0.176 38.1 1.2 274 365 8
	0.321 38.1 1.2 274 444 8
	0.14 32.2 2.4 284 351 9
	0.232 32.2 2.4 284 424 9
	0.085 31.8 0.2 316 365 10
	0.147 31.8 0.2 316 379 10
	0.18 31.8 0.2 316 428 10
	Unnamed: 0	gender	age	Basename	ID	id	plate	CpG	methylation
0	0	F	58.231	6042324088_R04C01	age58.231_F	id_0	6042324088	CpG_0	0.815
1	1	M	52.632	6042324088_R06C01	age52.632_M	id_1	6042324088	CpG_0	0.803
2	2	M	64.679	6042324088_R01C01	age64.679_M	id_2	6042324088	CpG_0	0.803
3	3	F	55.299	6042324088_R04C02	age55.299_F	id_3	6042324088	CpG_0	0.808
4	4	M	56.019	6042324088_R02C01	age56.019_M	id_4	6042324088	CpG_0	0.8550000000000001
5	5	M	62.021	6042324088_R01C02	age62.021_M	id_5	6042324088	CpG_0	0.8129999999999998
6	6	F	52.298	6042324088_R06C02	age52.298_F	id_6	6042324088	CpG_0	0.816
7	7	F	39.71	6042324088_R03C01	age39.71_F	id_7	6042324088	CpG_0	0.827
8	8	F	57.492	6042324088_R05C02	age57.492_F	id_8	6042324088	CpG_0	0.829
9	9	F	57.623999999999995	6042324088_R05C01	age57.624_F	id_9	6042324088	CpG_0	0.7760000000000001
10	10	F	40.486999999999995	6042324088_R03C02	age40.487_F	id_10	6042324088	CpG_0	0.7859999999999999
11	11	M	53.662	6042324088_R02C02	age53.662_M	id_11	6042324088	CpG_0	0.822
12	12	F	58.231	6042324088_R04C01	age58.231_F	id_0	6042324088	CpG_1	0.891
13	13	M	52.632	6042324088_R06C01	age52.632_M	id_1	6042324088	CpG_1	0.894
14	14	M	64.679	6042324088_R01C01	age64.679_M	id_2	6042324088	CpG_1	0.894
15	15	F	55.299	6042324088_R04C02	age55.299_F	id_3	6042324088	CpG_1	0.869
16	16	M	56.019	6042324088_R02C01	age56.019_M	id_4	6042324088	CpG_1	0.914
17	17	M	62.021	6042324088_R01C02	age62.021_M	id_5	6042324088	CpG_1	0.889
18	18	F	52.298	6042324088_R06C02	age52.298_F	id_6	6042324088	CpG_1	0.8850000000000001
19	19	F	39.71	6042324088_R03C01	age39.71_F	id_7	6042324088	CpG_1	0.898
20	20	F	57.492	6042324088_R05C02	age57.492_F	id_8	6042324088	CpG_1	0.896
21	21	F	57.623999999999995	6042324088_R05C01	age57.624_F	id_9	6042324088	CpG_1	0.86
22	22	F	40.486999999999995	6042324088_R03C02	age40.487_F	id_10	6042324088	CpG_1	0.887
23	23	M	53.662	6042324088_R02C02	age53.662_M	id_11	6042324088	CpG_1	0.8800000000000001
24	24	F	58.231	6042324088_R04C01	age58.231_F	id_0	6042324088	CpG_2	0.936
25	25	M	52.632	6042324088_R06C01	age52.632_M	id_1	6042324088	CpG_2	0.9129999999999999
26	26	M	64.679	6042324088_R01C01	age64.679_M	id_2	6042324088	CpG_2	0.9000000000000001
27	27	F	55.299	6042324088_R04C02	age55.299_F	id_3	6042324088	CpG_2	0.9119999999999999
28	28	M	56.019	6042324088_R02C01	age56.019_M	id_4	6042324088	CpG_2	0.9349999999999999
29	29	M	62.021	6042324088_R01C02	age62.021_M	id_5	6042324088	CpG_2	0.9280000000000002
30	30	F	52.298	6042324088_R06C02	age52.298_F	id_6	6042324088	CpG_2	0.9150000000000001
31	31	F	39.71	6042324088_R03C01	age39.71_F	id_7	6042324088	CpG_2	0.9160000000000001
32	32	F	57.492	6042324088_R05C02	age57.492_F	id_8	6042324088	CpG_2	0.929
33	33	F	57.623999999999995	6042324088_R05C01	age57.624_F	id_9	6042324088	CpG_2	0.92
34	34	F	40.486999999999995	6042324088_R03C02	age40.487_F	id_10	6042324088	CpG_2	0.9160000000000001
35	35	M	53.662	6042324088_R02C02	age53.662_M	id_11	6042324088	CpG_2	0.926

	# in R:
	import io
	import statsmodels.api as sm
	import pandas as pd
	import patsy
	from betareg import Beta
	import numpy as np

	# betareg(I(food/income) ~ income + persons, data = FoodExpenditure)
	_income_estimates_mean = u"""\
	varname Estimate StdError zvalue Pr(>\|z\|)
	(Intercept) -0.62254806 0.223853539 -2.781051 5.418326e-03
	income -0.01229884 0.003035585 -4.051556 5.087819e-05
	persons 0.11846210 0.035340667 3.352005 8.022853e-04"""

	_income_estimates_precision = u"""\
	varname Estimate StdError zvalue Pr(>\|z\|)
	(phi) 35.60975 8.079598 4.407366 1.046351e-05
	"""

	_methylation_estimates_mean = u"""\
	varname Estimate StdError zvalue Pr(>\|z\|)
	(Intercept) 1.44224 0.03401 42.404 2e-16
	genderM 0.06986 0.04359 1.603 0.109
	CpGCpG_1 0.60735 0.04834 12.563 2e-16
	CpGCpG_2 0.97355 0.05311 18.331 2e-16"""

	_methylation_estimates_precision = u"""\
	varname Estimate StdError zvalue Pr(>\|z\|)
	(Intercept) 8.22829 1.79098 4.594 4.34e-06
	age -0.03471 0.03276 -1.059 0.289"""


	expected_income_mean = pd.read_table(io.StringIO(_income_estimates_mean), sep="\s+")
	expected_income_precision = pd.read_table(io.StringIO(_income_estimates_precision), sep="\s+")

	expected_methylation_mean = pd.read_table(io.StringIO(_methylation_estimates_mean), sep="\s+")
	expected_methylation_precision = pd.read_table(io.StringIO(_methylation_estimates_precision), sep="\s+")

	income = pd.read_csv('foodexpenditure.csv')
	methylation = pd.read_csv('methylation-test.csv')

	def check_same(a, b, eps, name):
	assert np.allclose(a, b, rtol=0.01, atol=eps), \
	("different from expected", name, list(a), list(b))


	def test_income_coefficients():
	model = "I(food/income) ~ income + persons"

	mod = Beta.from_formula(model, income)
	rslt = mod.fit()
	yield check_same, rslt.params[:-1], expected_income_mean['Estimate'], 1e-3, "estimates"
	yield check_same, rslt.tvalues[:-1], expected_income_mean['zvalue'], 0.1, "z-scores"
	yield check_same, rslt.pvalues[:-1], expected_income_mean['Pr(>\|z\|)'], 1e-3, "p-values"


	def test_income_precision():
	model = "I(food/income) ~ income + persons"

	mod = Beta.from_formula(model, income)
	rslt = mod.fit()
	# note that we have to exp the phi results for now.
	yield check_same, np.exp(rslt.params[-1:]), expected_income_precision['Estimate'], 1e-3, "estimate"
	#yield check_same, rslt.tvalues[-1:], expected_income_precision['zvalue'], 0.1, "z-score"
	yield check_same, rslt.pvalues[-1:], expected_income_precision['Pr(>\|z\|)'], 1e-3, "p-values"


	def test_methylation_coefficients():
	model = "methylation ~ gender + CpG"
	Z = patsy.dmatrix("~ age", methylation)

	mod = Beta.from_formula(model, methylation, Z=Z, link_phi=sm.families.links.identity())
	rslt = mod.fit()
	yield check_same, rslt.params[:-2], expected_methylation_mean['Estimate'], 1e-2, "estimates"
	yield check_same, rslt.tvalues[:-2], expected_methylation_mean['zvalue'], 0.1, "z-scores"
	yield check_same, rslt.pvalues[:-2], expected_methylation_mean['Pr(>\|z\|)'], 1e-2, "p-values"

	def test_methylation_precision():
	model = "methylation ~ gender + CpG"
	Z = patsy.dmatrix("~ age", methylation)

	mod = Beta.from_formula(model, methylation, Z=Z, link_phi=sm.families.links.identity())
	rslt = mod.fit()
	#yield check_same, sm.families.links.logit()(rslt.params[-2:]), expected_methylation_precision['Estimate'], 1e-3, "estimate"
	#yield check_same, rslt.tvalues[-2:], expected_methylation_precision['zvalue'], 0.1, "z-score"