Created
August 26, 2012 23:19
-
-
Save vincentarelbundock/3484274 to your computer and use it in GitHub Desktop.
Statsmodels example: Discrete data models
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "example_discrete" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Discrete Data Models" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"import numpy as np\n", | |
"import statsmodels.api as sm" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 1 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Data\n", | |
"\n", | |
"Load data from Spector and Mazzeo (1980). Examples follow Greene's Econometric Analysis Ch. 21 (5th Edition)." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"spector_data = sm.datasets.spector.load()\n", | |
"spector_data.exog = sm.add_constant(spector_data.exog, prepend=False)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 2 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Inspect the data:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"print spector_data.exog[:5,:]\n", | |
"print spector_data.endog[:5]" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"[[ 2.66 20. 0. 1. ]\n", | |
" [ 2.89 22. 0. 1. ]\n", | |
" [ 3.28 24. 0. 1. ]\n", | |
" [ 2.92 12. 0. 1. ]\n", | |
" [ 4. 21. 0. 1. ]]\n", | |
"[ 0. 0. 0. 0. 1.]\n" | |
] | |
} | |
], | |
"prompt_number": 3 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Linear Probability Model (OLS)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"lpm_mod = sm.OLS(spector_data.endog, spector_data.exog)\n", | |
"lpm_res = lpm_mod.fit()\n", | |
"print 'Parameters: ', lpm_res.params[:-1]" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"Parameters: [ 0.46385168 0.01049512 0.37855479]\n" | |
] | |
} | |
], | |
"prompt_number": 4 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Logit Model" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"logit_mod = sm.Logit(spector_data.endog, spector_data.exog)\n", | |
"logit_res = logit_mod.fit()\n", | |
"print 'Parameters: ', logit_res.params\n", | |
"print 'Marginal effects: ', logit_res.margeff()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"Optimization terminated successfully.\n", | |
" Current function value: 12.889634\n", | |
" Iterations 7\n", | |
"Parameters: [ 2.82611259 0.09515766 2.37868766 -13.02134686]\n", | |
"Marginal effects: [ 0.36258083 0.01220841 0.3051777 ]\n" | |
] | |
} | |
], | |
"prompt_number": 5 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"As in all the discrete data models presented below, we can print a nice summary of results:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"print logit_res.summary()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
" Logit Regression Results \n", | |
"==============================================================================\n", | |
"Dep. Variable: y No. Observations: 32\n", | |
"Model: Logit Df Residuals: 28\n", | |
"Method: MLE Df Model: 3\n", | |
"Date: Sun, 26 Aug 2012 Pseudo R-squ.: 0.3740\n", | |
"Time: 20:48:40 Log-Likelihood: -12.890\n", | |
"converged: True LL-Null: -20.592\n", | |
" LLR p-value: 0.001502\n", | |
"==============================================================================\n", | |
" coef std err z P>|z| [95.0% Conf. Int.]\n", | |
"------------------------------------------------------------------------------\n", | |
"x1 2.8261 1.263 2.238 0.025 0.351 5.301\n", | |
"x2 0.0952 0.142 0.672 0.501 -0.182 0.373\n", | |
"x3 2.3787 1.065 2.234 0.025 0.292 4.465\n", | |
"const -13.0213 4.931 -2.641 0.008 -22.687 -3.356\n", | |
"==============================================================================\n" | |
] | |
} | |
], | |
"prompt_number": 6 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Probit Model " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"probit_mod = sm.Probit(spector_data.endog, spector_data.exog)\n", | |
"probit_res = probit_mod.fit()\n", | |
"print 'Parameters: ', probit_res.params\n", | |
"print 'Marginal effects: ', probit_res.margeff()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"Optimization terminated successfully.\n", | |
" Current function value: 12.818804\n", | |
" Iterations 6\n", | |
"Parameters: [ 1.62581004 0.05172895 1.42633234 -7.45231965]\n", | |
"Marginal effects: [ 0.36078629 0.01147926 0.31651986]\n" | |
] | |
} | |
], | |
"prompt_number": 7 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Multinomial Logit" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Load data from the American National Election Studies:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"anes_data = sm.datasets.anes96.load()\n", | |
"anes_exog = anes_data.exog\n", | |
"anes_exog[:,0] = np.log(anes_exog[:,0] + .1)\n", | |
"anes_exog = np.column_stack((anes_exog[:,0],anes_exog[:,2],anes_exog[:,5:8]))\n", | |
"anes_exog = sm.add_constant(anes_exog, prepend=False)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 8 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Inspect the data:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"print anes_data.exog[:5,:]\n", | |
"print anes_data.endog[:5]" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"[[ -2.30258509 7. 7. 1. 6. 36.\n", | |
" 3. 1. 1. ]\n", | |
" [ 5.24755025 1. 3. 3. 5. 20.\n", | |
" 4. 1. 0. ]\n", | |
" [ 3.43720782 7. 2. 2. 6. 24.\n", | |
" 6. 1. 0. ]\n", | |
" [ 4.4200447 4. 3. 4. 5. 28.\n", | |
" 6. 1. 0. ]\n", | |
" [ 6.46162441 7. 5. 6. 4. 68.\n", | |
" 6. 1. 0. ]]\n", | |
"[ 6. 1. 1. 1. 0.]\n" | |
] | |
} | |
], | |
"prompt_number": 9 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Fit MNL model:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"mlogit_mod = sm.MNLogit(anes_data.endog, anes_exog)\n", | |
"mlogit_res = mlogit_mod.fit()\n", | |
"print mlogit_res.params" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"Optimization terminated successfully.\n", | |
" Current function value: 1461.922747\n", | |
" Iterations 7\n", | |
"[[ -0.01153597 -0.08875065 -0.1059667 -0.0915567 -0.0932846\n", | |
" -0.14088069]\n", | |
" [ 0.29771435 0.39166864 0.57345051 1.27877179 1.34696165\n", | |
" 2.07008014]\n", | |
" [ -0.024945 -0.02289784 -0.01485121 -0.00868135 -0.01790407\n", | |
" -0.00943265]\n", | |
" [ 0.08249144 0.18104276 -0.00715242 0.19982796 0.21693885\n", | |
" 0.3219257 ]\n", | |
" [ 0.00519655 0.04787398 0.05757516 0.08449838 0.08095841\n", | |
" 0.10889408]\n", | |
" [ -0.37340168 -2.25091318 -3.66558353 -7.61384309 -7.06047825\n", | |
" -12.1057509 ]]\n" | |
] | |
} | |
], | |
"prompt_number": 10 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Poisson\n", | |
"\n", | |
"Load the Rand data. Note that this example is similar to Cameron and Trivedi's `Microeconometrics` Table 20.5, but it is slightly different because of minor changes in the data. " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"rand_data = sm.datasets.randhie.load()\n", | |
"rand_exog = rand_data.exog.view(float).reshape(len(rand_data.exog), -1)\n", | |
"rand_exog = sm.add_constant(rand_exog, prepend=False)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 11 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Fit Poisson model: " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"poisson_mod = sm.Poisson(rand_data.endog, rand_exog)\n", | |
"poisson_res = poisson_mod.fit(method=\"newton\")\n", | |
"print poisson_res.summary()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"Optimization terminated successfully.\n", | |
" Current function value: 62419.588564\n", | |
" Iterations 12\n", | |
" Poisson Regression Results \n", | |
"==============================================================================\n", | |
"Dep. Variable: y No. Observations: 20190\n", | |
"Model: Poisson Df Residuals: 20180\n", | |
"Method: MLE Df Model: 9\n", | |
"Date: Sun, 26 Aug 2012 Pseudo R-squ.: 0.06343\n", | |
"Time: 20:48:46 Log-Likelihood: -62420.\n", | |
"converged: True LL-Null: -66647.\n", | |
" LLR p-value: 0.000\n", | |
"==============================================================================\n", | |
" coef std err z P>|z| [95.0% Conf. Int.]\n", | |
"------------------------------------------------------------------------------\n", | |
"x1 -0.0525 0.003 -18.216 0.000 -0.058 -0.047\n", | |
"x2 -0.2471 0.011 -23.272 0.000 -0.268 -0.226\n", | |
"x3 0.0353 0.002 19.302 0.000 0.032 0.039\n", | |
"x4 -0.0346 0.002 -21.439 0.000 -0.038 -0.031\n", | |
"x5 0.2717 0.012 22.200 0.000 0.248 0.296\n", | |
"x6 0.0339 0.001 60.098 0.000 0.033 0.035\n", | |
"x7 -0.0126 0.009 -1.366 0.172 -0.031 0.005\n", | |
"x8 0.0541 0.015 3.531 0.000 0.024 0.084\n", | |
"x9 0.2061 0.026 7.843 0.000 0.155 0.258\n", | |
"const 0.7004 0.011 62.741 0.000 0.678 0.722\n", | |
"==============================================================================" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"\n" | |
] | |
} | |
], | |
"prompt_number": 12 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Alternative solvers\n", | |
"\n", | |
"The default method for fitting discrete data MLE models is Newton-Raphson. You can use other solvers by using the ``method`` argument: " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"mlogit_res = mlogit_mod.fit(method='bfgs', maxiter=100)\n", | |
"print mlogit_res.summary()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"Optimization terminated successfully.\n", | |
" Current function value: 1461.922747\n", | |
" Iterations: 54\n", | |
" Function evaluations: 98\n", | |
" Gradient evaluations: 89\n", | |
" MNLogit Regression Results \n", | |
"==============================================================================\n", | |
"Dep. Variable: y No. Observations: 944\n", | |
"Model: MNLogit Df Residuals: 908\n", | |
"Method: MLE Df Model: 30\n", | |
"Date: Sun, 26 Aug 2012 Pseudo R-squ.: 0.1648\n", | |
"Time: 20:48:54 Log-Likelihood: -1461.9\n", | |
"converged: True LL-Null: -1750.3\n", | |
" LLR p-value: 1.822e-102\n", | |
"==============================================================================\n", | |
" y=1 coef std err z P>|z| [95.0% Conf. Int.]\n", | |
"------------------------------------------------------------------------------\n", | |
"x1 -0.0115 0.034 -0.336 0.736 -0.079 0.056\n", | |
"x2 0.2977 0.094 3.180 0.001 0.114 0.481\n", | |
"x3 -0.0249 0.007 -3.823 0.000 -0.038 -0.012\n", | |
"x4 0.0825 0.074 1.121 0.262 -0.062 0.227\n", | |
"x5 0.0052 0.018 0.295 0.768 -0.029 0.040\n", | |
"const -0.3734 0.630 -0.593 0.553 -1.608 0.861\n", | |
"------------------------------------------------------------------------------\n", | |
" y=2 coef std err z P>|z| [95.0% Conf. Int.]\n", | |
"------------------------------------------------------------------------------\n", | |
"x1 -0.0888 0.039 -2.266 0.023 -0.166 -0.012\n", | |
"x2 0.3917 0.108 3.619 0.000 0.180 0.604\n", | |
"x3 -0.0229 0.008 -2.893 0.004 -0.038 -0.007\n", | |
"x4 0.1810 0.085 2.123 0.034 0.014 0.348\n", | |
"x5 0.0479 0.022 2.149 0.032 0.004 0.092\n", | |
"const -2.2509 0.763 -2.949 0.003 -3.747 -0.755\n", | |
"------------------------------------------------------------------------------\n", | |
" y=3 coef std err z P>|z| [95.0% Conf. Int.]\n", | |
"------------------------------------------------------------------------------\n", | |
"x1 -0.1060 0.057 -1.858 0.063 -0.218 0.006\n", | |
"x2 0.5735 0.159 3.617 0.000 0.263 0.884\n", | |
"x3 -0.0149 0.011 -1.311 0.190 -0.037 0.007\n", | |
"x4 -0.0072 0.126 -0.057 0.955 -0.255 0.240\n", | |
"x5 0.0576 0.034 1.713 0.087 -0.008 0.123\n", | |
"const -3.6656 1.157 -3.169 0.002 -5.932 -1.399\n", | |
"------------------------------------------------------------------------------\n", | |
" y=4 coef std err z P>|z| [95.0% Conf. Int.]\n", | |
"------------------------------------------------------------------------------\n", | |
"x1 -0.0916 0.044 -2.091 0.037 -0.177 -0.006\n", | |
"x2 1.2788 0.129 9.921 0.000 1.026 1.531\n", | |
"x3 -0.0087 0.008 -1.031 0.302 -0.025 0.008\n", | |
"x4 0.1998 0.094 2.123 0.034 0.015 0.384\n", | |
"x5 0.0845 0.026 3.226 0.001 0.033 0.136\n", | |
"const -7.6138 0.958 -7.951 0.000 -9.491 -5.737\n", | |
"------------------------------------------------------------------------------\n", | |
" y=5 coef std err z P>|z| [95.0% Conf. Int.]\n", | |
"------------------------------------------------------------------------------\n", | |
"x1 -0.0933 0.039 -2.371 0.018 -0.170 -0.016\n", | |
"x2 1.3470 0.117 11.494 0.000 1.117 1.577\n", | |
"x3 -0.0179 0.008 -2.352 0.019 -0.033 -0.003\n", | |
"x4 0.2169 0.085 2.552 0.011 0.050 0.384\n", | |
"x5 0.0810 0.023 3.524 0.000 0.036 0.126\n", | |
"const -7.0605 0.844 -8.362 0.000 -8.715 -5.406\n", | |
"------------------------------------------------------------------------------\n", | |
" y=6 coef std err z P>|z| [95.0% Conf. Int.]\n", | |
"------------------------------------------------------------------------------\n", | |
"x1 -0.1409 0.042 -3.343 0.001 -0.223 -0.058\n", | |
"x2 2.0701 0.143 14.435 0.000 1.789 2.351\n", | |
"x3 -0.0094 0.008 -1.160 0.246 -0.025 0.007\n", | |
"x4 0.3219 0.091 3.534 0.000 0.143 0.500\n", | |
"x5 0.1089 0.025 4.304 0.000 0.059 0.158\n", | |
"const -12.1058 1.060 -11.421 0.000 -14.183 -10.028\n", | |
"==============================================================================" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"\n" | |
] | |
} | |
], | |
"prompt_number": 14 | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment