Skip to content

Instantly share code, notes, and snippets.

@grisaitis
Created October 12, 2019 18:44
Show Gist options
  • Save grisaitis/cf481034bb413a14d3ea851dab201d31 to your computer and use it in GitHub Desktop.
Save grisaitis/cf481034bb413a14d3ea851dab201d31 to your computer and use it in GitHub Desktop.
stackoverflow.com/questions/22381497
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"from sklearn import linear_model\n",
"randn = np.random.randn\n",
"\n",
"X = pd.DataFrame(randn(10,3), columns=['X1','X2','X3'])\n",
"y = pd.DataFrame(randn(10,1), columns=['Y']) \n",
"\n",
"model = linear_model.LinearRegression()\n",
"model.fit(X=X, y=y)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## show scikit-learn results"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"from scikit-learn\n",
"[-0.28671532]\n",
"[[ 0.17501115 -0.6928708 0.22336584]]\n"
]
}
],
"source": [
"print(\"from scikit-learn\")\n",
"print(model.intercept_)\n",
"print(model.coef_)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## reproduce scikit-learn results with linear algebra"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"N = len(X)\n",
"p = len(X.columns) + 1 # plus one because LinearRegression adds an intercept term"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"X_with_intercept = np.empty(shape=(N, p), dtype=np.float)\n",
"X_with_intercept[:, 0] = 1\n",
"X_with_intercept[:, 1:p] = X.values"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[-0.28671532]\n",
" [ 0.17501115]\n",
" [-0.6928708 ]\n",
" [ 0.22336584]]\n"
]
}
],
"source": [
"beta_hat = np.linalg.inv(X_with_intercept.T @ X_with_intercept) @ X_with_intercept.T @ y.values\n",
"print(beta_hat)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## compute standard errors of the parameter estimates"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"SE(beta_hat[0]): 0.2468580488280805\n",
"SE(beta_hat[1]): 0.2965501221823944\n",
"SE(beta_hat[2]): 0.3518847753610169\n",
"SE(beta_hat[3]): 0.3250760291745124\n"
]
}
],
"source": [
"y_hat = model.predict(X)\n",
"residuals = y.values - y_hat\n",
"residual_sum_of_squares = residuals.T @ residuals\n",
"sigma_squared_hat = residual_sum_of_squares[0, 0] / (N - p)\n",
"var_beta_hat = np.linalg.inv(X_with_intercept.T @ X_with_intercept) * sigma_squared_hat\n",
"for p_ in range(p):\n",
" standard_error = var_beta_hat[p_, p_] ** 0.5\n",
" print(f\"SE(beta_hat[{p_}]): {standard_error}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## confirm with `statsmodels`"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" OLS Regression Results \n",
"==============================================================================\n",
"Dep. Variable: y R-squared: 0.428\n",
"Model: OLS Adj. R-squared: 0.143\n",
"Method: Least Squares F-statistic: 1.499\n",
"Date: Sat, 12 Oct 2019 Prob (F-statistic): 0.307\n",
"Time: 14:32:46 Log-Likelihood: -8.7045\n",
"No. Observations: 10 AIC: 25.41\n",
"Df Residuals: 6 BIC: 26.62\n",
"Df Model: 3 \n",
"Covariance Type: nonrobust \n",
"==============================================================================\n",
" coef std err t P>|t| [0.025 0.975]\n",
"------------------------------------------------------------------------------\n",
"const -0.2867 0.247 -1.161 0.290 -0.891 0.317\n",
"x1 0.1750 0.297 0.590 0.577 -0.551 0.901\n",
"x2 -0.6929 0.352 -1.969 0.096 -1.554 0.168\n",
"x3 0.2234 0.325 0.687 0.518 -0.572 1.019\n",
"==============================================================================\n",
"Omnibus: 1.786 Durbin-Watson: 2.439\n",
"Prob(Omnibus): 0.409 Jarque-Bera (JB): 0.888\n",
"Skew: -0.307 Prob(JB): 0.641\n",
"Kurtosis: 1.675 Cond. No. 1.91\n",
"==============================================================================\n",
"\n",
"Warnings:\n",
"[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n"
]
}
],
"source": [
"import statsmodels.api as sm\n",
"ols = sm.OLS(y.values, X_with_intercept)\n",
"ols_result = ols.fit()\n",
"print(ols_result.summary())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.2"
},
"varInspector": {
"cols": {
"lenName": 16,
"lenType": 16,
"lenVar": 40
},
"kernels_config": {
"python": {
"delete_cmd_postfix": "",
"delete_cmd_prefix": "del ",
"library": "var_list.py",
"varRefreshCmd": "print(var_dic_list())"
},
"r": {
"delete_cmd_postfix": ") ",
"delete_cmd_prefix": "rm(",
"library": "var_list.r",
"varRefreshCmd": "cat(var_dic_list()) "
}
},
"types_to_exclude": [
"module",
"function",
"builtin_function_or_method",
"instance",
"_Feature"
],
"window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment