allisonmorgan · April 23, 2021 23:56
diff --git a/logistic_example.ipynb b/logistic_example.ipynb
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "operational-tomorrow",
   "metadata": {},
   "outputs": [],
   "source": [
    "import statsmodels.api as sm\n",
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "scheduled-somerset",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "gender\n",
       "female    0.443131\n",
       "male      0.208086\n",
       "Name: attrition, dtype: float64"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data = []\n",
    "\n",
    "N_men = 700; men_max_career_length = 10\n",
    "N_women = 300; women_max_career_length = 5\n",
    "\n",
    "for i in range(N_men):\n",
    "    leave = np.random.randint(0, men_max_career_length)\n",
    "    for each in [0]*(leave-1) + [1]:\n",
    "        data.append(['male', each])\n",
    "\n",
    "for j in range(N_women):\n",
    "    leave = np.random.randint(0, women_max_career_length)\n",
    "    for each in [0]*(leave-1) + [1]:\n",
    "        data.append(['female', each])\n",
    "        \n",
    "df = pd.DataFrame(data, columns=['gender', 'attrition'])\n",
    "df.groupby(['gender'])['attrition'].mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "forced-track",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "gender  attrition\n",
       "female  0             377\n",
       "        1             300\n",
       "male    0            2664\n",
       "        1             700\n",
       "Name: attrition, dtype: int64"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.groupby(['gender'])['attrition'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "rural-python",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.2627627627627628\n",
      "0.7957559681697614\n"
     ]
    }
   ],
   "source": [
    "women = df[df.gender == 'female']['attrition'].value_counts()\n",
    "men = df[df.gender == 'male']['attrition'].value_counts()\n",
    "\n",
    "print((men[1]/(men[0] + men[1]))/(men[0]/(men[0] + men[1])))\n",
    "print((women[1]/(women[0] + women[1]))/(women[0]/(women[0] + women[1])))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "otherwise-repeat",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Optimization terminated successfully.\n",
      "         Current function value: 0.540770\n",
      "         Iterations 5\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<table class=\"simpletable\">\n",
       "<caption>Logit Regression Results</caption>\n",
       "<tr>\n",
       "  <th>Dep. Variable:</th>       <td>attrition</td>    <th>  No. Observations:  </th>  <td>  4041</td>  \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Model:</th>                 <td>Logit</td>      <th>  Df Residuals:      </th>  <td>  4039</td>  \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Method:</th>                 <td>MLE</td>       <th>  Df Model:          </th>  <td>     1</td>  \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Date:</th>            <td>Fri, 23 Apr 2021</td> <th>  Pseudo R-squ.:     </th>  <td>0.03353</td> \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Time:</th>                <td>17:55:50</td>     <th>  Log-Likelihood:    </th> <td> -2185.3</td> \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>converged:</th>             <td>True</td>       <th>  LL-Null:           </th> <td> -2261.1</td> \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Covariance Type:</th>     <td>nonrobust</td>    <th>  LLR p-value:       </th> <td>7.646e-35</td>\n",
       "</tr>\n",
       "</table>\n",
       "<table class=\"simpletable\">\n",
       "<tr>\n",
       "      <td></td>         <th>coef</th>     <th>std err</th>      <th>z</th>      <th>P>|z|</th>  <th>[0.025</th>    <th>0.975]</th>  \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>const</th>     <td>   -1.3365</td> <td>    0.042</td> <td>  -31.467</td> <td> 0.000</td> <td>   -1.420</td> <td>   -1.253</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>is_female</th> <td>    1.1080</td> <td>    0.088</td> <td>   12.554</td> <td> 0.000</td> <td>    0.935</td> <td>    1.281</td>\n",
       "</tr>\n",
       "</table>"
      ],
      "text/plain": [
       "<class 'statsmodels.iolib.summary.Summary'>\n",
       "\"\"\"\n",
       "                           Logit Regression Results                           \n",
       "==============================================================================\n",
       "Dep. Variable:              attrition   No. Observations:                 4041\n",
       "Model:                          Logit   Df Residuals:                     4039\n",
       "Method:                           MLE   Df Model:                            1\n",
       "Date:                Fri, 23 Apr 2021   Pseudo R-squ.:                 0.03353\n",
       "Time:                        17:55:50   Log-Likelihood:                -2185.3\n",
       "converged:                       True   LL-Null:                       -2261.1\n",
       "Covariance Type:            nonrobust   LLR p-value:                 7.646e-35\n",
       "==============================================================================\n",
       "                 coef    std err          z      P>|z|      [0.025      0.975]\n",
       "------------------------------------------------------------------------------\n",
       "const         -1.3365      0.042    -31.467      0.000      -1.420      -1.253\n",
       "is_female      1.1080      0.088     12.554      0.000       0.935       1.281\n",
       "==============================================================================\n",
       "\"\"\""
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['is_female'] = df['gender'].apply(lambda x: 1 if x == 'female' else 0)\n",
    "X = df['is_female']\n",
    "X = sm.add_constant(X)\n",
    "y = df['attrition']\n",
    "   \n",
    "log_reg = sm.Logit(y, X).fit()\n",
    "log_reg.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "flush-stamp",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.2627627627627629\n",
      "0.7957559681697618\n"
     ]
    }
   ],
   "source": [
    "print(np.exp(log_reg.params['const']))\n",
    "print(np.exp(log_reg.params['const'] + log_reg.params['is_female']))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "handy-cincinnati",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"id": "operational-tomorrow",
	"metadata": {},
	"outputs": [],
	"source": [
	"import statsmodels.api as sm\n",
	"import pandas as pd\n",
	"import numpy as np"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"id": "scheduled-somerset",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"gender\n",
	"female 0.443131\n",
	"male 0.208086\n",
	"Name: attrition, dtype: float64"
	]
	},
	"execution_count": 2,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"data = []\n",
	"\n",
	"N_men = 700; men_max_career_length = 10\n",
	"N_women = 300; women_max_career_length = 5\n",
	"\n",
	"for i in range(N_men):\n",
	" leave = np.random.randint(0, men_max_career_length)\n",
	" for each in [0]*(leave-1) + [1]:\n",
	" data.append(['male', each])\n",
	"\n",
	"for j in range(N_women):\n",
	" leave = np.random.randint(0, women_max_career_length)\n",
	" for each in [0]*(leave-1) + [1]:\n",
	" data.append(['female', each])\n",
	" \n",
	"df = pd.DataFrame(data, columns=['gender', 'attrition'])\n",
	"df.groupby(['gender'])['attrition'].mean()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"id": "forced-track",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"gender attrition\n",
	"female 0 377\n",
	" 1 300\n",
	"male 0 2664\n",
	" 1 700\n",
	"Name: attrition, dtype: int64"
	]
	},
	"execution_count": 3,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"df.groupby(['gender'])['attrition'].value_counts()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"id": "rural-python",
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"0.2627627627627628\n",
	"0.7957559681697614\n"
	]
	}
	],
	"source": [
	"women = df[df.gender == 'female']['attrition'].value_counts()\n",
	"men = df[df.gender == 'male']['attrition'].value_counts()\n",
	"\n",
	"print((men[1]/(men[0] + men[1]))/(men[0]/(men[0] + men[1])))\n",
	"print((women[1]/(women[0] + women[1]))/(women[0]/(women[0] + women[1])))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"id": "otherwise-repeat",
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Optimization terminated successfully.\n",
	" Current function value: 0.540770\n",
	" Iterations 5\n"
	]
	},
	{
	"data": {
	"text/html": [
	"<table class=\"simpletable\">\n",
	"<caption>Logit Regression Results</caption>\n",
	"<tr>\n",
	" <th>Dep. Variable:</th> <td>attrition</td> <th> No. Observations: </th> <td> 4041</td> \n",
	"</tr>\n",
	"<tr>\n",
	" <th>Model:</th> <td>Logit</td> <th> Df Residuals: </th> <td> 4039</td> \n",
	"</tr>\n",
	"<tr>\n",
	" <th>Method:</th> <td>MLE</td> <th> Df Model: </th> <td> 1</td> \n",
	"</tr>\n",
	"<tr>\n",
	" <th>Date:</th> <td>Fri, 23 Apr 2021</td> <th> Pseudo R-squ.: </th> <td>0.03353</td> \n",
	"</tr>\n",
	"<tr>\n",
	" <th>Time:</th> <td>17:55:50</td> <th> Log-Likelihood: </th> <td> -2185.3</td> \n",
	"</tr>\n",
	"<tr>\n",
	" <th>converged:</th> <td>True</td> <th> LL-Null: </th> <td> -2261.1</td> \n",
	"</tr>\n",
	"<tr>\n",
	" <th>Covariance Type:</th> <td>nonrobust</td> <th> LLR p-value: </th> <td>7.646e-35</td>\n",
	"</tr>\n",
	"</table>\n",
	"<table class=\"simpletable\">\n",
	"<tr>\n",
	" <td></td> <th>coef</th> <th>std err</th> <th>z</th> <th>P>\|z\|</th> <th>[0.025</th> <th>0.975]</th> \n",
	"</tr>\n",
	"<tr>\n",
	" <th>const</th> <td> -1.3365</td> <td> 0.042</td> <td> -31.467</td> <td> 0.000</td> <td> -1.420</td> <td> -1.253</td>\n",
	"</tr>\n",
	"<tr>\n",
	" <th>is_female</th> <td> 1.1080</td> <td> 0.088</td> <td> 12.554</td> <td> 0.000</td> <td> 0.935</td> <td> 1.281</td>\n",
	"</tr>\n",
	"</table>"
	],
	"text/plain": [
	"<class 'statsmodels.iolib.summary.Summary'>\n",
	"\"\"\"\n",
	" Logit Regression Results \n",
	"==============================================================================\n",
	"Dep. Variable: attrition No. Observations: 4041\n",
	"Model: Logit Df Residuals: 4039\n",
	"Method: MLE Df Model: 1\n",
	"Date: Fri, 23 Apr 2021 Pseudo R-squ.: 0.03353\n",
	"Time: 17:55:50 Log-Likelihood: -2185.3\n",
	"converged: True LL-Null: -2261.1\n",
	"Covariance Type: nonrobust LLR p-value: 7.646e-35\n",
	"==============================================================================\n",
	" coef std err z P>\|z\| [0.025 0.975]\n",
	"------------------------------------------------------------------------------\n",
	"const -1.3365 0.042 -31.467 0.000 -1.420 -1.253\n",
	"is_female 1.1080 0.088 12.554 0.000 0.935 1.281\n",
	"==============================================================================\n",
	"\"\"\""
	]
	},
	"execution_count": 5,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"df['is_female'] = df['gender'].apply(lambda x: 1 if x == 'female' else 0)\n",
	"X = df['is_female']\n",
	"X = sm.add_constant(X)\n",
	"y = df['attrition']\n",
	" \n",
	"log_reg = sm.Logit(y, X).fit()\n",
	"log_reg.summary()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"id": "flush-stamp",
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"0.2627627627627629\n",
	"0.7957559681697618\n"
	]
	}
	],
	"source": [
	"print(np.exp(log_reg.params['const']))\n",
	"print(np.exp(log_reg.params['const'] + log_reg.params['is_female']))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "handy-cincinnati",
	"metadata": {},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.9.1"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 5
	}
No results found