Last active
October 17, 2019 13:06
-
-
Save steven-tey/d2ce8270f57556abba788f4ed790a147 to your computer and use it in GitHub Desktop.
CS146 Session 6.2 Pre-Class Work
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 48, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'0.24.2'" | |
] | |
}, | |
"execution_count": 48, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"import numpy as np\n", | |
"import matplotlib.pyplot as plt\n", | |
"from scipy import stats\n", | |
"import pandas as pd\n", | |
"pd.__version__" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 49, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>father</th>\n", | |
" <th>son</th>\n", | |
" <th>count</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>farm</td>\n", | |
" <td>farm</td>\n", | |
" <td>703</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>farm</td>\n", | |
" <td>unskilled</td>\n", | |
" <td>1478</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>farm</td>\n", | |
" <td>skilled</td>\n", | |
" <td>1430</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>farm</td>\n", | |
" <td>professional</td>\n", | |
" <td>1109</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>unskilled</td>\n", | |
" <td>farm</td>\n", | |
" <td>58</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>unskilled</td>\n", | |
" <td>unskilled</td>\n", | |
" <td>1756</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6</th>\n", | |
" <td>unskilled</td>\n", | |
" <td>skilled</td>\n", | |
" <td>1630</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>7</th>\n", | |
" <td>unskilled</td>\n", | |
" <td>professional</td>\n", | |
" <td>1568</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>8</th>\n", | |
" <td>skilled</td>\n", | |
" <td>farm</td>\n", | |
" <td>63</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>9</th>\n", | |
" <td>skilled</td>\n", | |
" <td>unskilled</td>\n", | |
" <td>1453</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>10</th>\n", | |
" <td>skilled</td>\n", | |
" <td>skilled</td>\n", | |
" <td>2068</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>11</th>\n", | |
" <td>skilled</td>\n", | |
" <td>professional</td>\n", | |
" <td>2483</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>12</th>\n", | |
" <td>professional</td>\n", | |
" <td>farm</td>\n", | |
" <td>61</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>13</th>\n", | |
" <td>professional</td>\n", | |
" <td>unskilled</td>\n", | |
" <td>749</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>14</th>\n", | |
" <td>professional</td>\n", | |
" <td>skilled</td>\n", | |
" <td>1183</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>15</th>\n", | |
" <td>professional</td>\n", | |
" <td>professional</td>\n", | |
" <td>3315</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" father son count\n", | |
"0 farm farm 703\n", | |
"1 farm unskilled 1478\n", | |
"2 farm skilled 1430\n", | |
"3 farm professional 1109\n", | |
"4 unskilled farm 58\n", | |
"5 unskilled unskilled 1756\n", | |
"6 unskilled skilled 1630\n", | |
"7 unskilled professional 1568\n", | |
"8 skilled farm 63\n", | |
"9 skilled unskilled 1453\n", | |
"10 skilled skilled 2068\n", | |
"11 skilled professional 2483\n", | |
"12 professional farm 61\n", | |
"13 professional unskilled 749\n", | |
"14 professional skilled 1183\n", | |
"15 professional professional 3315" | |
] | |
}, | |
"execution_count": 49, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df = pd.read_csv(\"https://gist.githubusercontent.com/cscheffler/482412b75d7b7c8ab83dd86e81d71403/raw/9ed65a9d2bb8455e4b225400f57f2d77f851aec9/socialmobility.csv\", sep=\",\")\n", | |
"df" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"As we can see from above, this is a multinomial distribution, since there are multiple identical independent trials where each trial has 𝑘 possible outcomes.\n", | |
"\n", | |
"We were also asked to choose a Dirichlet prior to have a uniform distribution over the probability vector parameter of the multinomial.\n", | |
"\n", | |
"Therefore, we will need to select a Dirichlet prior, which is the conjugate to a multinomial likelihood.\n", | |
"\n", | |
"According to <a href=\"https://en.wikipedia.org/wiki/Conjugate_prior#Table_of_conjugate_distributions\">Wikipedia</a>, the Dirichlet prior has $\\alpha$ as its prior hyperparameter.\n", | |
"\n", | |
"Therefore, to calculate the posterior hyperparameters, we will have to use the following formula:\n", | |
"<br>\n", | |
"<center>${\\displaystyle {\\boldsymbol {\\alpha }}+\\sum _{i=1}^{n}\\mathbf {x} _{i}\\!}$</center>\n", | |
"\n", | |
"where $x$ is the number of individual trials." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 64, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"There were 4720 fathers that were farmers, 5012 fathers that were unskilled, 6067 fathers that were skilled, and 5308 fathers that were professionals.\n" | |
] | |
} | |
], | |
"source": [ | |
"# Finding the number of trials conducted for each group\n", | |
"\n", | |
"farm = sum(df.loc[lambda df: df.father == 'farm', 'count'])\n", | |
"unskilled = sum(df.loc[lambda df: df.father == 'unskilled', 'count'])\n", | |
"skilled = sum(df.loc[lambda df: df.father == 'skilled', 'count'])\n", | |
"professional = sum(df.loc[lambda df: df.father == 'professional', 'count'])\n", | |
"print(f\"There were {farm} fathers that were farmers, {unskilled} fathers that were unskilled, {skilled} fathers that were skilled, and {professional} fathers that were professionals.\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 63, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Generating 4 samples:\n", | |
"[[0.07015414 0.14429606 0.34790591 0.43764389]\n", | |
" [0.18763825 0.13502804 0.41537149 0.26196223]\n", | |
" [0.08321873 0.30398893 0.22893338 0.38385896]\n", | |
" [0.105482 0.42474641 0.2931602 0.17661139]]\n", | |
"\n", | |
"Sum of each sample (should be 1):\n", | |
"[1. 1. 1. 1.]\n", | |
"\n", | |
"Value of probability density function at each sample:\n", | |
"[44.27132166 12.66470665 27.25039761 6.08103741]\n" | |
] | |
} | |
], | |
"source": [ | |
"# Setting up the Dirichlet prior\n", | |
"\n", | |
"dirichlet = stats.dirichlet([1, 1, 1, 1])\n", | |
"num_samples = 4\n", | |
"print('Generating', num_samples, 'samples:')\n", | |
"samples = dirichlet.rvs(size=num_samples)\n", | |
"print(samples)\n", | |
"print('')\n", | |
"print('Sum of each sample (should be 1):')\n", | |
"print(samples.sum(axis=1))\n", | |
"print('')\n", | |
"print('Value of probability density function at each sample:')\n", | |
"print(dirichlet.pdf(samples.transpose()))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 69, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Generating 4 samples for father == farm:\n", | |
"[[1144 1202 1190 1184]\n", | |
" [1202 1205 1189 1124]\n", | |
" [1192 1161 1176 1191]\n", | |
" [1200 1136 1237 1147]]\n", | |
"\n", | |
"Value of probability mass function at farm samples:\n", | |
"[1.39702523e-06 4.92175621e-07 2.38520122e-06 1.86885540e-07]\n", | |
"\n", | |
"Dirichlet posterior is:\n", | |
"[6.18481533e-05 6.23325987e-06 6.49976817e-05 1.13645796e-06]\n" | |
] | |
} | |
], | |
"source": [ | |
"# For father == farm:\n", | |
"mult_farm = stats.multinomial(farm, [1/4, 1/4, 1/4, 1/4])\n", | |
"num_samples = 4\n", | |
"print('Generating', num_samples, 'samples for father == farm:')\n", | |
"samples_farm = mult_farm.rvs(size=num_samples)\n", | |
"print(samples_farm)\n", | |
"print('')\n", | |
"print('Value of probability mass function at farm samples:')\n", | |
"print(mult_farm.pmf(samples_farm))\n", | |
"print('')\n", | |
"\n", | |
"# Multiplying the Dirichlet prior by the multinomial likelihood:\n", | |
"print('Dirichlet posterior is:')\n", | |
"print(dirichlet.pdf(samples.transpose())*mult_farm.pmf(samples_farm))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 73, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Generating 4 samples for father == unskilled:\n", | |
"[[1264 1340 1227 1181]\n", | |
" [1246 1258 1231 1277]\n", | |
" [1286 1252 1210 1264]\n", | |
" [1232 1241 1222 1317]]\n", | |
"\n", | |
"Value of probability mass function at unskilled samples:\n", | |
"[1.31911604e-08 1.82139939e-06 8.40623261e-07 3.08477345e-07]\n", | |
"\n", | |
"Dirichlet posterior is:\n", | |
"[5.83990104e-07 2.30674889e-05 2.29073181e-05 1.87586228e-06]\n" | |
] | |
} | |
], | |
"source": [ | |
"# For father == unskilled:\n", | |
"mult_unskilled = stats.multinomial(unskilled, [1/4, 1/4, 1/4, 1/4])\n", | |
"num_samples = 4\n", | |
"print('Generating', num_samples, 'samples for father == unskilled:')\n", | |
"samples_unskilled = mult_unskilled.rvs(size=num_samples)\n", | |
"print(samples_unskilled)\n", | |
"print('')\n", | |
"print('Value of probability mass function at unskilled samples:')\n", | |
"print(mult_unskilled.pmf(samples_unskilled))\n", | |
"print('')\n", | |
"\n", | |
"# Multiplying the Dirichlet prior by the multinomial likelihood:\n", | |
"print('Dirichlet posterior is:')\n", | |
"print(dirichlet.pdf(samples.transpose())*mult_unskilled.pmf(samples_unskilled))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 74, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Generating 4 samples for father == skilled:\n", | |
"[[1528 1523 1495 1521]\n", | |
" [1521 1527 1512 1507]\n", | |
" [1495 1488 1544 1540]\n", | |
" [1516 1543 1524 1484]]\n", | |
"\n", | |
"Value of probability mass function at skilled samples:\n", | |
"[1.72995639e-06 1.98539702e-06 9.17490927e-07 1.18041262e-06]\n", | |
"\n", | |
"Dirichlet posterior is:\n", | |
"[7.65874557e-05 2.51444708e-05 2.50019926e-05 7.17813332e-06]\n" | |
] | |
} | |
], | |
"source": [ | |
"# For father == skilled:\n", | |
"mult_skilled = stats.multinomial(skilled, [1/4, 1/4, 1/4, 1/4])\n", | |
"num_samples = 4\n", | |
"print('Generating', num_samples, 'samples for father == skilled:')\n", | |
"samples_skilled = mult_skilled.rvs(size=num_samples)\n", | |
"print(samples_skilled)\n", | |
"print('')\n", | |
"print('Value of probability mass function at skilled samples:')\n", | |
"print(mult_skilled.pmf(samples_skilled))\n", | |
"print('')\n", | |
"\n", | |
"# Multiplying the Dirichlet prior by the multinomial likelihood:\n", | |
"print('Dirichlet posterior is:')\n", | |
"print(dirichlet.pdf(samples.transpose())*mult_skilled.pmf(samples_skilled))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 75, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Generating 4 samples for father == professional:\n", | |
"[[1338 1363 1335 1272]\n", | |
" [1317 1332 1375 1284]\n", | |
" [1363 1275 1281 1389]\n", | |
" [1342 1355 1291 1320]]\n", | |
"\n", | |
"Value of probability mass function at professional samples:\n", | |
"[4.75625496e-07 5.25629175e-07 6.18915653e-08 1.07985813e-06]\n", | |
"\n", | |
"Dirichlet posterior is:\n", | |
"[2.10565693e-05 6.65693931e-06 1.68656976e-06 6.56665766e-06]\n" | |
] | |
} | |
], | |
"source": [ | |
"# For father == farm:\n", | |
"mult_professional = stats.multinomial(professional, [1/4, 1/4, 1/4, 1/4])\n", | |
"num_samples = 4\n", | |
"print('Generating', num_samples, 'samples for father == professional:')\n", | |
"samples_professional = mult_professional.rvs(size=num_samples)\n", | |
"print(samples_professional)\n", | |
"print('')\n", | |
"print('Value of probability mass function at professional samples:')\n", | |
"print(mult_professional.pmf(samples_professional))\n", | |
"print('')\n", | |
"\n", | |
"# Multiplying the Dirichlet prior by the multinomial likelihood:\n", | |
"print('Dirichlet posterior is:')\n", | |
"print(dirichlet.pdf(samples.transpose())*mult_professional.pmf(samples_professional))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"I'm not sure how to calculate the 95% confidence interval over the posterior probability - the values that I got above seem awfully small." | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.3" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment