Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save CleverProgrammer/9d9bf37d51f7bc6b1f93 to your computer and use it in GitHub Desktop.
Save CleverProgrammer/9d9bf37d51f7bc6b1f93 to your computer and use it in GitHub Desktop.
Udacity Intro to Descriptive Statistics
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 261,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"application/javascript": [
"IPython.notebook.set_autosave_interval(120000)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Autosaving every 120 seconds\n"
]
}
],
"source": [
"%autosave 120\n",
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"# For visualization\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"from IPython.display import SVG, display\n",
"from IPython.display import YouTubeVideo\n",
"%matplotlib inline\n",
"\n",
"# Useful Links\n",
"LaTeX_url = 'http://latex.wikia.com/wiki/List_of_LaTeX_symbols'\n",
"from IPython.display import Image"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#ex 1. BBC Memory Test"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"memory_df = pd.read_csv('downloads/Sample Memory Scores - PS3, PS4 - Sheet1.csv')"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Recognition Score</th>\n",
" <th>Temporal Memory Score</th>\n",
" <th>Average Recognition</th>\n",
" <th>Average Temporal</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>91</td>\n",
" <td>86</td>\n",
" <td>93.115556</td>\n",
" <td>78.026667</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>95</td>\n",
" <td>78</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>95</td>\n",
" <td>56</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>91</td>\n",
" <td>81</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>100</td>\n",
" <td>75</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Recognition Score Temporal Memory Score Average Recognition \\\n",
"0 91 86 93.115556 \n",
"1 95 78 NaN \n",
"2 95 56 NaN \n",
"3 91 81 NaN \n",
"4 100 75 NaN \n",
"\n",
" Average Temporal \n",
"0 78.026667 \n",
"1 NaN \n",
"2 NaN \n",
"3 NaN \n",
"4 NaN "
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"memory_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Recognition Score 225\n",
"Temporal Memory Score 225\n",
"Average Recognition 1\n",
"Average Temporal 1\n",
"dtype: int64"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"memory_df.count()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"Recognition Score 93.115556\n",
"Temporal Memory Score 78.026667\n",
"Average Recognition 93.115556\n",
"Average Temporal 78.026667\n",
"dtype: float64"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"memory_df.mean()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"10"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"78-68"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The mean, or average, recognition score is 93.1%\n",
"The mean, or average, temporal memory score is 78.0%\n"
]
}
],
"source": [
"print ('The mean, or average, recognition score is %1.1f%%' %(memory_df['Recognition Score'].mean()))\n",
"print ('The mean, or average, temporal memory score is %1.1f%%' %(memory_df['Temporal Memory Score'].mean()))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#ex 2. National Hockey League"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"hockey_df = pd.read_csv('downloads/NHL - Problem Set 3 - Sheet1.csv')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**a.) What is the mean, or average, height of players from Detroit Red Wings and San Jose Sharks?**"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Detroit Red Wings 72.857143\n",
"San Jose Sharks 73.833333\n",
"dtype: float64"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"hockey_df.mean()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**b.) What is the most frequently occuring height of players from Detroit Red Wings and San Jose Sharks?**"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Detroit Red Wings</th>\n",
" <th>San Jose Sharks</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>72</td>\n",
" <td>74</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Detroit Red Wings San Jose Sharks\n",
"0 72 74"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"hockey_df.mode()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**c.) What is the median height, or height right in between, of the players from Detroit Red Wings and San Jose Sharks?**"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Detroit Red Wings 72.5\n",
"San Jose Sharks 74.0\n",
"dtype: float64"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"hockey_df.median()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#ex 3. Deal or No Deal"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Briefcase USD</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0.01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>5.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>10.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>25.00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Briefcase USD\n",
"0 0.01\n",
"1 1.00\n",
"2 5.00\n",
"3 10.00\n",
"4 25.00"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"deal_df = pd.read_csv('downloads/Deal or No Deal- - Problem Set 3 - Sheet1.csv')\n",
"deal_df.columns = ['Briefcase USD']\n",
"deal_df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**a.) **What is the **median** amount, or amount right in between, of the briefcases from the Deal or No Deal data set?"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The median value: $875.00\n"
]
}
],
"source": [
"print (\"The median value: $%1.2f\" %deal_df['Briefcase USD'].median())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**b.) **What is the **mode** amount, or most frequently occuring amount, of the briefcases from the Deal or No Deal data set?"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"No mode\n"
]
}
],
"source": [
"# in order to use the np.dtype on the data, I have to convert it into a series. Since when there is no mode it returns\n",
"# the column name, I am using np.object0 to find out whether it is a string. I believe object0 is a string object\n",
"# in a dataframe. Though I have to investigate further.\n",
"\n",
"ser1 = pd.Series(deal_df)\n",
"if np.dtype(ser1).type is np.object0:\n",
" print ('No mode')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**c.) **How many briefcases carry **less** the median?. "
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Values above the median: 13\n"
]
}
],
"source": [
"# Rename column\n",
"deal_df.columns = ['Briefcase USD']\n",
"\n",
"# Filtered values above the median\n",
"above_median = deal_df[deal_df['Briefcase USD'] > deal_df['Briefcase USD'].median()]\n",
"\n",
"# count above mean values and pull the values and not the categorical \"Briefcase USD\".\n",
"print (\"Values above the median: %1.0f\" %above_median.count().values[0])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**d.) **How many briefcases carry **above** the median?. "
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Values below the median: 13\n"
]
}
],
"source": [
"# Filtered values below the median\n",
"below_median = deal_df[deal_df['Briefcase USD'] < deal_df['Briefcase USD'].median()]\n",
"\n",
"# count above mean values and pull the values and not the categorical \"Briefcase USD\".\n",
"print (\"Values below the median: %1.0f\" %below_median.count().values[0])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Mean > Median** in this dataset\n",
"\n",
"The mean of this dataset would most likely be greater than the median since the high USD briefcases are pulling it to the right"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**e.) **What is the **mean** amount, or the average amount, of the briefcases from the Deal or No Deal data set?"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Mean: $131477.54\n"
]
}
],
"source": [
"print (\"Mean: $%1.2f\" %(deal_df.mean().values[0]))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**f.)** What is the **proportion** of the briefcases above the mean from the Deal or No Deal data set?"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Values above the mean: 6\n"
]
}
],
"source": [
"# Filtered values above the mean\n",
"above_mean = deal_df[deal_df['Briefcase USD'] > deal_df['Briefcase USD'].mean()]\n",
"\n",
"# Count the mean values.\n",
"print (\"Values above the mean: %1.0f\" %above_mean.count().values[0])"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Proportion above the mean: 0.23\n"
]
}
],
"source": [
"above_mean_proportion = ( above_mean.count().values[0] ) / ( len(deal_df['Briefcase USD']) )\n",
"print (\"Proportion above the mean: %1.2f\" %above_mean_proportion)"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"<matplotlib.axes._subplots.AxesSubplot at 0x10a66d5f8>"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAYAAAAERCAYAAABy/XBZAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAFLtJREFUeJzt3XuQJWV5x/HvDLAM7A5rBQ9aJsRrfIixEhSSjavCboko\nELyQi4kYLwmoERMTLVchCVoEFWNJFDVEUcRLrlIbSxM0mIi31SgmiFLqoyTZmKoYneXizuzuMCwz\n+aN7POMyt3PobnZ4v58qinN6+3Q//czM+Z3uPv32yNzcHJKk8oze2wVIku4dBoAkFcoAkKRCGQCS\nVCgDQJIKZQBIUqEOXWmGiNgEXJKZWyPiOODdwBzwLeCczPR7pJK0Bi27BxAR24ArgMPrSa8FLs7M\nJ9bTzmi1OklSa1Y6BHQzcBYwUj/fBxwdESPAODDTYm2SpBYtGwCZuR3Yv2DS24C3Al8HjgE+3V5p\nkqQ2DXoS+IPAEzPzp4EPAG9uviRJUhdWPAl8gCOByfrxd4HNK73gY1/4r7mRHx5Buudm9uziaaec\n2NjyJOkg1dwb5xJWGwDz3/Q5B7g6IqaBO4BzV3rhCCNMTk0PWd7dzUxNMzExufKMB6Feb3zN1t40\ne9FnL/rsRV+vN976OlYMgMzcSf1JPzP/GfjnlmuSJHXAC8EkqVAGgCQVygCQpEIZAJJUKANAkgpl\nAEhSoQwASSqUASBJhTIAJKlQBoAkFcoAkKRCGQCSVCgDQJIKZQBIUqEMAEkqlAEgSYUyACSpUCve\nESwiNgGXZObWiDgGuAK4H9X9Kp9b3zFMkrTGLLsHEBHbqN7wD68n/Snwgcw8GbgQeHS75UmS2rLS\nIaCbgbPo351+M3BsRHwCOBv4ZIu1SZJatGwAZOZ2YP+CSQ8Bbs3MJwPfAV7VXmmSpDateA7gALcA\nH6kffxR43WpeNL5hbMDVLG1mZIxeb7yx5XVtLdfeNHvRZy/67EV3Bg2AzwFnAB8ETgZuWs2LJqem\nB1zN0mamppmYmGxseV3q9cbXbO1Nsxd99qLPXvR1EYSrDYC5+v+vAN4dEb8D3A48u5WqJEmtWzEA\n6q95bq4ffwc4teWaJEkd8EIwSSqUASBJhTIAJKlQBoAkFcoAkKRCGQCSVCgDQJIKZQBIUqEMAEkq\nlAEgSYUyACSpUAaAJBXKAJCkQhkAklQoA0CSCmUASFKhDABJKtSKARARmyLiugOmPTsiPt9eWZKk\nti17S8iI2AY8B5haMO0xwG+1XJckqWUr7QHcDJwFjABExNHA64Dfn58mSVqblg2AzNwO7AeIiFHg\nPcDLWbBHIElam5Y9BHSAE4BHAJcDY8CjIuLSzHz5Si8c3zA2ZHl3NzMyRq833tjyuraWa2+aveiz\nF332ojurDoDMvB54NEBEPBj4m9W8+QNMTk0PV90iZqammZiYbGx5Xer1xtds7U2zF332os9e9HUR\nhKv9GujcAc9HFpkmSVpDVtwDyMydwOaVpkmS1hYvBJOkQhkAklQoA0CSCmUASFKhDABJKpQBIEmF\nMgAkqVAGgCQVygCQpEIZAJJUKANAkgplAEhSoQwASSqUASBJhTIAJKlQBoAkFcoAkKRCrXhHsIjY\nBFySmVsj4njgMuAu4A7guZn5/ZZrlCS1YNk9gIjYBlwBHF5Pegvw0szcCmwHXtVueZKktqx0COhm\n4Cyqm8AD/HpmfrV+fBiwr63CJEntWjYAMnM7sH/B8/8DiIjNwHnAn7VanSSpNSueAzhQRDwLuAA4\nPTNvWc1rxjeMDbqaJc2MjNHrjTe2vK6t5dqbZi/67EWfvejOQAEQEc8BXghsyczbVvu6yanpQeta\n0szUNBMTk40tr0u93viarb1p9qLPXvTZi74ugnC1XwOdi4hR4K3ABmB7RFwXEa9trTJJUqtW3API\nzJ3A5vrp0a1WI0nqjBeCSVKhDABJKpQBIEmFMgAkqVAGgCQVygCQpEIZAJJUKANAkgplAEhSoQwA\nSSqUASBJhTIAJKlQBoAkFcoAkKRCGQCSVCgDQJIKteINYSJiE3BJZm6NiEcAVwGzwE3AeZk5126J\nkqQ2LLsHEBHbgCuAw+tJlwIXZOZJwAjw9HbLkyS1ZaVDQDcDZ1G92QM8NjM/Uz/+GHBKW4VJktq1\nbABk5nZg/4JJIwseTwEb2yhKktS+QU8Czy54PA7c3mAtkqQOrXgS+AA3RMTJmflp4DTgX1bzovEN\nYwMXtpSZkTF6vfHGlte1tVx70+xFn73osxfdWW0AzH/T5xXAFRGxDvg6cPVqXjw5NT1EaYubmZpm\nYmKyseV1qdcbX7O1N81e9NmLPnvR10UQrhgAmbkT2Fw//jawpd2SJEld8EIwSSqUASBJhTIAJKlQ\nBoAkFcoAkKRCGQCSVCgDQJIKZQBIUqEMAEkqlAEgSYUyACSpUAaAJBXKAJCkQhkAklQoA0CSCmUA\nSFKhDABJKtSg9wQmIkaBdwOPpLpJ/LmZmU0XJklq1zB7AKcC6zPzCcBFwOuaLUmS1IVhAmAfsDEi\nRoCNwEyzJUmSujDwISBgBzAGfBM4Gjiz0YokSZ0YZg9gG7AjMwM4HnhfRKxrtixJUtuG2QNYD+yu\nH98GHAYcstwLxjeMDbGaxc2MjNHrjTe2vK6t5dqbZi/67EWfvejOMAHwJuC9EfFZqjf/8zNz33Iv\nmJyaHqa2Rc1MTTMxMdnY8rrU642v2dqbZi/67EWfvejrIggHDoDMvB14Zgu1SJI65IVgklQoA0CS\nCmUASFKhDABJKpQBIEmFMgAkqVAGgCQVygCQpEIZAJJUKANAkgplAEhSoQwASSqUASBJhTIAJKlQ\nBoAkFcoAkKRCGQCSVKhhbglJRJwPnEl1S8i3Z+b7Gq1KktS6gfcAImIL8LjM3AxsAR7WcE2SpA4M\nswdwKvC1iPgwcBTwymZLkiR1YZgA6AHHAr9E9en/I8BxTRYlSWrfMAGwC/hGZu4HvhUR0xFx/8zc\ntdQLxjeMDV3ggWZGxuj1xhtbXtfWcu1Nsxd99qLPXnRnmAD4HPAy4NKIeBCwHrhluRdMTk0PsZrF\nzUxNMzEx2djyutTrja/Z2ptmL/rsRZ+96OsiCAc+CZyZ/wjcEBFfojr885LMnGu8MklSq4b6Gmhm\nvqrpQiRJ3fJCMEkqlAEgSYUyACSpUAaAJBXKAJCkQhkAklQoA0CSCmUASFKhDABJKpQBIEmFMgAk\nqVAGgCQVygCQpEIZAJJUKANAkgplAEhSoQwASSrUUHcEA4iIY4B/A56Umd9qriRJUheG2gOIiMOA\ndwJ7mi1HktSVYQ8BvQm4HPhug7VIkjo0cABExPOBicy8tp400mhFkqROjMzNzQ30goj4NDBX/3c8\nkMDTM/N7i83/8S/sHGwFK5jZs4unnXJik4uUpINR6x+uBz4JnJknzz+OiOuAFy315j9vcmp6iNIW\nNzM1zcTEZGPL61KvN75ma2+aveizF332oq/XG299HX4NVJIKNfTXQAEyc2tThUiSuuUegCQVygCQ\npEIZAJJUKANAkgplAEhSoQwASSqUASBJhTIAJKlQBoAkFcoAkKRCGQCSVCgDQJIKZQBIUqEMAEkq\nlAEgSYUyACSpUAaAJBVq4DuCRcRhwJXAg4HDgYsz86NNFyZJatcwewBnAxOZeRLwVODtzZYkSerC\nMPcE/hBwdf14FNjfXDmSpK4MHACZuQcgIsapwuAPmy5qObOzs+ze/YPGl7thwzijo54SkVSOkbm5\nuYFfFBHHAtuBd2TmVcvN+/Ev7Bx8Bcu47fs72c/hHHnk+saWuXfvHp625VFs3LixsWVK0j000vYK\nhjkJ/ADgWuAlmXndal4zOTU96GqWNDU5zboN65llXWPLnJ27g127JpmZaXcPoNcbZ2JistV1rBX2\nos9e9NmLvl5vvPV1DHMO4AJgI3BhRFxYTzstM5t7l5cktW6YcwAvA17WQi2SpA551lOSCmUASFKh\nDABJKpQBIEmFMgAkqVAGgCQVygCQpEINcyGYVmF2dpapqR+9onHdull27x7+KsfZ2VmARscsanoM\npMW2ezGD9KKN7YZyx39a7c9oUPazWQfrlcBahampST7xxZs5YsGYRRvW38rUnjuGXuatu77H6Oih\n3O/Hjm6iRPbt3cOTNz2Co45qbgykxbZ7MYP0ounthna2fa1Y7c9oEPaz+X4+/OE/0djylmIAtOiI\nI9dz5Pp+iq/fMMYsw4+YsXfPFKOjh/zIMg9GB273YgbpxVrZ7rVkNT8jrd5a7Wd5+2uSJMAAkKRi\nGQCSVCgDQJIKZQBIUqEMAEkq1DC3hBwF/hz4WeAO4JzM/I+mC5MktWuYPYBnAOsyczPwauDNzZYk\nSerCMAHweODjAJn5ReDERiuSJHVimAA4Cti94Pld9WEhSdIaMsxQELuBhdc8j2bm7FIz7/3B99k7\nNfzwBwe6c2Yvd+0da2x5UI27MTm5e+UZBzA5uZt9e/f8yLRRZth7D8YCmt63h9HRQ9m7p5mBp7ra\n7sUM0oumtxva2fZh3dNBAge12p/RIJrqZ9e9aEJb/ezCyNzc3EAviIizgDMz8wUR8YvAH2fmGa1U\nJ0lqzTB7AH8PPDkidtTPX9BgPZKkjgy8ByBJum/w5K0kFcoAkKRCGQCSVCgDQJIK1cotIe+L4wVF\nxGHAlcCDgcOBi4FvAFcBs8BNwHmZORcR5wIvBPYDF2fmP0bEEcAHgR4wCTwvM3fVX6V9Sz3vtZl5\nUb2+1wCn19N/PzOv72xjVykijgH+DXgSVQ+uosBeRMT5wJnAYcDbgR0U2Iv67/7dwCOptv1c4C4K\n60VEbAIuycytEfEIOtz+iLg/8FfAGPC/wAsyc99Stba1B3BfHC/obGAiM08Cngq8g2q7LqinjQBP\nj4gHAr8LbAaeArwhItYBvwPcWM/7fuCP6uX+BfAbmfkEYFNEHB8RjwVOysxNwK/X6zqo1IH4TmAP\n1bZfSoG9iIgtwOPq3/UtwMMo9/fiVGB9XfNFwOsprBcRsQ24gupDInT/d3Eh8MF6GTcAL1qu3rYC\n4L44XtCHqJoLVd/uBB6bmZ+pp30MOAX4eWBHZt6ZmbuBm6n2hH7Yk/r/p0TEOFVQ/lc9/Z/qZTwe\nuBYgM/8HODQijm5z44bwJuBy4Lv181J7cSrwtYj4MPBR4CPACYX2Yh+wMSJGgI3ADOX14mbgLKo3\ne+j27+L+Byxjfn1LaisA7nPjBWXmnsycqn8gH6JK54XbNEn1S38U8IMlpu9eZtpqlnFQiIjnU+0N\nXVtPGqH/Cw8F9YJqd/0E4FeAF1Ptfpfaix1Uhx6+SbV3eBmF9SIzt1MdkpnX9fYvnD7FCj1p6015\noPGC1oqIOBb4JPD+zPxrquN6844Cbufu2z6+yPTFpq1mGQeLF1BdDX4dcDzwPqo3wnkl9WIX1XHZ\n/Zn5LWCaH/2jK6kX26g+2QbV78X7qc6LzCupF/O6fo/YXc+zcNqS2gqAHVQnJ6hPYHy1pfV0JiIe\nQLXLtS0zr6on3xARJ9ePTwM+A3wJeGJEHB4RG4Gfpjr588OezM+bmZPATEQ8rN5tPrVexg7gKREx\nEhE/SRWgt7a/lauTmSdn5pbM3Ap8BXgu8PESewF8juqcEBHxIOBI4F8K7cV6+p9Wb6P6kkmRfyML\ndLX9I5l5y2LLWK64Vr4FxH1zvKALqD7ZXRgR8+cCXgZcVp/A+TpwdX2G/zLgs1QBe0Fm3hERlwPv\ni4jPUn0z6tn1Ml4M/CVwCPBP899kqOf7Qr2Ml3SyhcObA14BXFFaL+pvb5wUEV+iX99OCuwF1Xmh\n99Y1HgacT/UtsRJ7MT/GTld/F+fV815cL+NcYGLBMhblWECSVKg1fWJWkjQ8A0CSCmUASFKhDABJ\nKpQBIEmFMgAkqVBtXQcg3U09cNo/AN+mukR+HdXAVa9fZN4HAVdk5hnLLO8oqiuzR4FnZea326j7\ngHU+BLguMx96wPTZzBytH58HnEO1jXPApZn5gfrfdlINoDdDtf23AX9wsIxkqbIYAOra9fUVxETE\neuAbEbE9M7+5cKbM/F9gyTf/2vHAHZn5+HZKHVxUQwH/NvCL9cU9PeDLEfGVzPwaVSCclpnfqec/\nHbgmIo6rr+SUOmMA6N60gWq8+B/ADz8d/yvVG/tvAh/KzIfUw3D8BXAs1dgq51MNL3Il8IB6JM5n\nUd2D4vFUI7X+SWb+XUT8KvBy4Ij6v3My87MR8XKqISxmgS9l5osj4hCqq1lPprrq8qrMfMuA2/RA\nqk/+66nCaSIifplqzKC7ycxr6quInw28bcB1SfeI5wDUtRMj4oaIuBH4T6rDKfNDSs8B12TmcVSX\nsc9fpv5W4MrMPBF4OtVIk/uoPml/OTOfAfwecGT92lOohuw4jGo89DMy83jgjcAr6zf6V1ON4nkC\nMFsfcjoXmMvME4BNwDMi4gkDbt81VENBfDciPhXVTTtuXbCNi7kJOG7A9Uj3mAGgrn05Mx+TmT8H\nHAM8NCJeveDfv7jIa04BLoqIG6jeYA+luvHKwqF2T6IaL4XM/F5mPjoz7wSeCZwWERcBz6O6Ycld\nwOeBLwOvAd5RH3I6BXhavZ5/BR4EPPqAWu42qm09SNdcve47M/OZwKOAv6UKmK/Wh4aWs3eFf5ca\n5yEg3Wsyc099+GbhTSsWu33dKLA1M28HiIgfp7oRzUkL5rmTBYEQ1a34vg9cTzVc9aeAG4GX1ut+\nRv2mfDrVSKZn1+t5ZWZ+uF7G/K35FrqNu4+xfkw9nYh4HvA/mflJqhvmXB4RF1Md0los3KC6Gcjf\nLfFvUmvcA9C9pj4Us4VqxMjlfJJ6tMOI+BmqN/IjDpjnM8Cv1fMcQ/WGfzzVOYY31M9PBw6JiKMj\n4uvATZn5Gqphvn+2Xs8LI+LQiNhANVrjLyxcST0877cj4qwFk18IfKJ+PAK8Puq7U0XEoVT3yP33\nBfMvDKozgZ/DANC9wD0AdWmO+hxA/Xw91afiNy4zP1T3T31Xfd5gBDi73nuYWzDPn1MNzX1j/fyl\nVGP1fwX4BtU5hauBJ2XmLRHxLuD6iNgL/DfwXqoheH+K6l6qhwLvWXA7v4WeQ/XJ/kKqr3LeSB1Q\nmXlVVLfm2xERd9X1/nVmXrng9ddExEz9eAJ4ambuWbptUjscDlqSCuUhIEkqlAEgSYUyACSpUAaA\nJBXKAJCkQhkAklQoA0CSCmUASFKh/h9YyAlZc1h0QAAAAABJRU5ErkJggg==\n",
"text/plain": [
"<matplotlib.figure.Figure at 0x10a66dbe0>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"sns.distplot(deal_df,kde=False, axlabel='Briefcase USD')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**g.)** What gives a more **accurate measurement of the center** to describe the distribution of prices?"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The **median** of $875 gives a better representation of the data. That is because median captures the amount distributed between most of the boxes. \n",
"\n",
"**General Rule for Skewed Distributions: Median is a better measurement of the center**"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Lesson 4 Variability: What is Standard Deviation?"
]
},
{
"cell_type": "code",
"execution_count": 227,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<img src=\"https://upload.wikimedia.org/wikipedia/commons/thumb/8/8c/Standard_deviation_diagram.svg/2000px-Standard_deviation_diagram.svg.png\"/>"
],
"text/plain": [
"<IPython.core.display.Image object>"
]
},
"execution_count": 227,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"Image(url='https://upload.wikimedia.org/wikipedia/commons/thumb/8/8c/Standard_deviation_diagram.svg/2000px-Standard_deviation_diagram.svg.png') "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Standard Deviation in A Nutshell\n",
" -*Standard deviation is the measure of spread*\n",
" \n",
"LaTeX url = 'http://latex.wikia.com/wiki/List_of_LaTeX_symbols'\n",
"\n",
"\n",
"$$ \\bar{x} = Sample \\: mean\\:\\: \\mu = Population \\: mean\\:\\: \\sigma = Standard\\: deviation\\:\\: n = Sample\\:size $$\n",
"\n",
"$$ Population\\: Standard\\: Deviation: \\:\\sigma = \\sqrt{\\frac{\\Sigma( Xi - \\mu )^2} n} $$\n",
"\n",
"$$ Sample\\: Standard\\: Deviation: \\:S = \\sqrt{\\frac{\\Sigma( Xi - \\bar{x} )^2} {n -1} } $$\n",
"\n",
"###Why the n-1 in sample standard deviation formula?\n",
"-The idea is to account for the lower distribution in sample sizes since they are mostly from the middle of the population\n",
"1. Deviation_from_mean = X[i] - mean(X)\n",
"2. Sum_of_Squares = Sum (Each deviation from mean squared)\n",
"3. Variance = mean(Sum_of_squares)\n",
"4. Standard_Deviation = sqrt(Variance)"
]
},
{
"cell_type": "code",
"execution_count": 123,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"17076.965197598784"
]
},
"execution_count": 123,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sample = [33219, 36254, 38801, 46335, 46840, 47596, 55130, 56863, 78070, 88830]\n",
"\n",
"def standard_deviation(a_list):\n",
" X = np.mean(a_list)\n",
" n = len(a_list)\n",
" \n",
" avg_dev = []\n",
" for i in a_list:\n",
" avg_dev.append( (abs(i-X))**2 )\n",
" \n",
" \n",
" variance = np.mean(avg_dev)\n",
" standard_deviation = np.sqrt(variance)\n",
" return standard_deviation\n",
"\n",
"standard_deviation(sample)"
]
},
{
"cell_type": "code",
"execution_count": 124,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The mean is of the average deviations is: 0.0\n"
]
}
],
"source": [
"# Deviation from mean list\n",
"b= [-19574.800000000003, -16539.800000000003, -13992.800000000003, -6458.8000000000029, -5953.8000000000029, \n",
" -5197.8000000000029, 2336.1999999999971, 4069.1999999999971, 25276.199999999997, 36036.199999999997]\n",
"\n",
"# Rounded deviation from the mean list\n",
"c = []\n",
"for i in b:\n",
" c.append(round(i,1))\n",
"\n",
"print (\"The mean is of the average deviations is: %0.1f\" %np.mean(c))"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Mean of absolute deviations: $13543.56 \n",
"\n",
"1. Squared deviation of $19574.8: $383172795.04\n",
"2. Squared deviation of $16539.8: $273564984.04\n",
"3. Squared deviation of $13992.8: $195798451.84\n",
"4. Squared deviation of $6458.8: $41716097.44\n",
"5. Squared deviation of $5953.8: $35447734.44\n",
"6. Squared deviation of $5197.8: $27017124.84\n",
"7. Squared deviation of $2336.2: $5457830.44\n",
"8. Squared deviation of $4069.2: $16558388.64\n",
"9. Squared deviation of $25276.2: $638886286.44\n",
"10. Squared deviation of $36036.2: $1298607710.44\n",
"\n",
"The variance or average squared deviation is: $291622740.36 \n",
"\n",
"The standard deviation is: $17076.97\n",
"\n",
"Variance = average squared deviation\n",
"Standard_Deviation = sqrt(variance)\n"
]
}
],
"source": [
"# Absolute Deviation list and Sum of Squares list\n",
"abs_dev = []\n",
"ss = []\n",
"\n",
"for i in c:\n",
" abs_dev.append(abs(i))\n",
"print('Mean of absolute deviations: $%1.2f \\n' %np.mean(abs_dev))\n",
"\n",
"counter = 0\n",
"for i in abs_dev:\n",
" counter+=1\n",
" ss.append(np.square(i))\n",
" print('%s. Squared deviation of $%s: $%s' % (counter, i, np.square(i)))\n",
"\n",
"print('\\nThe variance or average squared deviation is: $%1.2f \\n' %(np.mean(ss)))\n",
"\n",
"variance = np.mean(ss)\n",
"standard_deviation = np.sqrt(variance)\n",
"print('The standard deviation is: $%1.2f' %standard_deviation)\n",
"print ()\n",
"print('Variance = average squared deviation')\n",
"print('Standard_Deviation = sqrt(variance)')"
]
},
{
"cell_type": "code",
"execution_count": 89,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"17076.965197598784"
]
},
"execution_count": 89,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def standard_deviation(a_list):\n",
" X = np.mean(a_list)\n",
" n = len(a_list)\n",
" \n",
" avg_dev = []\n",
" for i in a_list:\n",
" avg_dev.append( (abs(i-X))**2 )\n",
" \n",
" \n",
" variance = np.mean(avg_dev)\n",
" standard_deviation = np.sqrt(variance)\n",
" return standard_deviation\n",
"\n",
"standard_deviation(sample)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## OR"
]
},
{
"cell_type": "code",
"execution_count": 92,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"17076.965197598784"
]
},
"execution_count": 92,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.std(sample)"
]
},
{
"cell_type": "code",
"execution_count": 93,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"sample2 = [38946,43420,49191,50430,50557,52580,53595,54135,60181,62076]"
]
},
{
"cell_type": "code",
"execution_count": 94,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"6557.1632654677742"
]
},
"execution_count": 94,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.std(sample2)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#ex 4. Social Networkers Salary"
]
},
{
"cell_type": "code",
"execution_count": 97,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"salary_df = pd.read_csv('downloads/Copy of Sample Social Networkers Salary n=100 - Lesson 4 - Sheet1.csv')"
]
},
{
"cell_type": "code",
"execution_count": 98,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>salary</th>\n",
" <th>2. deviation (x-xbar)</th>\n",
" <th>3. squared deviation (x-xbar)^2</th>\n",
" <th>Unnamed: 3</th>\n",
" <th>1. In this cell, find the average of cells a2 through a101.</th>\n",
" <th>4. In this cell, find the average of cells c2 through c101.</th>\n",
" <th>5. Finally, take the square root of the calculation described in cell f1.</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>59147.29</td>\n",
" <td>8560.9267</td>\n",
" <td>7.328947e+07</td>\n",
" <td>NaN</td>\n",
" <td>50586.3633</td>\n",
" <td>113570640.2</td>\n",
" <td>10656.95267</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>61379.14</td>\n",
" <td>10792.7767</td>\n",
" <td>1.164840e+08</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>55683.19</td>\n",
" <td>5096.8267</td>\n",
" <td>2.597764e+07</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>56272.76</td>\n",
" <td>5686.3967</td>\n",
" <td>3.233511e+07</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>52055.88</td>\n",
" <td>1469.5167</td>\n",
" <td>2.159479e+06</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" salary 2. deviation (x-xbar) 3. squared deviation (x-xbar)^2 \\\n",
"0 59147.29 8560.9267 7.328947e+07 \n",
"1 61379.14 10792.7767 1.164840e+08 \n",
"2 55683.19 5096.8267 2.597764e+07 \n",
"3 56272.76 5686.3967 3.233511e+07 \n",
"4 52055.88 1469.5167 2.159479e+06 \n",
"\n",
" Unnamed: 3 1. In this cell, find the average of cells a2 through a101. \\\n",
"0 NaN 50586.3633 \n",
"1 NaN NaN \n",
"2 NaN NaN \n",
"3 NaN NaN \n",
"4 NaN NaN \n",
"\n",
" 4. In this cell, find the average of cells c2 through c101. \\\n",
"0 113570640.2 \n",
"1 NaN \n",
"2 NaN \n",
"3 NaN \n",
"4 NaN \n",
"\n",
" 5. Finally, take the square root of the calculation described in cell f1. \n",
"0 10656.95267 \n",
"1 NaN \n",
"2 NaN \n",
"3 NaN \n",
"4 NaN "
]
},
"execution_count": 98,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"salary_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 107,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"10656.952668536167"
]
},
"execution_count": 107,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.std(salary_df['salary'])"
]
},
{
"cell_type": "code",
"execution_count": 120,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The mean: $50586.36\n",
"1 std above the mean: $61243.32\n",
"1 std below the mean: $39929.41\n"
]
}
],
"source": [
"x = np.mean(salary_df['salary'])\n",
"above_std = x + np.std(salary_df['salary'])\n",
"below_std = x - np.std(salary_df['salary'])\n",
"print (\"The mean: $%1.2f\" %x)\n",
"print (\"1 std above the mean: $%1.2f\" %above_std)\n",
"print (\"1 std below the mean: $%1.2f\" %below_std)"
]
},
{
"cell_type": "code",
"execution_count": 121,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"2.4545246704860579"
]
},
"execution_count": 121,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.std([18,20,23,18,21,15,17,22,21])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#ex 5. Udacians Facebook Friends & More Standard Deviations"
]
},
{
"cell_type": "code",
"execution_count": 141,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"friends_df = pd.read_csv('downloads/How many Facebook friends do Udacians have%3F - Lesson 3 - Sheet1.csv')"
]
},
{
"cell_type": "code",
"execution_count": 142,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Timestamp</th>\n",
" <th>How many Facebook friends do you have?</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1/2/2013 15:19:35</td>\n",
" <td>589</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1/2/2013 15:19:48</td>\n",
" <td>241</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1/2/2013 15:20:15</td>\n",
" <td>1116</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1/2/2013 15:20:50</td>\n",
" <td>69</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1/2/2013 15:21:11</td>\n",
" <td>1214</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Timestamp How many Facebook friends do you have?\n",
"0 1/2/2013 15:19:35 589\n",
"1 1/2/2013 15:19:48 241\n",
"2 1/2/2013 15:20:15 1116\n",
"3 1/2/2013 15:20:50 69\n",
"4 1/2/2013 15:21:11 1214"
]
},
"execution_count": 142,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"friends_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 145,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"584.7407407407408"
]
},
"execution_count": 145,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.mean(friends_df['How many Facebook friends do you have?'])"
]
},
{
"cell_type": "code",
"execution_count": 198,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"859.0\n",
"0.0314318975553\n",
"Squared sum: 4978411.18519\n",
"Variance: 184385.6\n",
"Population Standard Deviation: 429.4\n",
"One standard deviation above the mean friends: 1014.0\n",
"One standard deviation below the mean friends: 155.0\n",
"Proportion in between +1 std and -1 std: 31.8148\n"
]
}
],
"source": [
"q = []\n",
"for i in friends_df['How many Facebook friends do you have?']:\n",
" q.append(i - np.mean(friends_df['How many Facebook friends do you have?']))\n",
"\n",
"mean = np.mean(friends_df['How many Facebook friends do you have?'])\n",
"ss = sum(np.square(q))\n",
"variance = ss/len(friends_df)\n",
"std = np.std(friends_df['How many Facebook friends do you have?'])\n",
"in_between_one_dev = round(mean + std,0) - round(mean - std,0)\n",
"print (in_between_one_dev)\n",
"print ((friends_df['How many Facebook friends do you have?'].count() / in_between_one_dev))\n",
"proportion_in_between = round(in_between_one_dev / friends_df['How many Facebook friends do you have?'].count(),4)\n",
"\n",
"print (\"Squared sum:\", ss)\n",
"print (\"Variance:\", round(variance,2))\n",
"print (\"Population Standard Deviation:\", round(std,2))\n",
"print (\"One standard deviation above the mean friends:\", round(mean + std,0) )\n",
"print (\"One standard deviation below the mean friends:\", round(mean - std,0) )\n",
"print (\"Proportion in between +1 std and -1 std:\", proportion_in_between)"
]
},
{
"cell_type": "code",
"execution_count": 200,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<img src=\"https://lh6.ggpht.com/yOVq2JqeURBH8PKBRG33n9ngtvrAmR7wb-cY17PS025xSHYPjGZB1d2P9PWe-7jjpDf7PLlNCEp1CD-Vl2U=s889#w=1920&h=1080\"/>"
],
"text/plain": [
"<IPython.core.display.Image object>"
]
},
"execution_count": 200,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"Image(url='https://lh6.ggpht.com/yOVq2JqeURBH8PKBRG33n9ngtvrAmR7wb-cY17PS025xSHYPjGZB1d2P9PWe-7jjpDf7PLlNCEp1CD-Vl2U=s889#w=1920&h=1080')"
]
},
{
"cell_type": "code",
"execution_count": 223,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Timestamp</th>\n",
" <th>How many Facebook friends do you have?</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1/2/2013 15:19:35</td>\n",
" <td>589</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1/2/2013 15:19:48</td>\n",
" <td>241</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>1/2/2013 15:21:24</td>\n",
" <td>784</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>1/2/2013 15:26:34</td>\n",
" <td>408</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>1/2/2013 15:39:28</td>\n",
" <td>777</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>1/2/2013 15:46:07</td>\n",
" <td>822</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>1/2/2013 15:56:00</td>\n",
" <td>555</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>1/2/2013 16:17:02</td>\n",
" <td>366</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>1/2/2013 16:23:35</td>\n",
" <td>863</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>1/2/2013 16:28:28</td>\n",
" <td>258</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>1/2/2013 16:30:23</td>\n",
" <td>376</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>1/2/2013 16:35:58</td>\n",
" <td>256</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>1/2/2013 16:53:47</td>\n",
" <td>850</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>1/2/2013 16:59:09</td>\n",
" <td>240</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>1/2/2013 17:56:35</td>\n",
" <td>479</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>1/2/2013 20:04:27</td>\n",
" <td>174</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24</th>\n",
" <td>1/2/2013 23:18:28</td>\n",
" <td>322</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25</th>\n",
" <td>1/3/2013 10:20:51</td>\n",
" <td>600</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Timestamp How many Facebook friends do you have?\n",
"0 1/2/2013 15:19:35 589\n",
"1 1/2/2013 15:19:48 241\n",
"5 1/2/2013 15:21:24 784\n",
"6 1/2/2013 15:26:34 408\n",
"8 1/2/2013 15:39:28 777\n",
"9 1/2/2013 15:46:07 822\n",
"10 1/2/2013 15:56:00 555\n",
"12 1/2/2013 16:17:02 366\n",
"13 1/2/2013 16:23:35 863\n",
"15 1/2/2013 16:28:28 258\n",
"16 1/2/2013 16:30:23 376\n",
"17 1/2/2013 16:35:58 256\n",
"18 1/2/2013 16:53:47 850\n",
"20 1/2/2013 16:59:09 240\n",
"21 1/2/2013 17:56:35 479\n",
"23 1/2/2013 20:04:27 174\n",
"24 1/2/2013 23:18:28 322\n",
"25 1/3/2013 10:20:51 600"
]
},
"execution_count": 223,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Defining standard deviations\n",
"one_std_above = mean + std\n",
"one_std_below = mean - std\n",
"\n",
"# Filter the data\n",
"in_between = friends_df[(friends_df['How many Facebook friends do you have?'] > one_std_below) & \\\n",
" (friends_df['How many Facebook friends do you have?'] < one_std_above)]\n",
"\n",
"in_between"
]
},
{
"cell_type": "code",
"execution_count": 230,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"In between the deviations, the proportion is: 0.67\n",
"In between the deviations, the proportion percentage is: 66.67%\n"
]
}
],
"source": [
"sandwich_proportion = in_between['How many Facebook friends do you have?'].count()/ \\\n",
"friends_df['How many Facebook friends do you have?'].count()\n",
"\n",
"print (\"In between the deviations, the proportion is: %1.2f\" %(sandwich_proportion))\n",
"print (\"In between the deviations, the proportion percentage is: %1.2f%%\" %(sandwich_proportion * 100))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"answer.) **This is about exactly what we expected as it is between one standard deviation. That is where 68.2% of our data lies.**"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**b.)** What is the **sample standard deviation as per bessel's correction**?"
]
},
{
"cell_type": "code",
"execution_count": 238,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"437.58125334199735"
]
},
"execution_count": 238,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# ddof = 1 takes into account the bessel correction for sample standard deviation.\n",
"np.std(friends_df['How many Facebook friends do you have?'], ddof=1)"
]
},
{
"cell_type": "code",
"execution_count": 240,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"437.58125334199735"
]
},
"execution_count": 240,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def bessel_standard_deviation(a_list):\n",
" X = np.mean(a_list)\n",
" n = len(a_list)\n",
" \n",
" avg_dev = []\n",
" for i in a_list:\n",
" avg_dev.append( (abs(i-X))**2 )\n",
" \n",
" \n",
" variance = (sum(avg_dev)) / (len(avg_dev) - 1)\n",
" bessel_standard_deviation = np.sqrt(variance)\n",
" return bessel_standard_deviation\n",
"\n",
"bessel_standard_deviation(friends_df['How many Facebook friends do you have?'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"*My bessel_standard_deviation function proves that **ddof = 1** passed as argument to the np.std() function is equivalent to the bessel's correction equation for sample standard deviation*\n",
"\n",
"$$ S = \\sqrt{ \\frac{ \\Sigma(X[i] - \\bar{x})^2 } {n - \\bf{ddof} } }$$ "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### More BBC exercises"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**c.)** What is the **Standard deviation** of both the recognition scores and temporal scores?"
]
},
{
"cell_type": "code",
"execution_count": 243,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Recognition Score</th>\n",
" <th>Temporal Memory Score</th>\n",
" <th>Average Recognition</th>\n",
" <th>Average Temporal</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>91</td>\n",
" <td>86</td>\n",
" <td>93.115556</td>\n",
" <td>78.026667</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>95</td>\n",
" <td>78</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Recognition Score Temporal Memory Score Average Recognition \\\n",
"0 91 86 93.115556 \n",
"1 95 78 NaN \n",
"\n",
" Average Temporal \n",
"0 78.026667 \n",
"1 NaN "
]
},
"execution_count": 243,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"memory_df.head(2)"
]
},
{
"cell_type": "code",
"execution_count": 246,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Recognition Score 22.568709\n",
"Temporal Memory Score 12.779817\n",
"Average Recognition 0.000000\n",
"Average Temporal 0.000000\n",
"dtype: float64"
]
},
"execution_count": 246,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.std(memory_df, ddof = 0)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Note regarding **<em>VARIABILITY</em>**: *I notice that recognition score has a higher variability than the temporal memory score*. "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#Lesson 5: Standardization & Z-Scores"
]
},
{
"cell_type": "code",
"execution_count": 259,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<img src=\"https://lh5.ggpht.com/M5oM3YIdxVv2Sy8BE9lQMN4JFDZiYYAB2icCmAVRgaSY5Wuo9Nfq-vNxed0_BgLt-iEcRyoqqYDkUFBVHeY=s889#w=1920&h=1080\"/>"
],
"text/plain": [
"<IPython.core.display.Image object>"
]
},
"execution_count": 259,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"Image(url='https://lh5.ggpht.com/M5oM3YIdxVv2Sy8BE9lQMN4JFDZiYYAB2icCmAVRgaSY5Wuo9Nfq-vNxed0_BgLt-iEcRyoqqYDkUFBVHeY=s889#w=1920&h=1080')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Well, first let's identify the variables\n",
"$$ \\:Mean\\:of\\:Facebook\\:\\mu F=190 \\: \\\\\\:Mean\\:of\\:Twitter\\:\\mu T=208 \\:\\ \\\\\\:STD\\:of\\:Twitter\\:\\sigma T=60 \\: \\\\\\:STD\\:of\\:Facebook\\:\\sigma F=36 \\:$$\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"To answer how many standard deviations away is Katie's number of facebook friends we have to:\n",
"1. Identify the number of her friends (63)\n",
"2. Determine the difference of her friends and how many friends a facebook user has on average (190 - 63 = 127)\n",
"3. Divide the difference by the standard deviation (127 / 36 = ~3.5)\n",
"\n",
"**Conclusion**: Katie is **3.5 standard deviations below** the mean number of facebook friends."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"###Q.) How many standard deviations is Andy's number of Twitter followers from the mean number of Twitter followers?"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"To answer how many standard deviations away is Andys's number of twitter followers we have to:\n",
"1. Identify the number of his followers (54)\n",
"2. Determine the difference of her friends and how many followers a twitter user has on average (208 - 54 = 154)\n",
"3. Divide the difference by the standard deviation (154 / 60 = ~2.6)\n",
"\n",
"**Conclusion**: Andy is **2.6 standard deviations below** the mean number of twitter followers.\n",
"\n",
"Summarizing those steps gives us the z score: $$ z = \\frac{x - \\mu}\\sigma $$\n",
"**Restating our conclusion**: Andy has a -2.6 z score\n",
"\n",
"Z-score describes how many standard deviations, above or below, something is. This is a very good way to standardize any normal distribution.\n",
"\n",
"A useful way to standardize normal distribution is illustrated by the image below:"
]
},
{
"cell_type": "code",
"execution_count": 266,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<img src=\"https://www.mathsisfun.com/data/images/normal-distrubution-large.gif\"/>"
],
"text/plain": [
"<IPython.core.display.Image object>"
]
},
"execution_count": 266,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Standard Normal Distribution\n",
"Image(url = 'https://www.mathsisfun.com/data/images/normal-distrubution-large.gif')"
]
},
{
"cell_type": "code",
"execution_count": 275,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
" <iframe\n",
" width=\"400\"\n",
" height=\"300\"\n",
" src=\"https://www.youtube.com/embed/pftNHu4uevA?end=36&start=30\"\n",
" frameborder=\"0\"\n",
" allowfullscreen\n",
" ></iframe>\n",
" "
],
"text/plain": [
"<IPython.lib.display.YouTubeVideo at 0x104f0f1d0>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"\n",
" <iframe\n",
" width=\"400\"\n",
" height=\"300\"\n",
" src=\"https://www.youtube.com/embed/5B-vrhyedws\"\n",
" frameborder=\"0\"\n",
" allowfullscreen\n",
" ></iframe>\n",
" "
],
"text/plain": [
"<IPython.lib.display.YouTubeVideo at 0x104f0f438>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"(None, None)"
]
},
"execution_count": 275,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Z-Score Udacity Lesson 5: Standardizing Z-Score\n",
"zscore_video = YouTubeVideo(\"pftNHu4uevA\", start=30, end=36)\n",
"standard_normal_distribution_video = YouTubeVideo('5B-vrhyedws')\n",
"display(zscore_video), display(standard_normal_distribution_video)"
]
},
{
"cell_type": "code",
"execution_count": 252,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# How to use the z score\n",
"\n",
"from scipy import stats\n",
"# stats.zscore(a, axis=0, ddof=0)"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"source": [
"**Side note on a sweet list comprehension solution to vowel_remover function"
]
},
{
"cell_type": "code",
"execution_count": 312,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"'hll by'"
]
},
"execution_count": 312,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def vowel_remover(word):\n",
" word = word.lower()\n",
" vowels = ('a','e','i','o','u')\n",
" \n",
" # declare [variable] for [variable] in [something] if [variable] not in vowels\n",
" # print x for letter x in word unless it is a vowel in which case, join it with an empty string\n",
" return (''.join([x for x in word if x not in vowels]))\n",
"\n",
"h = 'HELLO BOY'\n",
"vowel_remover(h)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.4.3"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment