Created
June 6, 2020 03:36
-
-
Save AllieUbisse/4676dd20f33196de61f7e0750ab80c44 to your computer and use it in GitHub Desktop.
Outlier-Detection-3WAYS-Methods.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"name": "Outlier-Detection-3WAYS-Methods.ipynb", | |
"provenance": [], | |
"collapsed_sections": [ | |
"Y41mCGlZAk-s", | |
"lqLrIC7RBVVc", | |
"NQmEUGBABmwb", | |
"jwZuNSz0VG09", | |
"VvY2jmV4ZM6r", | |
"QTCBRuExaiqM", | |
"X8C_qMutc1V5" | |
], | |
"authorship_tag": "ABX9TyMhPr928L/rLGxIl7e68RkB", | |
"include_colab_link": true | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
} | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/AllieUbisse/4676dd20f33196de61f7e0750ab80c44/outlier-detection-3ways-methods.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "Y41mCGlZAk-s", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"# **1. Library Imports**" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "ODlzYMLK_6kL", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"import pandas as pd\n", | |
"import numpy as np\n", | |
"from scipy.stats import norm\n", | |
"\n", | |
"import matplotlib.pyplot as plt\n", | |
"%matplotlib inline\n", | |
"import seaborn as sns\n" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "lqLrIC7RBVVc", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"# **2. Import Dataset**\n", | |
"---\n", | |
"\n", | |
"source: [New York City Airbnb Open Data](https://www.kaggle.com/dgomonov/new-york-city-airbnb-open-data)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "npW-YRWCA9AW", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"bnb = pd.read_csv('/content/AB_NYC_2019.csv')\n" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "NQmEUGBABmwb", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"# **3. Outlier Detection Using Percentiles (Airbnb Prices)**\n", | |
"---\n", | |
"Assume that `price` is the listing price per night." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "ck-e-iC6BxsT", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 401 | |
}, | |
"outputId": "7be52dfb-d49f-40ba-9101-996d84a70834" | |
}, | |
"source": [ | |
"bnb.head()" | |
], | |
"execution_count": 4, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>id</th>\n", | |
" <th>name</th>\n", | |
" <th>host_id</th>\n", | |
" <th>host_name</th>\n", | |
" <th>neighbourhood_group</th>\n", | |
" <th>neighbourhood</th>\n", | |
" <th>latitude</th>\n", | |
" <th>longitude</th>\n", | |
" <th>room_type</th>\n", | |
" <th>price</th>\n", | |
" <th>minimum_nights</th>\n", | |
" <th>number_of_reviews</th>\n", | |
" <th>last_review</th>\n", | |
" <th>reviews_per_month</th>\n", | |
" <th>calculated_host_listings_count</th>\n", | |
" <th>availability_365</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>2539</td>\n", | |
" <td>Clean & quiet apt home by the park</td>\n", | |
" <td>2787</td>\n", | |
" <td>John</td>\n", | |
" <td>Brooklyn</td>\n", | |
" <td>Kensington</td>\n", | |
" <td>40.64749</td>\n", | |
" <td>-73.97237</td>\n", | |
" <td>Private room</td>\n", | |
" <td>149</td>\n", | |
" <td>1</td>\n", | |
" <td>9</td>\n", | |
" <td>2018-10-19</td>\n", | |
" <td>0.21</td>\n", | |
" <td>6</td>\n", | |
" <td>365</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>2595</td>\n", | |
" <td>Skylit Midtown Castle</td>\n", | |
" <td>2845</td>\n", | |
" <td>Jennifer</td>\n", | |
" <td>Manhattan</td>\n", | |
" <td>Midtown</td>\n", | |
" <td>40.75362</td>\n", | |
" <td>-73.98377</td>\n", | |
" <td>Entire home/apt</td>\n", | |
" <td>225</td>\n", | |
" <td>1</td>\n", | |
" <td>45</td>\n", | |
" <td>2019-05-21</td>\n", | |
" <td>0.38</td>\n", | |
" <td>2</td>\n", | |
" <td>355</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>3647</td>\n", | |
" <td>THE VILLAGE OF HARLEM....NEW YORK !</td>\n", | |
" <td>4632</td>\n", | |
" <td>Elisabeth</td>\n", | |
" <td>Manhattan</td>\n", | |
" <td>Harlem</td>\n", | |
" <td>40.80902</td>\n", | |
" <td>-73.94190</td>\n", | |
" <td>Private room</td>\n", | |
" <td>150</td>\n", | |
" <td>3</td>\n", | |
" <td>0</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>1</td>\n", | |
" <td>365</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>3831</td>\n", | |
" <td>Cozy Entire Floor of Brownstone</td>\n", | |
" <td>4869</td>\n", | |
" <td>LisaRoxanne</td>\n", | |
" <td>Brooklyn</td>\n", | |
" <td>Clinton Hill</td>\n", | |
" <td>40.68514</td>\n", | |
" <td>-73.95976</td>\n", | |
" <td>Entire home/apt</td>\n", | |
" <td>89</td>\n", | |
" <td>1</td>\n", | |
" <td>270</td>\n", | |
" <td>2019-07-05</td>\n", | |
" <td>4.64</td>\n", | |
" <td>1</td>\n", | |
" <td>194</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>5022</td>\n", | |
" <td>Entire Apt: Spacious Studio/Loft by central park</td>\n", | |
" <td>7192</td>\n", | |
" <td>Laura</td>\n", | |
" <td>Manhattan</td>\n", | |
" <td>East Harlem</td>\n", | |
" <td>40.79851</td>\n", | |
" <td>-73.94399</td>\n", | |
" <td>Entire home/apt</td>\n", | |
" <td>80</td>\n", | |
" <td>10</td>\n", | |
" <td>9</td>\n", | |
" <td>2018-11-19</td>\n", | |
" <td>0.10</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" id ... availability_365\n", | |
"0 2539 ... 365\n", | |
"1 2595 ... 355\n", | |
"2 3647 ... 365\n", | |
"3 3831 ... 194\n", | |
"4 5022 ... 0\n", | |
"\n", | |
"[5 rows x 16 columns]" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 4 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "ShT93U79HgPK", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 287 | |
}, | |
"outputId": "d1402857-e666-4975-a9dd-d59817420d2b" | |
}, | |
"source": [ | |
"bnb[['price', 'minimum_nights','number_of_reviews', 'reviews_per_month']].describe()" | |
], | |
"execution_count": 56, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>price</th>\n", | |
" <th>minimum_nights</th>\n", | |
" <th>number_of_reviews</th>\n", | |
" <th>reviews_per_month</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>count</th>\n", | |
" <td>48895.000000</td>\n", | |
" <td>48895.000000</td>\n", | |
" <td>48895.000000</td>\n", | |
" <td>38843.000000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>mean</th>\n", | |
" <td>152.720687</td>\n", | |
" <td>7.029962</td>\n", | |
" <td>23.274466</td>\n", | |
" <td>1.373221</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>std</th>\n", | |
" <td>240.154170</td>\n", | |
" <td>20.510550</td>\n", | |
" <td>44.550582</td>\n", | |
" <td>1.680442</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>min</th>\n", | |
" <td>0.000000</td>\n", | |
" <td>1.000000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>0.010000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>25%</th>\n", | |
" <td>69.000000</td>\n", | |
" <td>1.000000</td>\n", | |
" <td>1.000000</td>\n", | |
" <td>0.190000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>50%</th>\n", | |
" <td>106.000000</td>\n", | |
" <td>3.000000</td>\n", | |
" <td>5.000000</td>\n", | |
" <td>0.720000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>75%</th>\n", | |
" <td>175.000000</td>\n", | |
" <td>5.000000</td>\n", | |
" <td>24.000000</td>\n", | |
" <td>2.020000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>max</th>\n", | |
" <td>10000.000000</td>\n", | |
" <td>1250.000000</td>\n", | |
" <td>629.000000</td>\n", | |
" <td>58.500000</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" price minimum_nights number_of_reviews reviews_per_month\n", | |
"count 48895.000000 48895.000000 48895.000000 38843.000000\n", | |
"mean 152.720687 7.029962 23.274466 1.373221\n", | |
"std 240.154170 20.510550 44.550582 1.680442\n", | |
"min 0.000000 1.000000 0.000000 0.010000\n", | |
"25% 69.000000 1.000000 1.000000 0.190000\n", | |
"50% 106.000000 3.000000 5.000000 0.720000\n", | |
"75% 175.000000 5.000000 24.000000 2.020000\n", | |
"max 10000.000000 1250.000000 629.000000 58.500000" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 56 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "9m2kReN0DNzB", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 296 | |
}, | |
"outputId": "232ed44d-f219-4703-dfbf-081fc9f6c2dd" | |
}, | |
"source": [ | |
"sns.boxplot(x='price', data=bnb)" | |
], | |
"execution_count": 6, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"<matplotlib.axes._subplots.AxesSubplot at 0x7ff28eb262b0>" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 6 | |
}, | |
{ | |
"output_type": "display_data", | |
"data": { | |
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAWEAAAEGCAYAAAC0DiQ1AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+j8jraAAASKUlEQVR4nO3df4zUdX7H8deb3RX5cSqiMRyeXc2IgdS0J2sCtmk2FoTdbcQ/NOU07tpWKNAA1ZgGvU1cGv/w2qYpR9vj9KxCS09be6lGBANXNVqC5+4VVqsio8dVUOoyaSUeXV3WT/+Yz8x9Z5iBHXZm38Pu85Fs+M7n+5nv5/35foYX3/3O7mAhBAEAfEzyLgAAJjJCGAAcEcIA4IgQBgBHhDAAOGqspPNll10Wmpuba1QKAIxPfX19x0MIl5faV1EINzc3q7e3tzpVAcAEYWY/L7eP2xEA4IgQBgBHhDAAOCKEAcARIQwAjghhAHBECAOAI0IYABwRwgDgiBAGAEeEMAA4IoQBwBEhDACOCGEAcEQIA4AjQhgAHBHCAOCIEAYAR4QwADiq6P+YG63Nmzfr1VdflSTNnj1bqVRKa9euHcsSAKCujGkIp9NpDRzPSA2NGvifE2M5NADUpTENYUlSQ6OGp84c82EBoB5xTxgAHBHCAOCIEAYAR4QwADgihAHAESEMAI4IYQBwRAgDgCNCGAAcEcIA4IgQBgBHhDAAOCKEAcARIQwAjghhAHBECAOAI0IYABwRwgDgiBAGAEeEMAA4IoQBwBEhDACOCGEAcEQIA4AjQhgAHBHCAOCIEAYAR4QwADgihAHAESEMAI4IYQBwRAgDgCNCGAAcEcIA4IgQBgBHhDAAOCKEAcARIQwAjghhAHBECAOAI0IYABw1jsUgmzdvrqjf2rVra1kOANSNMQnhdDpd1X4AMF5wOwIAHBHCAOCIEAYAR4QwADgihAHAESEMAI4IYQBwRAgDgCNCGAAcEcIA4IgQBgBHhDAAOCKEAcARIQwAjghhAHBECAOAI0IYABwRwgDgiBAGAEeEMAA4IoQBwBEhDACOCGEAcEQIA4AjQhgAHBHCAOCIEAYAR4QwADgihAHAESEMAI4IYQBwRAgDgCNCGAAcEcIA4IgQBgBHhDAAOCKEAcARIQwAjuoqhA8cOKADBw6otbW1Zl+LFi3Kb3d0dBTsW7x4sZYsWaIlS5bojjvuKNj3+OOPq7W1VU888YQ6Ozvz7cuWLVNra6uef/55rVixQm1tberr68vvX7JkidLptDo6OpROp5XJZLRy5cp8v9WrV2vNmjVKp9Nat26dMpmMHnroIbW2turWW29Vb2+v2tratHLlynyfdDpd8Lw1a9Zo9erVymQyeuCBB9Ta2qoNGzYUnNtMJlPQr5xcrcnacv2T+3K1ZjKZ/HZvb69uvvlm9fX15dvT6bTuvfdetbe3K51On3XcM/UpZ/v27WptbdXTTz9dUE9y7sVt1VJp3eXO10j6V0u5mkcy1mjrqeVa1Ery9VULDT09PSPu/Nhjj/WsXLmy4kF27dqV3z726YBC01RJ0tdnTFdbW1t+31NPPVXxsSv11Vdf5beHhoZO2zc8PKzh4WGdPHmyYN9bb70lServ79dnn32Wb//iiy8kSfv27VMmk9GpU6e0d+9effnll5Kk4eFh9ff3a2BgQP39/Tp27Jj27t2b7/fxxx/n973//vsaHBzUnj178sfeu3evTp48qUwmk+/T39+vdDqdf96hQ4d0/PhxffHFF3r55ZclSUeOHNE999yTr3PLli16/fXX8/0WLlxY8vzcf//9GhgYKKgt1z+578iRIxocHNT+/fv12muvaXBwUE8++WS+5hMnTui1115Tf3+/PvzwQw0NDam/v1+33XbbGcc9U59y1q1bJ0nq7e3V4OBgvp7cHLds2XJaW7VUWneyluT5uvPOO8/av1q1l6t5JGONtp5ark
WtJF9fyb9Tldi4ceMnPT09j5XaVzdXwq2trd4ljEoIIb/9+eefF+w7fPhw/s8dO3aU7Hf48GGFEPTcc88VPLdUn9zxkseWdNpzc1fDmUym4B/CnTt3lrwSSafT+eMlx925c6f6+voK9oUQtHPnTu3atUshBL344ov553z++ed68cUXS9Za6ooxOW65PuVs37694PGOHTsUQtCuXbvyV+q5GnNt1VJp3claduzYUXC+Sl0N16L2cjWPZKzR1lPLtaiV4tdXLa6GxySEjx49qnQ6nV3wr4azAw+eUDqd1vr167V+/fqxKKMuDA8Pj9lY+/btkyRt3bq14Kp/aGhI27ZtO63/I488UvI4Q0NDevjhh0u2545b/F3FqVOnSh6r1BjFbeXqKOXxxx8veJw7v8PDw9q2bZu2bt2a/+4n11YtldadrKX4/JQ6v7WovVzNIxlrtPXUci1qpfj1tWXLlqqPcdYQNrOVZtZrZr0DAwNVLwC1t2fPnoIr9RCCdu/efVq/5FVrUgjhtKv7XHvyuCNRaozitnJ1VOLUqVPavXu39uzZkw+8XFu1VFp3spZipc5vLWovV/NIxhptPbVci/PZWUM4hPBYCKElhNBy+eWXn9Mgs2fPViqVUiqVkiY1SJK+uvAipVIpbdq0SZs2bTqn42JkFi1aJDPLPzYzLV68+LR+zc3NJZ9vZpo+fXrJ9uRxR6LUGMVt5eqoRGNjoxYvXqxFixapsbGxoK1aKq07WUuxUue3FrWXq3kkY422nlquxfmsbu4JTxQNDQ1jNtaCBQskSV1dXWpqasq3NzU1qbOz87T+3d3dJY/T1NSkjRs3lmzPHTd5fEllw6bUGMVt5eooZcWKFQWPc+e3oaFBnZ2d6urq0qRJkwraqqXSupO1FJ+fUue3FrWXq3kkY422nlquRa0Uv75WrVpV9THqJoRfeeUV7xJGJXlFWHxVk7vaaG5uVkdHR8l+zc3NMjMtW7as4Lml+iSvZpLbxc999NFHJUkzZ87U0qVL8+1tbW2aOXPmaXNIpVL54yXHbWtr0/z58wv2mZna2tq0dOlSmZna29vzz5k+fbra29tL1ppKpc44brk+5dx1110Fjzs6OmRmWrp0qWbOnJmfe7KtWiqtO1lLR0dHwfmaP3/+GftXq/ZyNY9krNHWU8u1qJXi19fy5curPkbdhPBYSV6BTJs2rWBfU1OTJk+erMmTJ6v41ktuMe6++25dddVV+faLL75YknTffffp2muv1ZQpUwquaiZPnqzu7m5NmzZN3d3d6urq0pw5c/L95s6dq3nz5qm7u1vXX3+9Ojs7ddNNN0mSLrroIvX09GjKlCmaM2dOvk93d3fB8+bNm6e5c+eqs7NTLS0tkn55FZzT1dVV0K+cXK3J2nL9k/tytXZ1deW3e3p6NGnSJG3cuDHf3t3drVQqpalTp57xSjF5jiqVu1pZtWpVQT3JuRe3VUuldZc7XyPpXy3lah7JWKOtp5ZrUSvJ11ctWCVvrLS0tITe3t6KB0n+9MP+t9/R8NTsv4Dzr7mi4H5wrh/3iAGMJ2bWF0JoKbVvwl0JA0A9IYQBwBEhDACOCGEAcEQIA4AjQhgAHBHCAOCIEAYAR4QwADgihAHAESEMAI4IYQBwRAgDgCNCGAAcEcIA4IgQBgBHhDAAOCKEAcARIQwAjghhAHBECAOAI0IYABwRwgDgiBAGAEeEMAA4IoQBwBEhDACOCGEAcEQIA4AjQhgAHBHCAOCIEAYAR4QwADgihAHAESEMAI4IYQBwRAgDgCNCGAAcNY7FIKlUSpKUTqdH1A8AJooxCeG1a9dKktavXz+ifgAwUXA7AgAcEcIA4IgQBgBHhDAAOCKEAcARIQwAjghhAHBECAOAI0IYABwRwgDgiBAGAEeEMAA4IoQBwBEhDACOCGEAcEQIA4AjQhgAHBHCAOCIEAYAR4QwADgihAHAESEMAI4IYQBwRAgDgCNCGAAcEcIA4IgQBgBHhDAAOC
KEAcARIQwAjghhAHBECAOAI0IYABwRwgDgiBAGAEeEMAA4IoQBwBEhDACOCGEAcEQIA4CjxjEfcfiUGk5mJJmkK8Z8eACoJ2MawqlUSkePHpUkzZ49W6lUaiyHB4C6YyGEEXduaWkJvb29NSwHAMYfM+sLIbSU2sc9YQBwRAgDgCNCGAAcEcIA4IgQBgBHhDAAOCKEAcARIQwAjghhAHBECAOAI0IYABwRwgDgiBAGAEeEMAA4IoQBwBEhDACOCGEAcEQIA4AjQhgAHBHCAOCoov/o08wGJP38HMe6TNLxc3zu+Yo5TwwTbc4Tbb7S6Of8KyGEy0vtqCiER8PMesv9b6PjFXOeGCbanCfafKXazpnbEQDgiBAGAEdjGcKPjeFY9YI5TwwTbc4Tbb5SDec8ZveEAQCn43YEADgihAHAUc1D2MyWmtlBM0ub2YZaj1dLZvYNM3vZzN4xs/80s/Wx/VIz221mh+KfM2K7mdl349z7zeyGxLG6Yv9DZtblNaeRMrMGM/sPM3shPr7azN6Ic3vGzC6I7ZPj43Tc35w4xoOx/aCZLfGZyciY2SVm9qyZvWdm75rZwvG+zmZ2X3xdv21mPzSzC8fbOpvZ35nZp2b2dqKtautqZvPN7K34nO+amZ21qBBCzb4kNUj6QNI1ki6QdEDSvFqOWeP5zJJ0Q9z+mqT3Jc2T9GeSNsT2DZK+E7fbJe2UZJIWSHojtl8q6cP454y4PcN7fmeZ+/2S/lHSC/HxP0laHre3SFodt9dI2hK3l0t6Jm7Pi+s/WdLV8XXR4D2vM8x3q6R74/YFki4Zz+ssabakn0makljfe8bbOkv6LUk3SHo70Va1dZX0k9jX4nPbzlpTjSe8UNJLiccPSnrQeyGqOL/nJC2WdFDSrNg2S9LBuP19Sd9K9D8Y939L0vcT7QX96u1L0pWSfizpZkkvxBfYcUmNxess6SVJC+N2Y+xnxWuf7FdvX5IujoFkRe3jdp1jCH8Ug6UxrvOS8bjOkpqLQrgq6xr3vZdoL+hX7qvWtyNyC5tzJLad9+K3X9+U9IakK0IIn8RdxyRdEbfLzf98Oy9/JelPJH0VH8+U9L8hhFPxcbL+/Nzi/s9i//NpzldLGpD0ZLwF8wMzm6ZxvM4hhKOS/kLSf0n6RNl169P4Xuecaq3r7Lhd3H5GvDF3DsxsuqR/kfTHIYQTyX0h+0/guPm5PzP7HUmfhhD6vGsZQ43Kfsv6vRDCNyX9QtlvU/PG4TrPkLRM2X+Avi5pmqSlrkU58FjXWofwUUnfSDy+Mradt8ysSdkA3h5C+FFs/m8zmxX3z5L0aWwvN//z6bz8hqRbzeywpKeVvSWxSdIlZtYY+yTrz88t7r9YUkbn15yPSDoSQngjPn5W2VAez+u8SNLPQggDIYQhST9Sdu3H8zrnVGtdj8bt4vYzqnUIvynp2vgO6wXK3sB/vsZj1kx8p/MJSe+GEP4yset5Sbl3SLuUvVeca++M77IukPRZ/LbnJUm3mNmMeAVyS2yrOyGEB0MIV4YQmpVdv38LIdwl6WVJt8duxXPOnYvbY/8Q25fHd9WvlnStsm9i1J0QwjFJH5nZdbHptyW9o3G8zsrehlhgZlPj6zw353G7zglVWde474SZLYjnsDNxrPLG4CZ4u7I/RfCBpG9735Qf5Vx+U9lvVfol7Y9f7creC/uxpEOS9ki6NPY3SX8T5/6WpJbEsX5fUjp+/Z733EY4/1b98qcjrlH2L1da0j9LmhzbL4yP03H/NYnnfzuei4MawbvGznP9dUm9ca3/Vdl3wcf1OkvaKOk9SW9L+ntlf8JhXK2zpB8qe897SNnveP6gmusqqSWevw8k/bWK3twt9cWvLQOAI96YAwBHhDAAOCKEAcARIQwAjghhAHBECOO8Z2Z/amaLvOsAzgU/oobzmpk1hBCGvesAzhVXwqhbZtYcP893e/xM32fjb3QdNrPvmNlPJd1hZk+Z2e3xOTea2V4zO2BmPz
Gzr1n2s5D/3MzejJ8L+4fOUwPyCGHUu+sk/W0IYa6kE8p+jq0kZUIIN4QQns51jL8a/4yk9SGEX1P28xD+T9nfivoshHCjpBslrYi/Ugu4I4RR7z4KIfx73P4HZX91XMqGbbHrJH0SQnhTkkIIJ0L2YxZvUfYzAPYr+9GjM5X9TAPAXePZuwCuit+0yD3+RQXHMElrQwj1+uE5mMC4Eka9u8rMFsbtOyW9foa+ByXNMrMbJSneD25U9lOvVsePIZWZzYkf0g64I4RR7w5K+iMze1fZTzL7XrmOIYQvJf2upM1mdkDSbmU/7esHyn4s40/jf/D4ffFdIOoEP6KGuhX/C6kXQgi/6lwKUDNcCQOAI66EAcARV8IA4IgQBgBHhDAAOCKEAcARIQwAjv4f/hBNri7JG6gAAAAASUVORK5CYII=\n", | |
"text/plain": [ | |
"<Figure size 432x288 with 1 Axes>" | |
] | |
}, | |
"metadata": { | |
"tags": [], | |
"needs_background": "light" | |
} | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "74O8VjkXJf8T", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 296 | |
}, | |
"outputId": "871b0ff0-a8a3-451f-e045-e73d06db1eed" | |
}, | |
"source": [ | |
"sns.distplot(bnb.price, bins=10, )" | |
], | |
"execution_count": 97, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"<matplotlib.axes._subplots.AxesSubplot at 0x7ff28dc05588>" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 97 | |
}, | |
{ | |
"output_type": "display_data", | |
"data": { | |
"image/png": "\n", | |
"text/plain": [ | |
"<Figure size 432x288 with 1 Axes>" | |
] | |
}, | |
"metadata": { | |
"tags": [], | |
"needs_background": "light" | |
} | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "6YWWbaXfDrYx", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
}, | |
"outputId": "11bedaf4-d910-4e35-bf99-b7f993a897cb" | |
}, | |
"source": [ | |
"# set threshold values\n", | |
"min_price_threshold, max_price_treshold = bnb.price.quantile([0.01, 0.90])\n", | |
"min_price_threshold, max_price_treshold" | |
], | |
"execution_count": 90, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"(30.0, 269.0)" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 90 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "Vsbg4MN9ERnS", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
}, | |
"outputId": "84c31a19-936b-4ca0-b117-72684d42ef2d" | |
}, | |
"source": [ | |
"# count the listings priced below the lower threshold\n", | |
"bnb[bnb.price<min_price_threshold].price.value_counts().sum()" | |
], | |
"execution_count": 91, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"404" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 91 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "8uMQU4srFSHf", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
}, | |
"outputId": "53647172-8f60-46f8-c1cb-0fb53a5b966f" | |
}, | |
"source": [ | |
"# count the listings priced above the upper threshold\n", | |
"bnb[bnb.price>max_price_treshold].price.value_counts().sum()" | |
], | |
"execution_count": 92, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"4878" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 92 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "-PLMXlcBGFgO", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"bnb_no_outliers = bnb[(bnb.price>min_price_threshold) & (bnb.price<max_price_treshold) ]" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "gMKUAoMYGrr-", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
}, | |
"outputId": "1fd82e55-3b9a-4041-f2f4-2ce558818a5c" | |
}, | |
"source": [ | |
"bnb.shape, bnb_no_outliers.shape" | |
], | |
"execution_count": 94, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"((48895, 16), (43325, 16))" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 94 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "nPgHbSDUG3kI", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 296 | |
}, | |
"outputId": "d51e32ba-ebeb-496d-d0e6-b3ab0a9c0039" | |
}, | |
"source": [ | |
"sns.boxplot(x='price', data=bnb_no_outliers)" | |
], | |
"execution_count": 95, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"<matplotlib.axes._subplots.AxesSubplot at 0x7ff28d8a1a90>" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 95 | |
}, | |
{ | |
"output_type": "display_data", | |
"data": { | |
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAWAAAAEGCAYAAABbzE8LAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+j8jraAAAJ+klEQVR4nO3dX4jl91nH8c+TXW03GrHpliWM1W0drYqgLhupUHpVlOYmCoriRXshVvwzrBdeVHpTvKuiEAcVoxarFiv4B4tUahVBtNh2N2zStJvYo7bUIU3SBpLirq1Nv16c3+IwzGx2kjPnmTPn9YJhzv7m7JznO/vdN+f8zsyZGmMEgOW7o3sAgHUlwABNBBigiQADNBFggCanD3Pls2fPjvPnzx/RKAAn05UrVz4/xnjV3uOHCvD58+dz+fLlxU0FsAaq6jP7HXcKAqCJAAM0EWCAJgIM0ESAAZoIMEATAQZoIsAATQQYoIkAAzQRYIAmAgzQRIABmggwQBMBBmgiwABNBBigiQADNBFggCaH+p1w62J7ezuz2ax7jEPZ2dlJkmxsbDRP0mdzczNbW1vdY8BtE+B9zGazXH30Wp6/8+7uUW7bqevPJkk+96X1/Cc9df2Z7hHg0Nbzf+tteP7Ou3PjO+7rHuO2nXnsA0myUjMv0s31wypxDhigiQADNBFggCYCDNBEgAGaCDBAEwEGaCLAAE0EGKCJAAM0EWCAJgIM0ESAAZoIMEATAQZoIsAATQQYoIkAAzQRYIAmAgzQRIABmggwQBMBBmgiwABNBBigiQADNBFggCYCDNBEgAGaCDBAEwEGaCLAAE0EGKCJAAM0EWCAJgIM0ESAAZoIMEATAQZoIsAATQQYoMlSAry9vZ3t7e1l3BTAQh1lv04fyWfdYzabLeNmABbuKPvlFARAEwEGaCLAAE0EGKCJAAM0EWCAJgIM0ESAAZoIMEATAQZoIsAATQQYoIkAAzQRYIAmAgzQRIABmggwQBMBBmgiwABNBBigiQADNBFggCYCDNBEgAGaCDBAEwEGaCLAAE0EGKCJAAM0EWCAJgIM0ESAAZoIMEATAQZoIsAATQQYoIkAAzQRYIAmAgzQRIABmggwQJPTy7iRnZ2d3LhxI5cuXVrGzb1ks9ksd3x5dI/BIdzxP89lNvviyuwxVsdsNsuZM2eO5HO/4D3gqnpbVV2uqstPP/30kQwBsI5e8B7wGOPBJA8mycWLF1/U3cKNjY0kyQMPPPBi/vrSXbp0KVf+48nuMTiEr778G7L52nMrs8dYHUf5qMo5YIAmAgzQRIABmggwQBMBBmgiwABNBBigiQADNBFggCYCDNBEgAGaCDBAEwEGaCLAAE0EGKCJAAM0EWCAJgIM0ESAAZoIMEATAQZoIsAATQQYoIkAAzQRYIAmAgzQRIABmggwQBMBBmgiwABNBBigiQADNBFggCYCDNBEgAGaCDBAEwEGaCLAAE0EGKCJAAM0EWCAJgIM0OT0Mm5kc3NzGTcDsHBH2a+lBHhra2sZNwOwcEfZL6cgAJoIMEATAQZoIsAATQQYoIkAAzQRYIAmAgzQRIABmggwQBMBBmgiwABNBBigiQADNBFggCYCDNBEgAGaCDBAEwEGaCLAAE0EGKCJAAM0EWCAJgIM0ESAAZoIMEATAQZoIsAATQQYoIkAAzQRYIAmAgzQRIABmggwQBMBBmgiwABNBBigiQADNBFggCYCDNDkdPcAx9Wp68/kzGMf6B7jtp26/oUkWamZF+nU9WeSnOseAw5FgPexubnZPcKh7ex8JUmysbGuETq3kv9urDcB3sfW1lb3CMAacA4YoIkAAzQRYIAmAgzQRIABmggwQBMBBmgiwABNBBigiQADNBFggCYCDNBEgAGaCDBAEwEGaCLAAE0EGKCJAAM0EWCAJgIM0KTGGLd/5aqnk3zm6MZ5Sc4m+Xz3EM3W/Wtg/dZ/XNf/LW
OMV+09eKgAH2dVdXmMcbF7jk7r/jWwfutftfU7BQHQRIABmpykAD/YPcAxsO5fA+tfbyu3/hNzDhhg1Zyke8AAK0WAAZqsbICr6tNV9fGqulpVl6djd1fVh6rqU9P7V3TPuShV9e6qeqqqHt11bN/11txvVtWsqh6pqgt9ky/GAet/Z1XtTHvgalXdt+tjvzyt//Gq+qGeqRenql5dVf9YVZ+sqk9U1aXp+FrsgVusf7X3wBhjJd+SfDrJ2T3HfjXJ26fLb0/yru45F7jeNya5kOTRF1pvkvuS/G2SSvL6JB/pnv+I1v/OJL+0z3W/K8nDSV6W5DVJ/j3Jqe41vMT135PkwnT5riT/Nq1zLfbALda/0ntgZe8BH+D+JO+ZLr8nyQ83zrJQY4x/SvLMnsMHrff+JH805v41yTdW1T3LmfRoHLD+g9yf5H1jjC+NMf4zySzJ9x/ZcEswxnhijPHQdPmLSa4l2cia7IFbrP8gK7EHVjnAI8nfVdWVqnrbdOzcGOOJ6fLnkpzrGW1pDlrvRpLP7rref+XWm3WV/cL0EPvdu045nej1V9X5JN+X5CNZwz2wZ/3JCu+BVQ7wG8YYF5K8OcnPV9Ubd39wzB+HrM332K3beie/k+Rbk3xvkieS/HrvOEevqr4+yV8k+cUxxnO7P7YOe2Cf9a/0HljZAI8xdqb3TyX5q8wfXjx582HW9P6pvgmX4qD17iR59a7rfdN07EQZYzw5xnh+jPHVJL+X/3+IeSLXX1Vfk3l83jvG+Mvp8Nrsgf3Wv+p7YCUDXFVfV1V33byc5AeTPJrk/UneOl3trUn+umfCpTlove9P8pbpmfDXJ3l218PUE2PPOc0fyXwPJPP1/0RVvayqXpPk25J8dNnzLVJVVZI/SHJtjPEbuz60FnvgoPWv/B7ofhbwxbwleW3mz3A+nOQTSd4xHX9lkn9I8qkkf5/k7u5ZF7jmP838Idb/Zn4+66cOWm/mz3z/VubP/H48ycXu+Y9o/X88re+RzP/D3bPr+u+Y1v94kjd3z7+A9b8h89MLjyS5Or3dty574BbrX+k94EeRAZqs5CkIgJNAgAGaCDBAEwEGaCLAAE0EmJVXVb9SVW/qngMOy7ehsdKq6tQY4/nuOeDFcA+YY6uqzlfVY1X13qq6VlV/XlV3Tq8F/a6qeijJj1XVH1bVj05/596q+nBVPVxVH62qu6rqVFX9WlV9bHrRlp9pXhokEWCOv9cl+e0xxncmeS7Jz03HvzDGuDDGeN/NK1bV1yb5sySXxhjfk+RNSW5k/lNzz44x7k1yb5Kfnn48FVoJMMfdZ8cY/zJd/pPMfyQ1mYd2r9cleWKM8bEkGWM8N8b4SuavFfKWqrqa+UsYvjLz1waAVqe7B4AXsPdJipt//u9DfI5KsjXG+OBiRoLFcA+Y4+6bq+oHpss/meSfb3Hdx5PcU1X3Jsl0/vd0kg8m+dnp5QxTVd8+vYoetBJgjrvHM3/B/WtJXpH5C3Dva4zx5SQ/nmS7qh5O8qEkL0/y+0k+meSh6Zd6/m48+uMY8G1oHFvTr575mzHGdzePAkfCPWCAJu4BAzRxDxigiQADNBFggCYCDNBEgAGa/B8quxjd1RvOEQAAAABJRU5ErkJggg==\n", | |
"text/plain": [ | |
"<Figure size 432x288 with 1 Axes>" | |
] | |
}, | |
"metadata": { | |
"tags": [], | |
"needs_background": "light" | |
} | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "Km5wR5QtJrsj", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 296 | |
}, | |
"outputId": "39966afb-6ad3-45a0-9d6e-322acaf66197" | |
}, | |
"source": [ | |
"sns.distplot(bnb_no_outliers.price, bins=10)" | |
], | |
"execution_count": 96, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"<matplotlib.axes._subplots.AxesSubplot at 0x7ff28d7cedd8>" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 96 | |
}, | |
{ | |
"output_type": "display_data", | |
"data": { | |
"image/png": "\n", | |
"text/plain": [ | |
"<Figure size 432x288 with 1 Axes>" | |
] | |
}, | |
"metadata": { | |
"tags": [], | |
"needs_background": "light" | |
} | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "YtNwvtBsTZSG", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "Nm-NTqEFTa9-", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"# **Exercise 2**\n", | |
"---\n", | |
"source: [house prices](https://raw.githubusercontent.com/codebasics/py/master/ML/FeatureEngineering/2_outliers_z_score/Exercise/bhp.csv)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "jwZuNSz0VG09", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"### Import dataset" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "CXc1NgdXTzsW", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 197 | |
}, | |
"outputId": "a9b0f9f5-6532-4e86-9cf0-4efe1cb2d6a3" | |
}, | |
"source": [ | |
"house = pd.read_csv('https://raw.githubusercontent.com/codebasics/py/master/ML/FeatureEngineering/2_outliers_z_score/Exercise/bhp.csv')\n", | |
"house.head()" | |
], | |
"execution_count": 99, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>location</th>\n", | |
" <th>size</th>\n", | |
" <th>total_sqft</th>\n", | |
" <th>bath</th>\n", | |
" <th>price</th>\n", | |
" <th>bhk</th>\n", | |
" <th>price_per_sqft</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>Electronic City Phase II</td>\n", | |
" <td>2 BHK</td>\n", | |
" <td>1056.0</td>\n", | |
" <td>2.0</td>\n", | |
" <td>39.07</td>\n", | |
" <td>2</td>\n", | |
" <td>3699</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>Chikka Tirupathi</td>\n", | |
" <td>4 Bedroom</td>\n", | |
" <td>2600.0</td>\n", | |
" <td>5.0</td>\n", | |
" <td>120.00</td>\n", | |
" <td>4</td>\n", | |
" <td>4615</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>Uttarahalli</td>\n", | |
" <td>3 BHK</td>\n", | |
" <td>1440.0</td>\n", | |
" <td>2.0</td>\n", | |
" <td>62.00</td>\n", | |
" <td>3</td>\n", | |
" <td>4305</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>Lingadheeranahalli</td>\n", | |
" <td>3 BHK</td>\n", | |
" <td>1521.0</td>\n", | |
" <td>3.0</td>\n", | |
" <td>95.00</td>\n", | |
" <td>3</td>\n", | |
" <td>6245</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>Kothanur</td>\n", | |
" <td>2 BHK</td>\n", | |
" <td>1200.0</td>\n", | |
" <td>2.0</td>\n", | |
" <td>51.00</td>\n", | |
" <td>2</td>\n", | |
" <td>4250</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" location size total_sqft ... price bhk price_per_sqft\n", | |
"0 Electronic City Phase II 2 BHK 1056.0 ... 39.07 2 3699\n", | |
"1 Chikka Tirupathi 4 Bedroom 2600.0 ... 120.00 4 4615\n", | |
"2 Uttarahalli 3 BHK 1440.0 ... 62.00 3 4305\n", | |
"3 Lingadheeranahalli 3 BHK 1521.0 ... 95.00 3 6245\n", | |
"4 Kothanur 2 BHK 1200.0 ... 51.00 2 4250\n", | |
"\n", | |
"[5 rows x 7 columns]" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 99 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "wkFbg0N7VXUV", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 264 | |
}, | |
"outputId": "67e45da5-7a8d-416c-a80b-a83a37648b3c" | |
}, | |
"source": [ | |
"plt.hist(house.price, bins=20, rwidth=0.8, density=True)\n", | |
"\n", | |
"rng = np.arange(house.price.min(), house.price.max(), 0.1)\n", | |
"plt.plot(rng, norm.pdf(rng,house.price.mean(), house.price.std()))\n", | |
"plt.show()" | |
], | |
"execution_count": 126, | |
"outputs": [ | |
{ | |
"output_type": "display_data", | |
"data": { | |
"image/png": "\n", | |
"text/plain": [ | |
"<Figure size 432x288 with 1 Axes>" | |
] | |
}, | |
"metadata": { | |
"tags": [], | |
"needs_background": "light" | |
} | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "VvY2jmV4ZM6r", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"## **Percentile**" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "3vGoC_a1ZKGk", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
}, | |
"outputId": "ae5b9621-7977-469a-d519-fb7ee5c73977" | |
}, | |
"source": [ | |
"min_house_price_threshold, max_house_price_threshold = house.price.quantile([0.001, 0.999])\n", | |
"min_house_price_threshold, max_house_price_threshold" | |
], | |
"execution_count": 128, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"(11.5, 2000.0)" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 128 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "Plo0n0XcZ0hN", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
}, | |
"outputId": "f310fb5b-df0b-4acf-e180-feaeced8901b" | |
}, | |
"source": [ | |
"house_no_outliers = house[(house.price>min_house_price_threshold) & (house.price<max_house_price_threshold)]\n", | |
"house.shape, house_no_outliers.shape" | |
], | |
"execution_count": 129, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"((13200, 7), (13169, 7))" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 129 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "R-bgC2OhaTmV", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
}, | |
"outputId": "8fb79e04-452a-4676-9c8e-d753fc19c92b" | |
}, | |
"source": [ | |
"# number of rows dropped as outliers by the percentile filter\n", | |
"house.shape[0] - house_no_outliers.shape[0]" | |
], | |
"execution_count": 131, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"31" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 131 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "LsY-wHmWbNEW", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 264 | |
}, | |
"outputId": "e14874e0-b38f-4ce7-8bab-21a54e3b4290" | |
}, | |
"source": [ | |
"plt.hist(house_no_outliers.price, bins=20, rwidth=0.8, density=True)\n", | |
"\n", | |
"rng = np.arange(house_no_outliers.price.min(), house_no_outliers.price.max(), 0.1)\n", | |
"plt.plot(rng, norm.pdf(rng,house_no_outliers.price.mean(), house_no_outliers.price.std()))\n", | |
"plt.show()" | |
], | |
"execution_count": 132, | |
"outputs": [ | |
{ | |
"output_type": "display_data", | |
"data": { | |
"image/png": "\n", | |
"text/plain": [ | |
"<Figure size 432x288 with 1 Axes>" | |
] | |
}, | |
"metadata": { | |
"tags": [], | |
"needs_background": "light" | |
} | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "QTCBRuExaiqM", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"## **Mean ± 4 Standard Deviations**" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "7eVhLOkSbHO1", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"# Set house price bounds at mean ± 4 standard deviations (a std-based rule, not an interquartile-range rule)\n", | |
"upper_bound = house.price.mean() + 4*house.price.std()\n", | |
"lower_bound = house.price.mean() - 4*house.price.std()" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "FP7O-13aahRv", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
}, | |
"outputId": "e6e534d1-1e0c-4a5a-88c5-9ec52bb421dd" | |
}, | |
"source": [ | |
"house_no_outliers_iqr = house[(house.price>lower_bound) & (house.price<upper_bound)]\n", | |
"house.shape, house_no_outliers_iqr.shape" | |
], | |
"execution_count": 137, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"((13200, 7), (13093, 7))" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 137 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "7eqGL0R9cVQk", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
}, | |
"outputId": "b0e7cb60-2b28-4ec7-e99e-4c82e39769c8" | |
}, | |
"source": [ | |
"house.shape[0] - house_no_outliers_iqr.shape[0]" | |
], | |
"execution_count": 138, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"107" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 138 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "aiUXHS4zclLt", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 267 | |
}, | |
"outputId": "ec5fe407-b3d1-476f-c5ee-05ebcfc90f4c" | |
}, | |
"source": [ | |
"plt.hist(house_no_outliers_iqr.price, bins=20, rwidth=0.8, density=True)\n", | |
"\n", | |
"rng = np.arange(house_no_outliers_iqr.price.min(), house_no_outliers_iqr.price.max(), 0.1)\n", | |
"plt.plot(rng, norm.pdf(rng,house_no_outliers_iqr.price.mean(), house_no_outliers_iqr.price.std()))\n", | |
"plt.show()" | |
], | |
"execution_count": 139, | |
"outputs": [ | |
{ | |
"output_type": "display_data", | |
"data": { | |
"image/png": "\n", | |
"text/plain": [ | |
"<Figure size 432x288 with 1 Axes>" | |
] | |
}, | |
"metadata": { | |
"tags": [], | |
"needs_background": "light" | |
} | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "X8C_qMutc1V5", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"## **Z-Score**" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "MDl9sRuuc6t3", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 197 | |
}, | |
"outputId": "776cb13f-5e1c-48e7-e435-6010288c7c50" | |
}, | |
"source": [ | |
"# Calculate the z-score of each house price: (price - mean) / std, using pandas' default sample std (ddof=1)\n", | |
"house['price_Zscore'] = (house.price - house.price.mean()) / house.price.std()\n", | |
"house[['price','price_Zscore']].head()" | |
], | |
"execution_count": 143, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>price</th>\n", | |
" <th>price_Zscore</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>39.07</td>\n", | |
" <td>-0.490737</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>120.00</td>\n", | |
" <td>0.051777</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>62.00</td>\n", | |
" <td>-0.337026</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>95.00</td>\n", | |
" <td>-0.115811</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>51.00</td>\n", | |
" <td>-0.410764</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" price price_Zscore\n", | |
"0 39.07 -0.490737\n", | |
"1 120.00 0.051777\n", | |
"2 62.00 -0.337026\n", | |
"3 95.00 -0.115811\n", | |
"4 51.00 -0.410764" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 143 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "G6sNLgW6eBc1", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
}, | |
"outputId": "0834b913-5e29-4c92-f014-ae57783a81a8" | |
}, | |
"source": [ | |
"house_no_outliers_zscore = house[(house.price_Zscore>-4) & (house.price_Zscore<4)]\n", | |
"house.shape, house_no_outliers_zscore.shape" | |
], | |
"execution_count": 144, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"((13200, 8), (13093, 8))" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 144 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "FV0D65e6eqFl", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
}, | |
"outputId": "51744695-d42b-4303-85a5-963bcb3297da" | |
}, | |
"source": [ | |
"house.shape[0] - house_no_outliers_zscore.shape[0]" | |
], | |
"execution_count": 145, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"107" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 145 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "Gr7ituEye3Z-", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 267 | |
}, | |
"outputId": "86109545-edec-4f59-82b0-63ee001858ec" | |
}, | |
"source": [ | |
"plt.hist(house_no_outliers_zscore.price, bins=20, rwidth=0.8, density=True)\n", | |
"\n", | |
"rng = np.arange(house_no_outliers_zscore.price.min(), house_no_outliers_zscore.price.max(), 0.1)\n", | |
"plt.plot(rng, norm.pdf(rng,house_no_outliers_zscore.price.mean(), house_no_outliers_zscore.price.std()))\n", | |
"plt.show()" | |
], | |
"execution_count": 146, | |
"outputs": [ | |
{ | |
"output_type": "display_data", | |
"data": { | |
"image/png": "\n", | |
"text/plain": [ | |
"<Figure size 432x288 with 1 Axes>" | |
] | |
}, | |
"metadata": { | |
"tags": [], | |
"needs_background": "light" | |
} | |
} | |
] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Good job.