Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save devops-school/aecdcaf418c1811fb24c252a6d12515e to your computer and use it in GitHub Desktop.
Save devops-school/aecdcaf418c1811fb24c252a6d12515e to your computer and use it in GitHub Desktop.
Machine Learning – scikit-learn – Lab 4
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>longitude</th>\n",
" <th>latitude</th>\n",
" <th>housing_median_age</th>\n",
" <th>total_rooms</th>\n",
" <th>total_bedrooms</th>\n",
" <th>population</th>\n",
" <th>households</th>\n",
" <th>median_income</th>\n",
" <th>median_house_value</th>\n",
" <th>ocean_proximity</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>10972</th>\n",
" <td>-117.84</td>\n",
" <td>33.76</td>\n",
" <td>14.0</td>\n",
" <td>1458.0</td>\n",
" <td>423.0</td>\n",
" <td>615.0</td>\n",
" <td>365.0</td>\n",
" <td>4.2798</td>\n",
" <td>218800.0</td>\n",
" <td>&lt;1H OCEAN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19289</th>\n",
" <td>-122.82</td>\n",
" <td>38.41</td>\n",
" <td>32.0</td>\n",
" <td>701.0</td>\n",
" <td>182.0</td>\n",
" <td>489.0</td>\n",
" <td>168.0</td>\n",
" <td>2.7850</td>\n",
" <td>169300.0</td>\n",
" <td>&lt;1H OCEAN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14111</th>\n",
" <td>-117.10</td>\n",
" <td>32.74</td>\n",
" <td>14.0</td>\n",
" <td>2361.0</td>\n",
" <td>601.0</td>\n",
" <td>1831.0</td>\n",
" <td>526.0</td>\n",
" <td>1.6102</td>\n",
" <td>93400.0</td>\n",
" <td>NEAR OCEAN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3253</th>\n",
" <td>-120.12</td>\n",
" <td>36.01</td>\n",
" <td>18.0</td>\n",
" <td>1165.0</td>\n",
" <td>334.0</td>\n",
" <td>1119.0</td>\n",
" <td>308.0</td>\n",
" <td>2.2167</td>\n",
" <td>48500.0</td>\n",
" <td>INLAND</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19941</th>\n",
" <td>-119.46</td>\n",
" <td>36.25</td>\n",
" <td>32.0</td>\n",
" <td>1702.0</td>\n",
" <td>348.0</td>\n",
" <td>1016.0</td>\n",
" <td>350.0</td>\n",
" <td>2.5000</td>\n",
" <td>73600.0</td>\n",
" <td>INLAND</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" longitude latitude housing_median_age total_rooms total_bedrooms \\\n",
"10972 -117.84 33.76 14.0 1458.0 423.0 \n",
"19289 -122.82 38.41 32.0 701.0 182.0 \n",
"14111 -117.10 32.74 14.0 2361.0 601.0 \n",
"3253 -120.12 36.01 18.0 1165.0 334.0 \n",
"19941 -119.46 36.25 32.0 1702.0 348.0 \n",
"\n",
" population households median_income median_house_value \\\n",
"10972 615.0 365.0 4.2798 218800.0 \n",
"19289 489.0 168.0 2.7850 169300.0 \n",
"14111 1831.0 526.0 1.6102 93400.0 \n",
"3253 1119.0 308.0 2.2167 48500.0 \n",
"19941 1016.0 350.0 2.5000 73600.0 \n",
"\n",
" ocean_proximity \n",
"10972 <1H OCEAN \n",
"19289 <1H OCEAN \n",
"14111 NEAR OCEAN \n",
"3253 INLAND \n",
"19941 INLAND "
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"housing_data = pd.read_csv('datasets/housing.csv')\n",
"\n",
"housing_data.sample(5)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"housing_data = housing_data.dropna()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(20433, 10)"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"housing_data.shape"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>longitude</th>\n",
" <th>latitude</th>\n",
" <th>housing_median_age</th>\n",
" <th>total_rooms</th>\n",
" <th>total_bedrooms</th>\n",
" <th>population</th>\n",
" <th>households</th>\n",
" <th>median_income</th>\n",
" <th>median_house_value</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>20433.000000</td>\n",
" <td>20433.000000</td>\n",
" <td>20433.000000</td>\n",
" <td>20433.000000</td>\n",
" <td>20433.000000</td>\n",
" <td>20433.000000</td>\n",
" <td>20433.000000</td>\n",
" <td>20433.000000</td>\n",
" <td>20433.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>-119.570689</td>\n",
" <td>35.633221</td>\n",
" <td>28.633094</td>\n",
" <td>2636.504233</td>\n",
" <td>537.870553</td>\n",
" <td>1424.946949</td>\n",
" <td>499.433465</td>\n",
" <td>3.871162</td>\n",
" <td>206864.413155</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>2.003578</td>\n",
" <td>2.136348</td>\n",
" <td>12.591805</td>\n",
" <td>2185.269567</td>\n",
" <td>421.385070</td>\n",
" <td>1133.208490</td>\n",
" <td>382.299226</td>\n",
" <td>1.899291</td>\n",
" <td>115435.667099</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>-124.350000</td>\n",
" <td>32.540000</td>\n",
" <td>1.000000</td>\n",
" <td>2.000000</td>\n",
" <td>1.000000</td>\n",
" <td>3.000000</td>\n",
" <td>1.000000</td>\n",
" <td>0.499900</td>\n",
" <td>14999.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>-121.800000</td>\n",
" <td>33.930000</td>\n",
" <td>18.000000</td>\n",
" <td>1450.000000</td>\n",
" <td>296.000000</td>\n",
" <td>787.000000</td>\n",
" <td>280.000000</td>\n",
" <td>2.563700</td>\n",
" <td>119500.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>-118.490000</td>\n",
" <td>34.260000</td>\n",
" <td>29.000000</td>\n",
" <td>2127.000000</td>\n",
" <td>435.000000</td>\n",
" <td>1166.000000</td>\n",
" <td>409.000000</td>\n",
" <td>3.536500</td>\n",
" <td>179700.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>-118.010000</td>\n",
" <td>37.720000</td>\n",
" <td>37.000000</td>\n",
" <td>3143.000000</td>\n",
" <td>647.000000</td>\n",
" <td>1722.000000</td>\n",
" <td>604.000000</td>\n",
" <td>4.744000</td>\n",
" <td>264700.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>-114.310000</td>\n",
" <td>41.950000</td>\n",
" <td>52.000000</td>\n",
" <td>39320.000000</td>\n",
" <td>6445.000000</td>\n",
" <td>35682.000000</td>\n",
" <td>6082.000000</td>\n",
" <td>15.000100</td>\n",
" <td>500001.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" longitude latitude housing_median_age total_rooms \\\n",
"count 20433.000000 20433.000000 20433.000000 20433.000000 \n",
"mean -119.570689 35.633221 28.633094 2636.504233 \n",
"std 2.003578 2.136348 12.591805 2185.269567 \n",
"min -124.350000 32.540000 1.000000 2.000000 \n",
"25% -121.800000 33.930000 18.000000 1450.000000 \n",
"50% -118.490000 34.260000 29.000000 2127.000000 \n",
"75% -118.010000 37.720000 37.000000 3143.000000 \n",
"max -114.310000 41.950000 52.000000 39320.000000 \n",
"\n",
" total_bedrooms population households median_income \\\n",
"count 20433.000000 20433.000000 20433.000000 20433.000000 \n",
"mean 537.870553 1424.946949 499.433465 3.871162 \n",
"std 421.385070 1133.208490 382.299226 1.899291 \n",
"min 1.000000 3.000000 1.000000 0.499900 \n",
"25% 296.000000 787.000000 280.000000 2.563700 \n",
"50% 435.000000 1166.000000 409.000000 3.536500 \n",
"75% 647.000000 1722.000000 604.000000 4.744000 \n",
"max 6445.000000 35682.000000 6082.000000 15.000100 \n",
"\n",
" median_house_value \n",
"count 20433.000000 \n",
"mean 206864.413155 \n",
"std 115435.667099 \n",
"min 14999.000000 \n",
"25% 119500.000000 \n",
"50% 179700.000000 \n",
"75% 264700.000000 \n",
"max 500001.000000 "
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"housing_data.describe()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"longitude 958\n",
"latitude 958\n",
"housing_median_age 958\n",
"total_rooms 958\n",
"total_bedrooms 958\n",
"population 958\n",
"households 958\n",
"median_income 958\n",
"median_house_value 958\n",
"ocean_proximity 958\n",
"dtype: int64"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"housing_data.loc[housing_data['median_house_value'] == 500001].count()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"housing_data = housing_data.drop(housing_data.loc[housing_data['median_house_value'] == 500001].index)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(19475, 10)"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"housing_data.shape"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>longitude</th>\n",
" <th>latitude</th>\n",
" <th>housing_median_age</th>\n",
" <th>total_rooms</th>\n",
" <th>total_bedrooms</th>\n",
" <th>population</th>\n",
" <th>households</th>\n",
" <th>median_income</th>\n",
" <th>median_house_value</th>\n",
" <th>ocean_proximity</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>-122.23</td>\n",
" <td>37.88</td>\n",
" <td>41.0</td>\n",
" <td>880.0</td>\n",
" <td>129.0</td>\n",
" <td>322.0</td>\n",
" <td>126.0</td>\n",
" <td>8.3252</td>\n",
" <td>452600.0</td>\n",
" <td>NEAR BAY</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>-122.22</td>\n",
" <td>37.86</td>\n",
" <td>21.0</td>\n",
" <td>7099.0</td>\n",
" <td>1106.0</td>\n",
" <td>2401.0</td>\n",
" <td>1138.0</td>\n",
" <td>8.3014</td>\n",
" <td>358500.0</td>\n",
" <td>NEAR BAY</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>-122.24</td>\n",
" <td>37.85</td>\n",
" <td>52.0</td>\n",
" <td>1467.0</td>\n",
" <td>190.0</td>\n",
" <td>496.0</td>\n",
" <td>177.0</td>\n",
" <td>7.2574</td>\n",
" <td>352100.0</td>\n",
" <td>NEAR BAY</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>-122.25</td>\n",
" <td>37.85</td>\n",
" <td>52.0</td>\n",
" <td>1274.0</td>\n",
" <td>235.0</td>\n",
" <td>558.0</td>\n",
" <td>219.0</td>\n",
" <td>5.6431</td>\n",
" <td>341300.0</td>\n",
" <td>NEAR BAY</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>-122.25</td>\n",
" <td>37.85</td>\n",
" <td>52.0</td>\n",
" <td>1627.0</td>\n",
" <td>280.0</td>\n",
" <td>565.0</td>\n",
" <td>259.0</td>\n",
" <td>3.8462</td>\n",
" <td>342200.0</td>\n",
" <td>NEAR BAY</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" longitude latitude housing_median_age total_rooms total_bedrooms \\\n",
"0 -122.23 37.88 41.0 880.0 129.0 \n",
"1 -122.22 37.86 21.0 7099.0 1106.0 \n",
"2 -122.24 37.85 52.0 1467.0 190.0 \n",
"3 -122.25 37.85 52.0 1274.0 235.0 \n",
"4 -122.25 37.85 52.0 1627.0 280.0 \n",
"\n",
" population households median_income median_house_value ocean_proximity \n",
"0 322.0 126.0 8.3252 452600.0 NEAR BAY \n",
"1 2401.0 1138.0 8.3014 358500.0 NEAR BAY \n",
"2 496.0 177.0 7.2574 352100.0 NEAR BAY \n",
"3 558.0 219.0 5.6431 341300.0 NEAR BAY \n",
"4 565.0 259.0 3.8462 342200.0 NEAR BAY "
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"housing_data.head()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array(['NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND'],\n",
" dtype=object)"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"housing_data['ocean_proximity'].unique()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"housing_data = pd.get_dummies(housing_data, columns=['ocean_proximity'])"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(19475, 14)"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"housing_data.shape"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>longitude</th>\n",
" <th>latitude</th>\n",
" <th>housing_median_age</th>\n",
" <th>total_rooms</th>\n",
" <th>total_bedrooms</th>\n",
" <th>population</th>\n",
" <th>households</th>\n",
" <th>median_income</th>\n",
" <th>median_house_value</th>\n",
" <th>ocean_proximity_&lt;1H OCEAN</th>\n",
" <th>ocean_proximity_INLAND</th>\n",
" <th>ocean_proximity_ISLAND</th>\n",
" <th>ocean_proximity_NEAR BAY</th>\n",
" <th>ocean_proximity_NEAR OCEAN</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>16063</th>\n",
" <td>-122.48</td>\n",
" <td>37.75</td>\n",
" <td>52.0</td>\n",
" <td>2515.0</td>\n",
" <td>494.0</td>\n",
" <td>1583.0</td>\n",
" <td>477.0</td>\n",
" <td>4.3393</td>\n",
" <td>317600.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14867</th>\n",
" <td>-117.09</td>\n",
" <td>32.64</td>\n",
" <td>20.0</td>\n",
" <td>1999.0</td>\n",
" <td>651.0</td>\n",
" <td>1302.0</td>\n",
" <td>592.0</td>\n",
" <td>1.6321</td>\n",
" <td>57500.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6671</th>\n",
" <td>-118.11</td>\n",
" <td>34.16</td>\n",
" <td>52.0</td>\n",
" <td>1353.0</td>\n",
" <td>274.0</td>\n",
" <td>852.0</td>\n",
" <td>306.0</td>\n",
" <td>3.4583</td>\n",
" <td>239900.0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17744</th>\n",
" <td>-121.76</td>\n",
" <td>37.29</td>\n",
" <td>15.0</td>\n",
" <td>2267.0</td>\n",
" <td>348.0</td>\n",
" <td>1150.0</td>\n",
" <td>327.0</td>\n",
" <td>7.1267</td>\n",
" <td>277900.0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3941</th>\n",
" <td>-118.59</td>\n",
" <td>34.21</td>\n",
" <td>34.0</td>\n",
" <td>2389.0</td>\n",
" <td>521.0</td>\n",
" <td>1560.0</td>\n",
" <td>514.0</td>\n",
" <td>4.8333</td>\n",
" <td>225400.0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" longitude latitude housing_median_age total_rooms total_bedrooms \\\n",
"16063 -122.48 37.75 52.0 2515.0 494.0 \n",
"14867 -117.09 32.64 20.0 1999.0 651.0 \n",
"6671 -118.11 34.16 52.0 1353.0 274.0 \n",
"17744 -121.76 37.29 15.0 2267.0 348.0 \n",
"3941 -118.59 34.21 34.0 2389.0 521.0 \n",
"\n",
" population households median_income median_house_value \\\n",
"16063 1583.0 477.0 4.3393 317600.0 \n",
"14867 1302.0 592.0 1.6321 57500.0 \n",
"6671 852.0 306.0 3.4583 239900.0 \n",
"17744 1150.0 327.0 7.1267 277900.0 \n",
"3941 1560.0 514.0 4.8333 225400.0 \n",
"\n",
" ocean_proximity_<1H OCEAN ocean_proximity_INLAND \\\n",
"16063 0 0 \n",
"14867 0 0 \n",
"6671 0 1 \n",
"17744 1 0 \n",
"3941 1 0 \n",
"\n",
" ocean_proximity_ISLAND ocean_proximity_NEAR BAY \\\n",
"16063 0 1 \n",
"14867 0 0 \n",
"6671 0 0 \n",
"17744 0 0 \n",
"3941 0 0 \n",
"\n",
" ocean_proximity_NEAR OCEAN \n",
"16063 0 \n",
"14867 1 \n",
"6671 0 \n",
"17744 0 \n",
"3941 0 "
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"housing_data.sample(5)"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"173800.0"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"median = housing_data['median_house_value'].median()\n",
"\n",
"median"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"housing_data['above_median'] = (housing_data['median_house_value'] - median) > 0"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>longitude</th>\n",
" <th>latitude</th>\n",
" <th>housing_median_age</th>\n",
" <th>total_rooms</th>\n",
" <th>total_bedrooms</th>\n",
" <th>population</th>\n",
" <th>households</th>\n",
" <th>median_income</th>\n",
" <th>median_house_value</th>\n",
" <th>ocean_proximity_&lt;1H OCEAN</th>\n",
" <th>ocean_proximity_INLAND</th>\n",
" <th>ocean_proximity_ISLAND</th>\n",
" <th>ocean_proximity_NEAR BAY</th>\n",
" <th>ocean_proximity_NEAR OCEAN</th>\n",
" <th>above_median</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>11995</th>\n",
" <td>-117.55</td>\n",
" <td>34.00</td>\n",
" <td>17.0</td>\n",
" <td>3583.0</td>\n",
" <td>700.0</td>\n",
" <td>1587.0</td>\n",
" <td>719.0</td>\n",
" <td>2.6979</td>\n",
" <td>75000.0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20401</th>\n",
" <td>-118.85</td>\n",
" <td>34.21</td>\n",
" <td>29.0</td>\n",
" <td>2195.0</td>\n",
" <td>414.0</td>\n",
" <td>1360.0</td>\n",
" <td>401.0</td>\n",
" <td>3.4773</td>\n",
" <td>206700.0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17514</th>\n",
" <td>-121.92</td>\n",
" <td>37.33</td>\n",
" <td>52.0</td>\n",
" <td>2962.0</td>\n",
" <td>557.0</td>\n",
" <td>1215.0</td>\n",
" <td>506.0</td>\n",
" <td>4.7768</td>\n",
" <td>301100.0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19245</th>\n",
" <td>-122.75</td>\n",
" <td>38.50</td>\n",
" <td>16.0</td>\n",
" <td>4196.0</td>\n",
" <td>638.0</td>\n",
" <td>1713.0</td>\n",
" <td>615.0</td>\n",
" <td>5.4490</td>\n",
" <td>252100.0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11480</th>\n",
" <td>-118.03</td>\n",
" <td>33.72</td>\n",
" <td>24.0</td>\n",
" <td>5203.0</td>\n",
" <td>957.0</td>\n",
" <td>2465.0</td>\n",
" <td>946.0</td>\n",
" <td>5.1630</td>\n",
" <td>261000.0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>True</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" longitude latitude housing_median_age total_rooms total_bedrooms \\\n",
"11995 -117.55 34.00 17.0 3583.0 700.0 \n",
"20401 -118.85 34.21 29.0 2195.0 414.0 \n",
"17514 -121.92 37.33 52.0 2962.0 557.0 \n",
"19245 -122.75 38.50 16.0 4196.0 638.0 \n",
"11480 -118.03 33.72 24.0 5203.0 957.0 \n",
"\n",
" population households median_income median_house_value \\\n",
"11995 1587.0 719.0 2.6979 75000.0 \n",
"20401 1360.0 401.0 3.4773 206700.0 \n",
"17514 1215.0 506.0 4.7768 301100.0 \n",
"19245 1713.0 615.0 5.4490 252100.0 \n",
"11480 2465.0 946.0 5.1630 261000.0 \n",
"\n",
" ocean_proximity_<1H OCEAN ocean_proximity_INLAND \\\n",
"11995 0 1 \n",
"20401 1 0 \n",
"17514 1 0 \n",
"19245 1 0 \n",
"11480 1 0 \n",
"\n",
" ocean_proximity_ISLAND ocean_proximity_NEAR BAY \\\n",
"11995 0 0 \n",
"20401 0 0 \n",
"17514 0 0 \n",
"19245 0 0 \n",
"11480 0 0 \n",
"\n",
" ocean_proximity_NEAR OCEAN above_median \n",
"11995 0 False \n",
"20401 0 True \n",
"17514 0 True \n",
"19245 0 True \n",
"11480 0 True "
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"housing_data.sample(5)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"X = housing_data.drop(['median_house_value', 'above_median'], axis=1)\n",
"Y = housing_data['above_median']"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',\n",
" 'total_bedrooms', 'population', 'households', 'median_income',\n",
" 'ocean_proximity_<1H OCEAN', 'ocean_proximity_INLAND',\n",
" 'ocean_proximity_ISLAND', 'ocean_proximity_NEAR BAY',\n",
" 'ocean_proximity_NEAR OCEAN'],\n",
" dtype='object')"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X.columns"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"((15580, 13), (3895, 13))"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"x_train.shape, x_test.shape"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"((15580,), (3895,))"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"y_train.shape, y_test.shape"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Train a logistic regression model for price classification"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.linear_model import LogisticRegression\n",
"\n",
"logistic_model = LogisticRegression(solver='liblinear').fit(x_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training_score : 0.8198973042362002\n"
]
}
],
"source": [
"print(\"Training_score : \" , logistic_model.score(x_train, y_train))"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"y_pred = logistic_model.predict(x_test)"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>predicted</th>\n",
" <th>actual</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>18998</th>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6709</th>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8434</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16288</th>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4709</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2659</th>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18509</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15204</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9734</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4638</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" predicted actual\n",
"18998 False False\n",
"6709 False True\n",
"8434 True True\n",
"16288 False False\n",
"4709 True True\n",
"2659 False False\n",
"18509 True True\n",
"15204 True True\n",
"9734 True True\n",
"4638 True True"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_pred_actual = pd.DataFrame({'predicted': y_pred, 'actual': y_test})\n",
"\n",
"df_pred_actual.head(10)"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Testing_score : 0.8215661103979461\n"
]
}
],
"source": [
"from sklearn.metrics import accuracy_score\n",
"\n",
"print(\"Testing_score : \", accuracy_score(y_test, y_pred))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment