Created
May 29, 2020 09:31
-
-
Save Akash671/e6d52d4c7e785b6ba534a4d6716653b2 to your computer and use it in GitHub Desktop.
Created on Skills Network Labs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd\n", | |
"import matplotlib.pylab as plt\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"filename = \"https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/DA0101EN/auto.csv\"" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/DA0101EN/auto.csv'" | |
] | |
}, | |
"execution_count": 3, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"filename" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"headers = [\"symboling\",\"normalized-losses\",\"make\",\"fuel-type\",\"aspiration\", \"num-of-doors\",\"body-style\",\n", | |
" \"drive-wheels\",\"engine-location\",\"wheel-base\", \"length\",\"width\",\"height\",\"curb-weight\",\"engine-type\",\n", | |
" \"num-of-cylinders\", \"engine-size\",\"fuel-system\",\"bore\",\"stroke\",\"compression-ratio\",\"horsepower\",\n", | |
" \"peak-rpm\",\"city-mpg\",\"highway-mpg\",\"price\"]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": false, | |
"jupyter": { | |
"outputs_hidden": false | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"df = pd.read_csv(filename, names = headers)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>symboling</th>\n", | |
" <th>normalized-losses</th>\n", | |
" <th>make</th>\n", | |
" <th>fuel-type</th>\n", | |
" <th>aspiration</th>\n", | |
" <th>num-of-doors</th>\n", | |
" <th>body-style</th>\n", | |
" <th>drive-wheels</th>\n", | |
" <th>engine-location</th>\n", | |
" <th>wheel-base</th>\n", | |
" <th>...</th>\n", | |
" <th>engine-size</th>\n", | |
" <th>fuel-system</th>\n", | |
" <th>bore</th>\n", | |
" <th>stroke</th>\n", | |
" <th>compression-ratio</th>\n", | |
" <th>horsepower</th>\n", | |
" <th>peak-rpm</th>\n", | |
" <th>city-mpg</th>\n", | |
" <th>highway-mpg</th>\n", | |
" <th>price</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>3</td>\n", | |
" <td>?</td>\n", | |
" <td>alfa-romero</td>\n", | |
" <td>gas</td>\n", | |
" <td>std</td>\n", | |
" <td>two</td>\n", | |
" <td>convertible</td>\n", | |
" <td>rwd</td>\n", | |
" <td>front</td>\n", | |
" <td>88.6</td>\n", | |
" <td>...</td>\n", | |
" <td>130</td>\n", | |
" <td>mpfi</td>\n", | |
" <td>3.47</td>\n", | |
" <td>2.68</td>\n", | |
" <td>9.0</td>\n", | |
" <td>111</td>\n", | |
" <td>5000</td>\n", | |
" <td>21</td>\n", | |
" <td>27</td>\n", | |
" <td>13495</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>3</td>\n", | |
" <td>?</td>\n", | |
" <td>alfa-romero</td>\n", | |
" <td>gas</td>\n", | |
" <td>std</td>\n", | |
" <td>two</td>\n", | |
" <td>convertible</td>\n", | |
" <td>rwd</td>\n", | |
" <td>front</td>\n", | |
" <td>88.6</td>\n", | |
" <td>...</td>\n", | |
" <td>130</td>\n", | |
" <td>mpfi</td>\n", | |
" <td>3.47</td>\n", | |
" <td>2.68</td>\n", | |
" <td>9.0</td>\n", | |
" <td>111</td>\n", | |
" <td>5000</td>\n", | |
" <td>21</td>\n", | |
" <td>27</td>\n", | |
" <td>16500</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>1</td>\n", | |
" <td>?</td>\n", | |
" <td>alfa-romero</td>\n", | |
" <td>gas</td>\n", | |
" <td>std</td>\n", | |
" <td>two</td>\n", | |
" <td>hatchback</td>\n", | |
" <td>rwd</td>\n", | |
" <td>front</td>\n", | |
" <td>94.5</td>\n", | |
" <td>...</td>\n", | |
" <td>152</td>\n", | |
" <td>mpfi</td>\n", | |
" <td>2.68</td>\n", | |
" <td>3.47</td>\n", | |
" <td>9.0</td>\n", | |
" <td>154</td>\n", | |
" <td>5000</td>\n", | |
" <td>19</td>\n", | |
" <td>26</td>\n", | |
" <td>16500</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>2</td>\n", | |
" <td>164</td>\n", | |
" <td>audi</td>\n", | |
" <td>gas</td>\n", | |
" <td>std</td>\n", | |
" <td>four</td>\n", | |
" <td>sedan</td>\n", | |
" <td>fwd</td>\n", | |
" <td>front</td>\n", | |
" <td>99.8</td>\n", | |
" <td>...</td>\n", | |
" <td>109</td>\n", | |
" <td>mpfi</td>\n", | |
" <td>3.19</td>\n", | |
" <td>3.40</td>\n", | |
" <td>10.0</td>\n", | |
" <td>102</td>\n", | |
" <td>5500</td>\n", | |
" <td>24</td>\n", | |
" <td>30</td>\n", | |
" <td>13950</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>2</td>\n", | |
" <td>164</td>\n", | |
" <td>audi</td>\n", | |
" <td>gas</td>\n", | |
" <td>std</td>\n", | |
" <td>four</td>\n", | |
" <td>sedan</td>\n", | |
" <td>4wd</td>\n", | |
" <td>front</td>\n", | |
" <td>99.4</td>\n", | |
" <td>...</td>\n", | |
" <td>136</td>\n", | |
" <td>mpfi</td>\n", | |
" <td>3.19</td>\n", | |
" <td>3.40</td>\n", | |
" <td>8.0</td>\n", | |
" <td>115</td>\n", | |
" <td>5500</td>\n", | |
" <td>18</td>\n", | |
" <td>22</td>\n", | |
" <td>17450</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>...</th>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>200</th>\n", | |
" <td>-1</td>\n", | |
" <td>95</td>\n", | |
" <td>volvo</td>\n", | |
" <td>gas</td>\n", | |
" <td>std</td>\n", | |
" <td>four</td>\n", | |
" <td>sedan</td>\n", | |
" <td>rwd</td>\n", | |
" <td>front</td>\n", | |
" <td>109.1</td>\n", | |
" <td>...</td>\n", | |
" <td>141</td>\n", | |
" <td>mpfi</td>\n", | |
" <td>3.78</td>\n", | |
" <td>3.15</td>\n", | |
" <td>9.5</td>\n", | |
" <td>114</td>\n", | |
" <td>5400</td>\n", | |
" <td>23</td>\n", | |
" <td>28</td>\n", | |
" <td>16845</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>201</th>\n", | |
" <td>-1</td>\n", | |
" <td>95</td>\n", | |
" <td>volvo</td>\n", | |
" <td>gas</td>\n", | |
" <td>turbo</td>\n", | |
" <td>four</td>\n", | |
" <td>sedan</td>\n", | |
" <td>rwd</td>\n", | |
" <td>front</td>\n", | |
" <td>109.1</td>\n", | |
" <td>...</td>\n", | |
" <td>141</td>\n", | |
" <td>mpfi</td>\n", | |
" <td>3.78</td>\n", | |
" <td>3.15</td>\n", | |
" <td>8.7</td>\n", | |
" <td>160</td>\n", | |
" <td>5300</td>\n", | |
" <td>19</td>\n", | |
" <td>25</td>\n", | |
" <td>19045</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>202</th>\n", | |
" <td>-1</td>\n", | |
" <td>95</td>\n", | |
" <td>volvo</td>\n", | |
" <td>gas</td>\n", | |
" <td>std</td>\n", | |
" <td>four</td>\n", | |
" <td>sedan</td>\n", | |
" <td>rwd</td>\n", | |
" <td>front</td>\n", | |
" <td>109.1</td>\n", | |
" <td>...</td>\n", | |
" <td>173</td>\n", | |
" <td>mpfi</td>\n", | |
" <td>3.58</td>\n", | |
" <td>2.87</td>\n", | |
" <td>8.8</td>\n", | |
" <td>134</td>\n", | |
" <td>5500</td>\n", | |
" <td>18</td>\n", | |
" <td>23</td>\n", | |
" <td>21485</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>203</th>\n", | |
" <td>-1</td>\n", | |
" <td>95</td>\n", | |
" <td>volvo</td>\n", | |
" <td>diesel</td>\n", | |
" <td>turbo</td>\n", | |
" <td>four</td>\n", | |
" <td>sedan</td>\n", | |
" <td>rwd</td>\n", | |
" <td>front</td>\n", | |
" <td>109.1</td>\n", | |
" <td>...</td>\n", | |
" <td>145</td>\n", | |
" <td>idi</td>\n", | |
" <td>3.01</td>\n", | |
" <td>3.40</td>\n", | |
" <td>23.0</td>\n", | |
" <td>106</td>\n", | |
" <td>4800</td>\n", | |
" <td>26</td>\n", | |
" <td>27</td>\n", | |
" <td>22470</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>204</th>\n", | |
" <td>-1</td>\n", | |
" <td>95</td>\n", | |
" <td>volvo</td>\n", | |
" <td>gas</td>\n", | |
" <td>turbo</td>\n", | |
" <td>four</td>\n", | |
" <td>sedan</td>\n", | |
" <td>rwd</td>\n", | |
" <td>front</td>\n", | |
" <td>109.1</td>\n", | |
" <td>...</td>\n", | |
" <td>141</td>\n", | |
" <td>mpfi</td>\n", | |
" <td>3.78</td>\n", | |
" <td>3.15</td>\n", | |
" <td>9.5</td>\n", | |
" <td>114</td>\n", | |
" <td>5400</td>\n", | |
" <td>19</td>\n", | |
" <td>25</td>\n", | |
" <td>22625</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>205 rows × 26 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" symboling normalized-losses make fuel-type aspiration \\\n", | |
"0 3 ? alfa-romero gas std \n", | |
"1 3 ? alfa-romero gas std \n", | |
"2 1 ? alfa-romero gas std \n", | |
"3 2 164 audi gas std \n", | |
"4 2 164 audi gas std \n", | |
".. ... ... ... ... ... \n", | |
"200 -1 95 volvo gas std \n", | |
"201 -1 95 volvo gas turbo \n", | |
"202 -1 95 volvo gas std \n", | |
"203 -1 95 volvo diesel turbo \n", | |
"204 -1 95 volvo gas turbo \n", | |
"\n", | |
" num-of-doors body-style drive-wheels engine-location wheel-base ... \\\n", | |
"0 two convertible rwd front 88.6 ... \n", | |
"1 two convertible rwd front 88.6 ... \n", | |
"2 two hatchback rwd front 94.5 ... \n", | |
"3 four sedan fwd front 99.8 ... \n", | |
"4 four sedan 4wd front 99.4 ... \n", | |
".. ... ... ... ... ... ... \n", | |
"200 four sedan rwd front 109.1 ... \n", | |
"201 four sedan rwd front 109.1 ... \n", | |
"202 four sedan rwd front 109.1 ... \n", | |
"203 four sedan rwd front 109.1 ... \n", | |
"204 four sedan rwd front 109.1 ... \n", | |
"\n", | |
" engine-size fuel-system bore stroke compression-ratio horsepower \\\n", | |
"0 130 mpfi 3.47 2.68 9.0 111 \n", | |
"1 130 mpfi 3.47 2.68 9.0 111 \n", | |
"2 152 mpfi 2.68 3.47 9.0 154 \n", | |
"3 109 mpfi 3.19 3.40 10.0 102 \n", | |
"4 136 mpfi 3.19 3.40 8.0 115 \n", | |
".. ... ... ... ... ... ... \n", | |
"200 141 mpfi 3.78 3.15 9.5 114 \n", | |
"201 141 mpfi 3.78 3.15 8.7 160 \n", | |
"202 173 mpfi 3.58 2.87 8.8 134 \n", | |
"203 145 idi 3.01 3.40 23.0 106 \n", | |
"204 141 mpfi 3.78 3.15 9.5 114 \n", | |
"\n", | |
" peak-rpm city-mpg highway-mpg price \n", | |
"0 5000 21 27 13495 \n", | |
"1 5000 21 27 16500 \n", | |
"2 5000 19 26 16500 \n", | |
"3 5500 24 30 13950 \n", | |
"4 5500 18 22 17450 \n", | |
".. ... ... ... ... \n", | |
"200 5400 23 28 16845 \n", | |
"201 5300 19 25 19045 \n", | |
"202 5500 18 23 21485 \n", | |
"203 4800 26 27 22470 \n", | |
"204 5400 19 25 22625 \n", | |
"\n", | |
"[205 rows x 26 columns]" | |
] | |
}, | |
"execution_count": 6, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>symboling</th>\n", | |
" <th>normalized-losses</th>\n", | |
" <th>make</th>\n", | |
" <th>fuel-type</th>\n", | |
" <th>aspiration</th>\n", | |
" <th>num-of-doors</th>\n", | |
" <th>body-style</th>\n", | |
" <th>drive-wheels</th>\n", | |
" <th>engine-location</th>\n", | |
" <th>wheel-base</th>\n", | |
" <th>...</th>\n", | |
" <th>engine-size</th>\n", | |
" <th>fuel-system</th>\n", | |
" <th>bore</th>\n", | |
" <th>stroke</th>\n", | |
" <th>compression-ratio</th>\n", | |
" <th>horsepower</th>\n", | |
" <th>peak-rpm</th>\n", | |
" <th>city-mpg</th>\n", | |
" <th>highway-mpg</th>\n", | |
" <th>price</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>3</td>\n", | |
" <td>?</td>\n", | |
" <td>alfa-romero</td>\n", | |
" <td>gas</td>\n", | |
" <td>std</td>\n", | |
" <td>two</td>\n", | |
" <td>convertible</td>\n", | |
" <td>rwd</td>\n", | |
" <td>front</td>\n", | |
" <td>88.6</td>\n", | |
" <td>...</td>\n", | |
" <td>130</td>\n", | |
" <td>mpfi</td>\n", | |
" <td>3.47</td>\n", | |
" <td>2.68</td>\n", | |
" <td>9.0</td>\n", | |
" <td>111</td>\n", | |
" <td>5000</td>\n", | |
" <td>21</td>\n", | |
" <td>27</td>\n", | |
" <td>13495</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>3</td>\n", | |
" <td>?</td>\n", | |
" <td>alfa-romero</td>\n", | |
" <td>gas</td>\n", | |
" <td>std</td>\n", | |
" <td>two</td>\n", | |
" <td>convertible</td>\n", | |
" <td>rwd</td>\n", | |
" <td>front</td>\n", | |
" <td>88.6</td>\n", | |
" <td>...</td>\n", | |
" <td>130</td>\n", | |
" <td>mpfi</td>\n", | |
" <td>3.47</td>\n", | |
" <td>2.68</td>\n", | |
" <td>9.0</td>\n", | |
" <td>111</td>\n", | |
" <td>5000</td>\n", | |
" <td>21</td>\n", | |
" <td>27</td>\n", | |
" <td>16500</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>1</td>\n", | |
" <td>?</td>\n", | |
" <td>alfa-romero</td>\n", | |
" <td>gas</td>\n", | |
" <td>std</td>\n", | |
" <td>two</td>\n", | |
" <td>hatchback</td>\n", | |
" <td>rwd</td>\n", | |
" <td>front</td>\n", | |
" <td>94.5</td>\n", | |
" <td>...</td>\n", | |
" <td>152</td>\n", | |
" <td>mpfi</td>\n", | |
" <td>2.68</td>\n", | |
" <td>3.47</td>\n", | |
" <td>9.0</td>\n", | |
" <td>154</td>\n", | |
" <td>5000</td>\n", | |
" <td>19</td>\n", | |
" <td>26</td>\n", | |
" <td>16500</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>2</td>\n", | |
" <td>164</td>\n", | |
" <td>audi</td>\n", | |
" <td>gas</td>\n", | |
" <td>std</td>\n", | |
" <td>four</td>\n", | |
" <td>sedan</td>\n", | |
" <td>fwd</td>\n", | |
" <td>front</td>\n", | |
" <td>99.8</td>\n", | |
" <td>...</td>\n", | |
" <td>109</td>\n", | |
" <td>mpfi</td>\n", | |
" <td>3.19</td>\n", | |
" <td>3.40</td>\n", | |
" <td>10.0</td>\n", | |
" <td>102</td>\n", | |
" <td>5500</td>\n", | |
" <td>24</td>\n", | |
" <td>30</td>\n", | |
" <td>13950</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>2</td>\n", | |
" <td>164</td>\n", | |
" <td>audi</td>\n", | |
" <td>gas</td>\n", | |
" <td>std</td>\n", | |
" <td>four</td>\n", | |
" <td>sedan</td>\n", | |
" <td>4wd</td>\n", | |
" <td>front</td>\n", | |
" <td>99.4</td>\n", | |
" <td>...</td>\n", | |
" <td>136</td>\n", | |
" <td>mpfi</td>\n", | |
" <td>3.19</td>\n", | |
" <td>3.40</td>\n", | |
" <td>8.0</td>\n", | |
" <td>115</td>\n", | |
" <td>5500</td>\n", | |
" <td>18</td>\n", | |
" <td>22</td>\n", | |
" <td>17450</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>5 rows × 26 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" symboling normalized-losses make fuel-type aspiration num-of-doors \\\n", | |
"0 3 ? alfa-romero gas std two \n", | |
"1 3 ? alfa-romero gas std two \n", | |
"2 1 ? alfa-romero gas std two \n", | |
"3 2 164 audi gas std four \n", | |
"4 2 164 audi gas std four \n", | |
"\n", | |
" body-style drive-wheels engine-location wheel-base ... engine-size \\\n", | |
"0 convertible rwd front 88.6 ... 130 \n", | |
"1 convertible rwd front 88.6 ... 130 \n", | |
"2 hatchback rwd front 94.5 ... 152 \n", | |
"3 sedan fwd front 99.8 ... 109 \n", | |
"4 sedan 4wd front 99.4 ... 136 \n", | |
"\n", | |
" fuel-system bore stroke compression-ratio horsepower peak-rpm city-mpg \\\n", | |
"0 mpfi 3.47 2.68 9.0 111 5000 21 \n", | |
"1 mpfi 3.47 2.68 9.0 111 5000 21 \n", | |
"2 mpfi 2.68 3.47 9.0 154 5000 19 \n", | |
"3 mpfi 3.19 3.40 10.0 102 5500 24 \n", | |
"4 mpfi 3.19 3.40 8.0 115 5500 18 \n", | |
"\n", | |
" highway-mpg price \n", | |
"0 27 13495 \n", | |
"1 27 16500 \n", | |
"2 26 16500 \n", | |
"3 30 13950 \n", | |
"4 22 17450 \n", | |
"\n", | |
"[5 rows x 26 columns]" | |
] | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"symboling int64\n", | |
"normalized-losses object\n", | |
"make object\n", | |
"fuel-type object\n", | |
"aspiration object\n", | |
"num-of-doors object\n", | |
"body-style object\n", | |
"drive-wheels object\n", | |
"engine-location object\n", | |
"wheel-base float64\n", | |
"length float64\n", | |
"width float64\n", | |
"height float64\n", | |
"curb-weight int64\n", | |
"engine-type object\n", | |
"num-of-cylinders object\n", | |
"engine-size int64\n", | |
"fuel-system object\n", | |
"bore object\n", | |
"stroke object\n", | |
"compression-ratio float64\n", | |
"horsepower object\n", | |
"peak-rpm object\n", | |
"city-mpg int64\n", | |
"highway-mpg int64\n", | |
"price object\n", | |
"dtype: object" | |
] | |
}, | |
"execution_count": 8, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df.dtypes" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": { | |
"collapsed": false, | |
"jupyter": { | |
"outputs_hidden": false | |
} | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'Identify and handle missing values\\nIdentify missing values\\nConvert \"?\" to NaN\\nIn the car dataset, missing data comes with the question mark \"?\". We replace \"?\" with NaN (Not a Number), which is Python\\'s default missing value marker, for reasons of computational speed and convenience. Here we use the function:\\n.replace(A, B, inplace = True) \\nto replace A by B'" | |
] | |
}, | |
"execution_count": 9, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"#Steps for working with missing data:\n", | |
"\n", | |
"# 1. identify missing data\n", | |
"# 2. deal with missing data\n", | |
"# 3. correct data format\n", | |
"\n", | |
"\n", | |
"'''Identify and handle missing values\n", | |
"Identify missing values\n", | |
"Convert \"?\" to NaN\n", | |
"In the car dataset, missing data comes with the question mark \"?\". We replace \"?\" with NaN (Not a Number), which is Python's default missing value marker, for reasons of computational speed and convenience. Here we use the function:\n", | |
".replace(A, B, inplace = True) \n", | |
"to replace A by B'''" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>symboling</th>\n", | |
" <th>normalized-losses</th>\n", | |
" <th>make</th>\n", | |
" <th>fuel-type</th>\n", | |
" <th>aspiration</th>\n", | |
" <th>num-of-doors</th>\n", | |
" <th>body-style</th>\n", | |
" <th>drive-wheels</th>\n", | |
" <th>engine-location</th>\n", | |
" <th>wheel-base</th>\n", | |
" <th>...</th>\n", | |
" <th>engine-size</th>\n", | |
" <th>fuel-system</th>\n", | |
" <th>bore</th>\n", | |
" <th>stroke</th>\n", | |
" <th>compression-ratio</th>\n", | |
" <th>horsepower</th>\n", | |
" <th>peak-rpm</th>\n", | |
" <th>city-mpg</th>\n", | |
" <th>highway-mpg</th>\n", | |
" <th>price</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>3</td>\n", | |
" <td>NaN</td>\n", | |
" <td>alfa-romero</td>\n", | |
" <td>gas</td>\n", | |
" <td>std</td>\n", | |
" <td>two</td>\n", | |
" <td>convertible</td>\n", | |
" <td>rwd</td>\n", | |
" <td>front</td>\n", | |
" <td>88.6</td>\n", | |
" <td>...</td>\n", | |
" <td>130</td>\n", | |
" <td>mpfi</td>\n", | |
" <td>3.47</td>\n", | |
" <td>2.68</td>\n", | |
" <td>9.0</td>\n", | |
" <td>111</td>\n", | |
" <td>5000</td>\n", | |
" <td>21</td>\n", | |
" <td>27</td>\n", | |
" <td>13495</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>3</td>\n", | |
" <td>NaN</td>\n", | |
" <td>alfa-romero</td>\n", | |
" <td>gas</td>\n", | |
" <td>std</td>\n", | |
" <td>two</td>\n", | |
" <td>convertible</td>\n", | |
" <td>rwd</td>\n", | |
" <td>front</td>\n", | |
" <td>88.6</td>\n", | |
" <td>...</td>\n", | |
" <td>130</td>\n", | |
" <td>mpfi</td>\n", | |
" <td>3.47</td>\n", | |
" <td>2.68</td>\n", | |
" <td>9.0</td>\n", | |
" <td>111</td>\n", | |
" <td>5000</td>\n", | |
" <td>21</td>\n", | |
" <td>27</td>\n", | |
" <td>16500</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>1</td>\n", | |
" <td>NaN</td>\n", | |
" <td>alfa-romero</td>\n", | |
" <td>gas</td>\n", | |
" <td>std</td>\n", | |
" <td>two</td>\n", | |
" <td>hatchback</td>\n", | |
" <td>rwd</td>\n", | |
" <td>front</td>\n", | |
" <td>94.5</td>\n", | |
" <td>...</td>\n", | |
" <td>152</td>\n", | |
" <td>mpfi</td>\n", | |
" <td>2.68</td>\n", | |
" <td>3.47</td>\n", | |
" <td>9.0</td>\n", | |
" <td>154</td>\n", | |
" <td>5000</td>\n", | |
" <td>19</td>\n", | |
" <td>26</td>\n", | |
" <td>16500</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>2</td>\n", | |
" <td>164</td>\n", | |
" <td>audi</td>\n", | |
" <td>gas</td>\n", | |
" <td>std</td>\n", | |
" <td>four</td>\n", | |
" <td>sedan</td>\n", | |
" <td>fwd</td>\n", | |
" <td>front</td>\n", | |
" <td>99.8</td>\n", | |
" <td>...</td>\n", | |
" <td>109</td>\n", | |
" <td>mpfi</td>\n", | |
" <td>3.19</td>\n", | |
" <td>3.40</td>\n", | |
" <td>10.0</td>\n", | |
" <td>102</td>\n", | |
" <td>5500</td>\n", | |
" <td>24</td>\n", | |
" <td>30</td>\n", | |
" <td>13950</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>2</td>\n", | |
" <td>164</td>\n", | |
" <td>audi</td>\n", | |
" <td>gas</td>\n", | |
" <td>std</td>\n", | |
" <td>four</td>\n", | |
" <td>sedan</td>\n", | |
" <td>4wd</td>\n", | |
" <td>front</td>\n", | |
" <td>99.4</td>\n", | |
" <td>...</td>\n", | |
" <td>136</td>\n", | |
" <td>mpfi</td>\n", | |
" <td>3.19</td>\n", | |
" <td>3.40</td>\n", | |
" <td>8.0</td>\n", | |
" <td>115</td>\n", | |
" <td>5500</td>\n", | |
" <td>18</td>\n", | |
" <td>22</td>\n", | |
" <td>17450</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>5 rows × 26 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" symboling normalized-losses make fuel-type aspiration num-of-doors \\\n", | |
"0 3 NaN alfa-romero gas std two \n", | |
"1 3 NaN alfa-romero gas std two \n", | |
"2 1 NaN alfa-romero gas std two \n", | |
"3 2 164 audi gas std four \n", | |
"4 2 164 audi gas std four \n", | |
"\n", | |
" body-style drive-wheels engine-location wheel-base ... engine-size \\\n", | |
"0 convertible rwd front 88.6 ... 130 \n", | |
"1 convertible rwd front 88.6 ... 130 \n", | |
"2 hatchback rwd front 94.5 ... 152 \n", | |
"3 sedan fwd front 99.8 ... 109 \n", | |
"4 sedan 4wd front 99.4 ... 136 \n", | |
"\n", | |
" fuel-system bore stroke compression-ratio horsepower peak-rpm city-mpg \\\n", | |
"0 mpfi 3.47 2.68 9.0 111 5000 21 \n", | |
"1 mpfi 3.47 2.68 9.0 111 5000 21 \n", | |
"2 mpfi 2.68 3.47 9.0 154 5000 19 \n", | |
"3 mpfi 3.19 3.40 10.0 102 5500 24 \n", | |
"4 mpfi 3.19 3.40 8.0 115 5500 18 \n", | |
"\n", | |
" highway-mpg price \n", | |
"0 27 13495 \n", | |
"1 27 16500 \n", | |
"2 26 16500 \n", | |
"3 30 13950 \n", | |
"4 22 17450 \n", | |
"\n", | |
"[5 rows x 26 columns]" | |
] | |
}, | |
"execution_count": 10, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"#\n", | |
"import numpy as np\n", | |
"\n", | |
"# replace \"?\" with NaN\n", | |
"\n", | |
"df.replace(\"?\", np.nan, inplace = True) # np.nan is NumPy's NaN constant\n", | |
"df.head(5)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": { | |
"collapsed": false, | |
"jupyter": { | |
"outputs_hidden": false | |
} | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>symboling</th>\n", | |
" <th>normalized-losses</th>\n", | |
" <th>make</th>\n", | |
" <th>fuel-type</th>\n", | |
" <th>aspiration</th>\n", | |
" <th>num-of-doors</th>\n", | |
" <th>body-style</th>\n", | |
" <th>drive-wheels</th>\n", | |
" <th>engine-location</th>\n", | |
" <th>wheel-base</th>\n", | |
" <th>...</th>\n", | |
" <th>engine-size</th>\n", | |
" <th>fuel-system</th>\n", | |
" <th>bore</th>\n", | |
" <th>stroke</th>\n", | |
" <th>compression-ratio</th>\n", | |
" <th>horsepower</th>\n", | |
" <th>peak-rpm</th>\n", | |
" <th>city-mpg</th>\n", | |
" <th>highway-mpg</th>\n", | |
" <th>price</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>False</td>\n", | |
" <td>True</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>...</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>False</td>\n", | |
" <td>True</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>...</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>False</td>\n", | |
" <td>True</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>...</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>...</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>...</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" <td>False</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>5 rows × 26 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" symboling normalized-losses make fuel-type aspiration num-of-doors \\\n", | |
"0 False True False False False False \n", | |
"1 False True False False False False \n", | |
"2 False True False False False False \n", | |
"3 False False False False False False \n", | |
"4 False False False False False False \n", | |
"\n", | |
" body-style drive-wheels engine-location wheel-base ... engine-size \\\n", | |
"0 False False False False ... False \n", | |
"1 False False False False ... False \n", | |
"2 False False False False ... False \n", | |
"3 False False False False ... False \n", | |
"4 False False False False ... False \n", | |
"\n", | |
" fuel-system bore stroke compression-ratio horsepower peak-rpm \\\n", | |
"0 False False False False False False \n", | |
"1 False False False False False False \n", | |
"2 False False False False False False \n", | |
"3 False False False False False False \n", | |
"4 False False False False False False \n", | |
"\n", | |
" city-mpg highway-mpg price \n", | |
"0 False False False \n", | |
"1 False False False \n", | |
"2 False False False \n", | |
"3 False False False \n", | |
"4 False False False \n", | |
"\n", | |
"[5 rows x 26 columns]" | |
] | |
}, | |
"execution_count": 11, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"missing_data = df.isnull()\n", | |
"missing_data.head(5)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"symboling\n", | |
"False 205\n", | |
"Name: symboling, dtype: int64\n", | |
"\n", | |
"normalized-losses\n", | |
"False 164\n", | |
"True 41\n", | |
"Name: normalized-losses, dtype: int64\n", | |
"\n", | |
"make\n", | |
"False 205\n", | |
"Name: make, dtype: int64\n", | |
"\n", | |
"fuel-type\n", | |
"False 205\n", | |
"Name: fuel-type, dtype: int64\n", | |
"\n", | |
"aspiration\n", | |
"False 205\n", | |
"Name: aspiration, dtype: int64\n", | |
"\n", | |
"num-of-doors\n", | |
"False 203\n", | |
"True 2\n", | |
"Name: num-of-doors, dtype: int64\n", | |
"\n", | |
"body-style\n", | |
"False 205\n", | |
"Name: body-style, dtype: int64\n", | |
"\n", | |
"drive-wheels\n", | |
"False 205\n", | |
"Name: drive-wheels, dtype: int64\n", | |
"\n", | |
"engine-location\n", | |
"False 205\n", | |
"Name: engine-location, dtype: int64\n", | |
"\n", | |
"wheel-base\n", | |
"False 205\n", | |
"Name: wheel-base, dtype: int64\n", | |
"\n", | |
"length\n", | |
"False 205\n", | |
"Name: length, dtype: int64\n", | |
"\n", | |
"width\n", | |
"False 205\n", | |
"Name: width, dtype: int64\n", | |
"\n", | |
"height\n", | |
"False 205\n", | |
"Name: height, dtype: int64\n", | |
"\n", | |
"curb-weight\n", | |
"False 205\n", | |
"Name: curb-weight, dtype: int64\n", | |
"\n", | |
"engine-type\n", | |
"False 205\n", | |
"Name: engine-type, dtype: int64\n", | |
"\n", | |
"num-of-cylinders\n", | |
"False 205\n", | |
"Name: num-of-cylinders, dtype: int64\n", | |
"\n", | |
"engine-size\n", | |
"False 205\n", | |
"Name: engine-size, dtype: int64\n", | |
"\n", | |
"fuel-system\n", | |
"False 205\n", | |
"Name: fuel-system, dtype: int64\n", | |
"\n", | |
"bore\n", | |
"False 201\n", | |
"True 4\n", | |
"Name: bore, dtype: int64\n", | |
"\n", | |
"stroke\n", | |
"False 201\n", | |
"True 4\n", | |
"Name: stroke, dtype: int64\n", | |
"\n", | |
"compression-ratio\n", | |
"False 205\n", | |
"Name: compression-ratio, dtype: int64\n", | |
"\n", | |
"horsepower\n", | |
"False 203\n", | |
"True 2\n", | |
"Name: horsepower, dtype: int64\n", | |
"\n", | |
"peak-rpm\n", | |
"False 203\n", | |
"True 2\n", | |
"Name: peak-rpm, dtype: int64\n", | |
"\n", | |
"city-mpg\n", | |
"False 205\n", | |
"Name: city-mpg, dtype: int64\n", | |
"\n", | |
"highway-mpg\n", | |
"False 205\n", | |
"Name: highway-mpg, dtype: int64\n", | |
"\n", | |
"price\n", | |
"False 201\n", | |
"True 4\n", | |
"Name: price, dtype: int64\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"#count missing values in each column\n", | |
"'''for column in missing_data.columns.values.tolist():\n", | |
" print(column)\n", | |
" print (missing_data[column].value_counts())\n", | |
" print(\"\")'''\n", | |
"\n", | |
"\n", | |
"\n", | |
"for column in missing_data.columns.values.tolist():\n", | |
" print(column)\n", | |
" print(missing_data[column].value_counts())\n", | |
" print(\"\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Average of normalized-losses: 122.0\n" | |
] | |
} | |
], | |
"source": [ | |
"#Calculate the average of the column \n", | |
"\n", | |
"avg_norm_loss = df[\"normalized-losses\"].astype(\"float\").mean(axis=0)\n", | |
"print(\"Average of normalized-losses:\", avg_norm_loss)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#Replace \"NaN\" by mean value in \"normalized-losses\" column\n", | |
"\n", | |
"# syntax df[\"normalized-losses\"].replace(np.nan, avg_norm_loss, inplace=True)\n", | |
"\n", | |
"df[\"normalized-losses\"].replace(np.nan,avg_norm_loss,inplace=True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Average of bore: 3.3297512437810943\n" | |
] | |
} | |
], | |
"source": [ | |
"# similarly for another columns\n", | |
"\n", | |
"avg_bore=df['bore'].astype('float').mean(axis=0)\n", | |
"print(\"Average of bore:\", avg_bore)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df[\"bore\"].replace(np.nan,avg_bore,inplace=True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Average of stroke: 3.255422885572139\n" | |
] | |
} | |
], | |
"source": [ | |
"#for stroke columns \n", | |
"avg_stroke=df[\"stroke\"].astype(\"float\").mean(axis=0)\n", | |
"print(\"Average of stroke:\",avg_stroke)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 18, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# replace value inplace nan\n", | |
"df[\"stroke\"].replace(np.nan,avg_stroke,inplace=True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Average horsepower: 104.25615763546799\n" | |
] | |
} | |
], | |
"source": [ | |
"# similarly for horspower\n", | |
"avg_horsepower = df['horsepower'].astype('float').mean(axis=0)\n", | |
"print(\"Average horsepower:\", avg_horsepower)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# replacement vaues inplace nan\n", | |
"df[\"horsepower\"].replace(np.nan,avg_horsepower,inplace=True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 21, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Average of peak-rpm: 5125.369458128079\n" | |
] | |
} | |
], | |
"source": [ | |
"# similarly for column \"peak-rpm\"\n", | |
"avg_peak_rpm=df[\"peak-rpm\"].astype(\"float\").mean(axis=0)\n", | |
"print(\"Average of peak-rpm:\",avg_peak_rpm)\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 22, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# replace values\n", | |
"df[\"peak-rpm\"].replace(np.nan,avg_peak_rpm,inplace=True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 23, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"four 114\n", | |
"two 89\n", | |
"Name: num-of-doors, dtype: int64" | |
] | |
}, | |
"execution_count": 23, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"#To see which values are present in a particular column, we can use the \".value_counts()\" method:\n", | |
"# we replace missing value by ussing frequency method\n", | |
"\n", | |
"df['num-of-doors'].value_counts()\n", | |
"df[\"num-of-doors\"].value_counts()\n", | |
"\n", | |
"#We can see that four doors are the most common type. We can also use the \".idxmax()\" method to calculate for us the most common type automatically:\n", | |
"\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 24, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'four'" | |
] | |
}, | |
"execution_count": 24, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df['num-of-doors'].value_counts().idxmax() # find max frequency number" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 25, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#The replacement procedure is very similar to what we have seen previously\n", | |
"\n", | |
"#replace the missing 'num-of-doors' values by the most frequent \n", | |
"df[\"num-of-doors\"].replace(np.nan, \"four\", inplace=True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 26, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# now using drop methon \n", | |
"#Finally, let's drop all rows that do not have price data:\n", | |
"\n", | |
"# simply drop whole row with NaN in \"price\" column\n", | |
"\n", | |
"df.dropna(subset=[\"price\"],axis=0,inplace=True)\n", | |
"\n", | |
"# reset index, because we droped two row\n", | |
"\n", | |
"df.reset_index(drop=True,inplace=True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 27, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>symboling</th>\n", | |
" <th>normalized-losses</th>\n", | |
" <th>make</th>\n", | |
" <th>fuel-type</th>\n", | |
" <th>aspiration</th>\n", | |
" <th>num-of-doors</th>\n", | |
" <th>body-style</th>\n", | |
" <th>drive-wheels</th>\n", | |
" <th>engine-location</th>\n", | |
" <th>wheel-base</th>\n", | |
" <th>...</th>\n", | |
" <th>engine-size</th>\n", | |
" <th>fuel-system</th>\n", | |
" <th>bore</th>\n", | |
" <th>stroke</th>\n", | |
" <th>compression-ratio</th>\n", | |
" <th>horsepower</th>\n", | |
" <th>peak-rpm</th>\n", | |
" <th>city-mpg</th>\n", | |
" <th>highway-mpg</th>\n", | |
" <th>price</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>3</td>\n", | |
" <td>122</td>\n", | |
" <td>alfa-romero</td>\n", | |
" <td>gas</td>\n", | |
" <td>std</td>\n", | |
" <td>two</td>\n", | |
" <td>convertible</td>\n", | |
" <td>rwd</td>\n", | |
" <td>front</td>\n", | |
" <td>88.6</td>\n", | |
" <td>...</td>\n", | |
" <td>130</td>\n", | |
" <td>mpfi</td>\n", | |
" <td>3.47</td>\n", | |
" <td>2.68</td>\n", | |
" <td>9.0</td>\n", | |
" <td>111</td>\n", | |
" <td>5000</td>\n", | |
" <td>21</td>\n", | |
" <td>27</td>\n", | |
" <td>13495</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>3</td>\n", | |
" <td>122</td>\n", | |
" <td>alfa-romero</td>\n", | |
" <td>gas</td>\n", | |
" <td>std</td>\n", | |
" <td>two</td>\n", | |
" <td>convertible</td>\n", | |
" <td>rwd</td>\n", | |
" <td>front</td>\n", | |
" <td>88.6</td>\n", | |
" <td>...</td>\n", | |
" <td>130</td>\n", | |
" <td>mpfi</td>\n", | |
" <td>3.47</td>\n", | |
" <td>2.68</td>\n", | |
" <td>9.0</td>\n", | |
" <td>111</td>\n", | |
" <td>5000</td>\n", | |
" <td>21</td>\n", | |
" <td>27</td>\n", | |
" <td>16500</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>1</td>\n", | |
" <td>122</td>\n", | |
" <td>alfa-romero</td>\n", | |
" <td>gas</td>\n", | |
" <td>std</td>\n", | |
" <td>two</td>\n", | |
" <td>hatchback</td>\n", | |
" <td>rwd</td>\n", | |
" <td>front</td>\n", | |
" <td>94.5</td>\n", | |
" <td>...</td>\n", | |
" <td>152</td>\n", | |
" <td>mpfi</td>\n", | |
" <td>2.68</td>\n", | |
" <td>3.47</td>\n", | |
" <td>9.0</td>\n", | |
" <td>154</td>\n", | |
" <td>5000</td>\n", | |
" <td>19</td>\n", | |
" <td>26</td>\n", | |
" <td>16500</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>2</td>\n", | |
" <td>164</td>\n", | |
" <td>audi</td>\n", | |
" <td>gas</td>\n", | |
" <td>std</td>\n", | |
" <td>four</td>\n", | |
" <td>sedan</td>\n", | |
" <td>fwd</td>\n", | |
" <td>front</td>\n", | |
" <td>99.8</td>\n", | |
" <td>...</td>\n", | |
" <td>109</td>\n", | |
" <td>mpfi</td>\n", | |
" <td>3.19</td>\n", | |
" <td>3.40</td>\n", | |
" <td>10.0</td>\n", | |
" <td>102</td>\n", | |
" <td>5500</td>\n", | |
" <td>24</td>\n", | |
" <td>30</td>\n", | |
" <td>13950</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>2</td>\n", | |
" <td>164</td>\n", | |
" <td>audi</td>\n", | |
" <td>gas</td>\n", | |
" <td>std</td>\n", | |
" <td>four</td>\n", | |
" <td>sedan</td>\n", | |
" <td>4wd</td>\n", | |
" <td>front</td>\n", | |
" <td>99.4</td>\n", | |
" <td>...</td>\n", | |
" <td>136</td>\n", | |
" <td>mpfi</td>\n", | |
" <td>3.19</td>\n", | |
" <td>3.40</td>\n", | |
" <td>8.0</td>\n", | |
" <td>115</td>\n", | |
" <td>5500</td>\n", | |
" <td>18</td>\n", | |
" <td>22</td>\n", | |
" <td>17450</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>5 rows × 26 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" symboling normalized-losses make fuel-type aspiration num-of-doors \\\n", | |
"0 3 122 alfa-romero gas std two \n", | |
"1 3 122 alfa-romero gas std two \n", | |
"2 1 122 alfa-romero gas std two \n", | |
"3 2 164 audi gas std four \n", | |
"4 2 164 audi gas std four \n", | |
"\n", | |
" body-style drive-wheels engine-location wheel-base ... engine-size \\\n", | |
"0 convertible rwd front 88.6 ... 130 \n", | |
"1 convertible rwd front 88.6 ... 130 \n", | |
"2 hatchback rwd front 94.5 ... 152 \n", | |
"3 sedan fwd front 99.8 ... 109 \n", | |
"4 sedan 4wd front 99.4 ... 136 \n", | |
"\n", | |
" fuel-system bore stroke compression-ratio horsepower peak-rpm city-mpg \\\n", | |
"0 mpfi 3.47 2.68 9.0 111 5000 21 \n", | |
"1 mpfi 3.47 2.68 9.0 111 5000 21 \n", | |
"2 mpfi 2.68 3.47 9.0 154 5000 19 \n", | |
"3 mpfi 3.19 3.40 10.0 102 5500 24 \n", | |
"4 mpfi 3.19 3.40 8.0 115 5500 18 \n", | |
"\n", | |
" highway-mpg price \n", | |
"0 27 13495 \n", | |
"1 27 16500 \n", | |
"2 26 16500 \n", | |
"3 30 13950 \n", | |
"4 22 17450 \n", | |
"\n", | |
"[5 rows x 26 columns]" | |
] | |
}, | |
"execution_count": 27, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 28, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#Good! Now, we obtain the dataset with no missing values." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 29, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'We are almost there!\\nThe last step in data cleaning is checking and making sure that all data is in the correct format (int, float, text or other).\\n\\nIn Pandas, we use\\n\\n.dtype() to check the data type\\n\\n.astype() to change the data type\\n\\nLets list the data types for each column¶'" | |
] | |
}, | |
"execution_count": 29, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"#now next step is Correct data format\n", | |
"'''We are almost there!\n", | |
"The last step in data cleaning is checking and making sure that all data is in the correct format (int, float, text or other).\n", | |
"\n", | |
"In Pandas, we use\n", | |
"\n", | |
".dtype() to check the data type\n", | |
"\n", | |
".astype() to change the data type\n", | |
"\n", | |
"Lets list the data types for each column¶'''\n", | |
"\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 30, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"symboling int64\n", | |
"normalized-losses object\n", | |
"make object\n", | |
"fuel-type object\n", | |
"aspiration object\n", | |
"num-of-doors object\n", | |
"body-style object\n", | |
"drive-wheels object\n", | |
"engine-location object\n", | |
"wheel-base float64\n", | |
"length float64\n", | |
"width float64\n", | |
"height float64\n", | |
"curb-weight int64\n", | |
"engine-type object\n", | |
"num-of-cylinders object\n", | |
"engine-size int64\n", | |
"fuel-system object\n", | |
"bore object\n", | |
"stroke object\n", | |
"compression-ratio float64\n", | |
"horsepower object\n", | |
"peak-rpm object\n", | |
"city-mpg int64\n", | |
"highway-mpg int64\n", | |
"price object\n", | |
"dtype: object" | |
] | |
}, | |
"execution_count": 30, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df.dtypes" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 31, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"'''As we can see above, some columns are not of the correct data type. Numerical variables should have type 'float' or 'int',\n", | |
"and variables with strings such as categories should have type 'object'. For example, \n", | |
"'bore' and 'stroke' variables are numerical values that describe the engines,\n", | |
"so we should expect them to be of the type 'float' or 'int'; however, they are shown as type 'object'. \n", | |
"We have to convert data types into a proper format for each column using the \"astype()\" method.'''\n", | |
"\n", | |
"#Convert data types to proper format¶\n", | |
"\n", | |
"df[[\"bore\", \"stroke\"]] = df[[\"bore\", \"stroke\"]].astype(\"float\")\n", | |
"df[[\"normalized-losses\"]] = df[[\"normalized-losses\"]].astype(\"int\")\n", | |
"df[[\"price\"]] = df[[\"price\"]].astype(\"float\")\n", | |
"df[[\"peak-rpm\"]] = df[[\"peak-rpm\"]].astype(\"float\")\n", | |
"df[[\"bore\",\"stroke\"]]=df[[\"bore\",\"stroke\"]].astype(\"float\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 32, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"symboling int64\n", | |
"normalized-losses int64\n", | |
"make object\n", | |
"fuel-type object\n", | |
"aspiration object\n", | |
"num-of-doors object\n", | |
"body-style object\n", | |
"drive-wheels object\n", | |
"engine-location object\n", | |
"wheel-base float64\n", | |
"length float64\n", | |
"width float64\n", | |
"height float64\n", | |
"curb-weight int64\n", | |
"engine-type object\n", | |
"num-of-cylinders object\n", | |
"engine-size int64\n", | |
"fuel-system object\n", | |
"bore float64\n", | |
"stroke float64\n", | |
"compression-ratio float64\n", | |
"horsepower object\n", | |
"peak-rpm float64\n", | |
"city-mpg int64\n", | |
"highway-mpg int64\n", | |
"price float64\n", | |
"dtype: object" | |
] | |
}, | |
"execution_count": 32, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"#Let us list the columns after the conversion\n", | |
"df.dtypes" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 33, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#Wonderful!\n", | |
"\n", | |
"#Now, we finally obtain the cleaned dataset with no missing values and all data in its proper format." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 34, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>symboling</th>\n", | |
" <th>normalized-losses</th>\n", | |
" <th>make</th>\n", | |
" <th>fuel-type</th>\n", | |
" <th>aspiration</th>\n", | |
" <th>num-of-doors</th>\n", | |
" <th>body-style</th>\n", | |
" <th>drive-wheels</th>\n", | |
" <th>engine-location</th>\n", | |
" <th>wheel-base</th>\n", | |
" <th>...</th>\n", | |
" <th>engine-size</th>\n", | |
" <th>fuel-system</th>\n", | |
" <th>bore</th>\n", | |
" <th>stroke</th>\n", | |
" <th>compression-ratio</th>\n", | |
" <th>horsepower</th>\n", | |
" <th>peak-rpm</th>\n", | |
" <th>city-mpg</th>\n", | |
" <th>highway-mpg</th>\n", | |
" <th>price</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>3</td>\n", | |
" <td>122</td>\n", | |
" <td>alfa-romero</td>\n", | |
" <td>gas</td>\n", | |
" <td>std</td>\n", | |
" <td>two</td>\n", | |
" <td>convertible</td>\n", | |
" <td>rwd</td>\n", | |
" <td>front</td>\n", | |
" <td>88.6</td>\n", | |
" <td>...</td>\n", | |
" <td>130</td>\n", | |
" <td>mpfi</td>\n", | |
" <td>3.47</td>\n", | |
" <td>2.68</td>\n", | |
" <td>9.0</td>\n", | |
" <td>111</td>\n", | |
" <td>5000.0</td>\n", | |
" <td>21</td>\n", | |
" <td>27</td>\n", | |
" <td>13495.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>3</td>\n", | |
" <td>122</td>\n", | |
" <td>alfa-romero</td>\n", | |
" <td>gas</td>\n", | |
" <td>std</td>\n", | |
" <td>two</td>\n", | |
" <td>convertible</td>\n", | |
" <td>rwd</td>\n", | |
" <td>front</td>\n", | |
" <td>88.6</td>\n", | |
" <td>...</td>\n", | |
" <td>130</td>\n", | |
" <td>mpfi</td>\n", | |
" <td>3.47</td>\n", | |
" <td>2.68</td>\n", | |
" <td>9.0</td>\n", | |
" <td>111</td>\n", | |
" <td>5000.0</td>\n", | |
" <td>21</td>\n", | |
" <td>27</td>\n", | |
" <td>16500.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>1</td>\n", | |
" <td>122</td>\n", | |
" <td>alfa-romero</td>\n", | |
" <td>gas</td>\n", | |
" <td>std</td>\n", | |
" <td>two</td>\n", | |
" <td>hatchback</td>\n", | |
" <td>rwd</td>\n", | |
" <td>front</td>\n", | |
" <td>94.5</td>\n", | |
" <td>...</td>\n", | |
" <td>152</td>\n", | |
" <td>mpfi</td>\n", | |
" <td>2.68</td>\n", | |
" <td>3.47</td>\n", | |
" <td>9.0</td>\n", | |
" <td>154</td>\n", | |
" <td>5000.0</td>\n", | |
" <td>19</td>\n", | |
" <td>26</td>\n", | |
" <td>16500.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>2</td>\n", | |
" <td>164</td>\n", | |
" <td>audi</td>\n", | |
" <td>gas</td>\n", | |
" <td>std</td>\n", | |
" <td>four</td>\n", | |
" <td>sedan</td>\n", | |
" <td>fwd</td>\n", | |
" <td>front</td>\n", | |
" <td>99.8</td>\n", | |
" <td>...</td>\n", | |
" <td>109</td>\n", | |
" <td>mpfi</td>\n", | |
" <td>3.19</td>\n", | |
" <td>3.40</td>\n", | |
" <td>10.0</td>\n", | |
" <td>102</td>\n", | |
" <td>5500.0</td>\n", | |
" <td>24</td>\n", | |
" <td>30</td>\n", | |
" <td>13950.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>2</td>\n", | |
" <td>164</td>\n", | |
" <td>audi</td>\n", | |
" <td>gas</td>\n", | |
" <td>std</td>\n", | |
" <td>four</td>\n", | |
" <td>sedan</td>\n", | |
" <td>4wd</td>\n", | |
" <td>front</td>\n", | |
" <td>99.4</td>\n", | |
" <td>...</td>\n", | |
" <td>136</td>\n", | |
" <td>mpfi</td>\n", | |
" <td>3.19</td>\n", | |
" <td>3.40</td>\n", | |
" <td>8.0</td>\n", | |
" <td>115</td>\n", | |
" <td>5500.0</td>\n", | |
" <td>18</td>\n", | |
" <td>22</td>\n", | |
" <td>17450.0</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>5 rows × 26 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" symboling normalized-losses make fuel-type aspiration \\\n", | |
"0 3 122 alfa-romero gas std \n", | |
"1 3 122 alfa-romero gas std \n", | |
"2 1 122 alfa-romero gas std \n", | |
"3 2 164 audi gas std \n", | |
"4 2 164 audi gas std \n", | |
"\n", | |
" num-of-doors body-style drive-wheels engine-location wheel-base ... \\\n", | |
"0 two convertible rwd front 88.6 ... \n", | |
"1 two convertible rwd front 88.6 ... \n", | |
"2 two hatchback rwd front 94.5 ... \n", | |
"3 four sedan fwd front 99.8 ... \n", | |
"4 four sedan 4wd front 99.4 ... \n", | |
"\n", | |
" engine-size fuel-system bore stroke compression-ratio horsepower \\\n", | |
"0 130 mpfi 3.47 2.68 9.0 111 \n", | |
"1 130 mpfi 3.47 2.68 9.0 111 \n", | |
"2 152 mpfi 2.68 3.47 9.0 154 \n", | |
"3 109 mpfi 3.19 3.40 10.0 102 \n", | |
"4 136 mpfi 3.19 3.40 8.0 115 \n", | |
"\n", | |
" peak-rpm city-mpg highway-mpg price \n", | |
"0 5000.0 21 27 13495.0 \n", | |
"1 5000.0 21 27 16500.0 \n", | |
"2 5000.0 19 26 16500.0 \n", | |
"3 5500.0 24 30 13950.0 \n", | |
"4 5500.0 18 22 17450.0 \n", | |
"\n", | |
"[5 rows x 26 columns]" | |
] | |
}, | |
"execution_count": 34, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# now next step is Data Standardization\n", | |
"'''Why normalization?\n", | |
"\n", | |
"Normalization is the process of transforming values of several variables into a similar range. Typical normalizations include scaling the variable so the variable average is 0, scaling the variable so the variance is 1, or scaling variable so the variable values range from 0 to 1\n", | |
"\n", | |
"Example\n", | |
"\n", | |
"To demonstrate normalization, let's say we want to scale the columns \"length\", \"width\" and \"height\"\n", | |
"\n", | |
"Target:would like to Normalize those variables so their value ranges from 0 to 1.\n", | |
"\n", | |
"Approach: replace original value by (original value)/(maximum value)'''\n", | |
"\n", | |
"# replace (original value) by (original value)/(maximum value)\n", | |
"df['length'] = df['length']/df['length'].max()\n", | |
"df['width'] = df['width']/df['width'].max()\n", | |
"df[\"height\"]=df[\"height\"]/df[\"height\"].max()\n", | |
"df.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 35, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>symboling</th>\n", | |
" <th>normalized-losses</th>\n", | |
" <th>make</th>\n", | |
" <th>fuel-type</th>\n", | |
" <th>aspiration</th>\n", | |
" <th>num-of-doors</th>\n", | |
" <th>body-style</th>\n", | |
" <th>drive-wheels</th>\n", | |
" <th>engine-location</th>\n", | |
" <th>wheel-base</th>\n", | |
" <th>...</th>\n", | |
" <th>engine-size</th>\n", | |
" <th>fuel-system</th>\n", | |
" <th>bore</th>\n", | |
" <th>stroke</th>\n", | |
" <th>compression-ratio</th>\n", | |
" <th>horsepower</th>\n", | |
" <th>peak-rpm</th>\n", | |
" <th>city-mpg</th>\n", | |
" <th>highway-mpg</th>\n", | |
" <th>price</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>3</td>\n", | |
" <td>122</td>\n", | |
" <td>alfa-romero</td>\n", | |
" <td>gas</td>\n", | |
" <td>std</td>\n", | |
" <td>two</td>\n", | |
" <td>convertible</td>\n", | |
" <td>rwd</td>\n", | |
" <td>front</td>\n", | |
" <td>88.6</td>\n", | |
" <td>...</td>\n", | |
" <td>130</td>\n", | |
" <td>mpfi</td>\n", | |
" <td>3.47</td>\n", | |
" <td>2.68</td>\n", | |
" <td>9.0</td>\n", | |
" <td>111</td>\n", | |
" <td>5000.0</td>\n", | |
" <td>21</td>\n", | |
" <td>27</td>\n", | |
" <td>13495.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>3</td>\n", | |
" <td>122</td>\n", | |
" <td>alfa-romero</td>\n", | |
" <td>gas</td>\n", | |
" <td>std</td>\n", | |
" <td>two</td>\n", | |
" <td>convertible</td>\n", | |
" <td>rwd</td>\n", | |
" <td>front</td>\n", | |
" <td>88.6</td>\n", | |
" <td>...</td>\n", | |
" <td>130</td>\n", | |
" <td>mpfi</td>\n", | |
" <td>3.47</td>\n", | |
" <td>2.68</td>\n", | |
" <td>9.0</td>\n", | |
" <td>111</td>\n", | |
" <td>5000.0</td>\n", | |
" <td>21</td>\n", | |
" <td>27</td>\n", | |
" <td>16500.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>1</td>\n", | |
" <td>122</td>\n", | |
" <td>alfa-romero</td>\n", | |
" <td>gas</td>\n", | |
" <td>std</td>\n", | |
" <td>two</td>\n", | |
" <td>hatchback</td>\n", | |
" <td>rwd</td>\n", | |
" <td>front</td>\n", | |
" <td>94.5</td>\n", | |
" <td>...</td>\n", | |
" <td>152</td>\n", | |
" <td>mpfi</td>\n", | |
" <td>2.68</td>\n", | |
" <td>3.47</td>\n", | |
" <td>9.0</td>\n", | |
" <td>154</td>\n", | |
" <td>5000.0</td>\n", | |
" <td>19</td>\n", | |
" <td>26</td>\n", | |
" <td>16500.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>2</td>\n", | |
" <td>164</td>\n", | |
" <td>audi</td>\n", | |
" <td>gas</td>\n", | |
" <td>std</td>\n", | |
" <td>four</td>\n", | |
" <td>sedan</td>\n", | |
" <td>fwd</td>\n", | |
" <td>front</td>\n", | |
" <td>99.8</td>\n", | |
" <td>...</td>\n", | |
" <td>109</td>\n", | |
" <td>mpfi</td>\n", | |
" <td>3.19</td>\n", | |
" <td>3.40</td>\n", | |
" <td>10.0</td>\n", | |
" <td>102</td>\n", | |
" <td>5500.0</td>\n", | |
" <td>24</td>\n", | |
" <td>30</td>\n", | |
" <td>13950.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>2</td>\n", | |
" <td>164</td>\n", | |
" <td>audi</td>\n", | |
" <td>gas</td>\n", | |
" <td>std</td>\n", | |
" <td>four</td>\n", | |
" <td>sedan</td>\n", | |
" <td>4wd</td>\n", | |
" <td>front</td>\n", | |
" <td>99.4</td>\n", | |
" <td>...</td>\n", | |
" <td>136</td>\n", | |
" <td>mpfi</td>\n", | |
" <td>3.19</td>\n", | |
" <td>3.40</td>\n", | |
" <td>8.0</td>\n", | |
" <td>115</td>\n", | |
" <td>5500.0</td>\n", | |
" <td>18</td>\n", | |
" <td>22</td>\n", | |
" <td>17450.0</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>5 rows × 26 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" symboling normalized-losses make fuel-type aspiration \\\n", | |
"0 3 122 alfa-romero gas std \n", | |
"1 3 122 alfa-romero gas std \n", | |
"2 1 122 alfa-romero gas std \n", | |
"3 2 164 audi gas std \n", | |
"4 2 164 audi gas std \n", | |
"\n", | |
" num-of-doors body-style drive-wheels engine-location wheel-base ... \\\n", | |
"0 two convertible rwd front 88.6 ... \n", | |
"1 two convertible rwd front 88.6 ... \n", | |
"2 two hatchback rwd front 94.5 ... \n", | |
"3 four sedan fwd front 99.8 ... \n", | |
"4 four sedan 4wd front 99.4 ... \n", | |
"\n", | |
" engine-size fuel-system bore stroke compression-ratio horsepower \\\n", | |
"0 130 mpfi 3.47 2.68 9.0 111 \n", | |
"1 130 mpfi 3.47 2.68 9.0 111 \n", | |
"2 152 mpfi 2.68 3.47 9.0 154 \n", | |
"3 109 mpfi 3.19 3.40 10.0 102 \n", | |
"4 136 mpfi 3.19 3.40 8.0 115 \n", | |
"\n", | |
" peak-rpm city-mpg highway-mpg price \n", | |
"0 5000.0 21 27 13495.0 \n", | |
"1 5000.0 21 27 16500.0 \n", | |
"2 5000.0 19 26 16500.0 \n", | |
"3 5500.0 24 30 13950.0 \n", | |
"4 5500.0 18 22 17450.0 \n", | |
"\n", | |
"[5 rows x 26 columns]" | |
] | |
}, | |
"execution_count": 35, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# nest step is Binning\n", | |
"#Why binning?\n", | |
"#Binning is a process of transforming continuous numerical variables into discrete categorical 'bins', for grouped analysis.\n", | |
"'''\n", | |
"Example:\n", | |
"\n", | |
"In our dataset, \"horsepower\" is a real valued variable ranging from 48 to 288, it has 57 unique values. What if we only\n", | |
"care about the price difference between cars with high horsepower, medium horsepower, and little horsepower\n", | |
"(3 types)? Can we rearrange them into three ‘bins' to simplify analysis?\n", | |
"\n", | |
"We will use the Pandas method 'cut' to segment the 'horsepower' column into 3 bins'''\n", | |
"\n", | |
"\n", | |
"\n", | |
"#Example of Binning Data In Pandas\n", | |
"#Convert data to correct format\n", | |
"\n", | |
"df[\"horsepower\"]=df[\"horsepower\"].astype(int, copy=True)\n", | |
"df.head()\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 36, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"Text(0.5, 1.0, 'horsepower-bins')" | |
] | |
}, | |
"execution_count": 36, | |
"metadata": {}, | |
"output_type": "execute_result" | |
}, | |
{ | |
"data": { | |
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAX4AAAEWCAYAAABhffzLAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAVUklEQVR4nO3df7RdZX3n8feHgKDyuwkYAQ0y1FW0FjVFLSNSqD9xGsapP5ihk1EsOqNW19S20XY62i4tUGurjmtcOFrjb6kWYeosgYkSxB9osIggWigGVCIJiIrOjAJ+54/9pBySe8MFs++5uc/7tdZZZ+9n73329z4cPvfJPvs8N1WFJKkfu027AEnS/DL4JakzBr8kdcbgl6TOGPyS1BmDX5I6Y/BrapJsTPIb065joUtycZIXz7LtYUl+lGTJfNelXdfu0y5A0v1XVTcCe0+7Du1aHPFrl5Zk0QxeFtPPooXN4Ne0HZ3kyiQ/SPKRJHsBJPmdJNcl+V6S85M8dOsBSSrJy5JcC1ybwV8l2dxe58okj2777pnkTUluTHJzknckeWDbdnySbyd5bZJb2qWnfzdxnv2SvDfJliQ3JPnjJLu1bTckeXxbPrXVdFRbf3GSj7fl3ZKsSfJPSW5Nck6SA9u2Fe2405LcCHxqB/10RJIvtp/vvBleY/e2fnGSP0vy2SS3J7kwydK2ba8k7291fD/Jl5IcvFP+K2qXYvBr2p4HPAM4HHgM8B+SnAD8edu2HLgB+PA2x50MPAE4CngacBzwi8D+wPOBW9t+Z7b2o4F/ARwC/MnE6zwEWNraVwNnJ3lk2/Y2YD/gEcBTgH8PvLBtWw8c35aPA65v+2xdX9+Wf7fV+hTgocBtwNu3+VmeAvwS8PSZuwjauV/UXuNO4K072PfftjoPAh4AvLq1r24/z2HALwAvBf7vDl5Hi1VV+fAxlQewETh1Yv0s4B3Au4CzJtr3Bu4AVrT1Ak6Y2H4C8I/AE4HdJtoD/Bg4YqLtScA32/LxDCH64Int5wD/BVgC/AQ4amLbS4CL2/JpwPlt+RrgxcCH2/oNwOMmtp048RrL28+yO7Ci/SyPuJd+uhg4Y2L9KOCnrcatr7H7xL5/PLHvfwI+2ZZfBHwOeMy0/9v7mO7DEb+m7bsTy/+HIeQfyhCeAFTVjxhG8IdM7Putie2fAv4bw0j65iRnJ9kXWAY8CLi8Xdr4PvDJ1r7VbVX144n1G9r5lzKMlm/YZtvWGtYDT07yEIYA/ghwbJIVDKPqK9p+DwfOnTj/NcBdwOQlln/+WdqlqB+1x2tn2qfVsUercSYz9SnA+4ALgA8nuSnJWUn2mOU1tIgZ/FqIbmIITACSPJjh0sR3Jva5x7SyVfXWqno88CiGSzu/D9zCcCnjUVW1f3vsV1WTd8Ec0F5/q4e189/CMDJ/+DbbvtPOdx1DqP4ucElV3c4QuKcDl1bVz9ox3wKeOXH+/atqr6qa8WepqpdW1d7t8caJfQ7bpo47Wo1zVlV3VNXrq+oo4NeAZzNcQlJnDH4tRB8EXpjk6CR7Am8ELquqjTPtnORXkzyhjV5/DPw/4K4Wvu8E/irJQW3fQ5Jsey399UkekOTJDGH4t1V1F8Nlnzck2SfJw4H/DLx/4rj1wMu5+3r+xdusw3Dp6g3teJIsS7LqfvTJqUmOSvIg4E+Bj7Ya5yzJryf55XbP/w8Zfnncp9fQ4mDwa8GpqnUM19k/BmwCjgBesIND9mUI+NsYLoPcCrypbftD4DrgC0l+CPxv4JETx363HXcT8AHgpVX19bbtFQy/SK4HLmX4hfTuiWPXA/sAl8yyDvAW4HzgwiS3A19g+FD6vnof8J5W714M/9K4rx4CfJQh9K9p9b5/h0doUUqVf4hFfUpyPPD+qjp02rVI88kRvyR1xuCXpM54qUeSOjPq3CBJNgK3M9w5cGdVrWxfNf8IwxdPNgLPq6rbxqxDkn
S3UUf8LfhXVtUtE21nAd+rqjOSrAEOqKo/3NHrLF26tFasWDFanZK0GF1++eW3VNWybdunMRvgKu6e42Qtw73POwz+FStWsGHDhnGrkqRFJskNM7WP/eFuMdy/fHmS01vbwVW1CaA9HzTTgUlOT7IhyYYtW7aMXKYk9WPsEf+xVXVT+9bkRUm+fq9HNFV1NnA2wMqVK/0EWpJ2klFH/FV1U3veDJwLHMMwidZygPa8ecwaJEn3NFrwJ3lwkn22LjPMmX4Vw9fXV7fdVgPnjVWDJGl7Y17qOZhhOtqt5/lgVX0yyZeAc5KcBtwIPHfEGiRJ2xgt+KvqeuBXZmi/FThxrPNKknbMKRskqTMGvyR1xuCXpM5M45u7GtmKNZ+Yynk3nnHSVM4r6b5xxC9JnTH4JakzBr8kdcbgl6TOGPyS1BmDX5I6Y/BLUmcMfknqjMEvSZ0x+CWpMwa/JHXG4JekzjhJ24imNVmaJO2II35J6ozBL0mdMfglqTOL/hq/19kl6Z4c8UtSZwx+SeqMwS9JnTH4JakzBr8kdcbgl6TOGPyS1BmDX5I6Y/BLUmcMfknqjMEvSZ0x+CWpMwa/JHVm9OBPsiTJPyT5+7Z+YJKLklzbng8YuwZJ0t3mY8T/SuCaifU1wLqqOhJY19YlSfNk1OBPcihwEvA/JppXAWvb8lrg5DFrkCTd09gj/r8G/gD42UTbwVW1CaA9HzTTgUlOT7IhyYYtW7aMXKYk9WO04E/ybGBzVV1+f46vqrOramVVrVy2bNlOrk6S+jXmn148FvjNJM8C9gL2TfJ+4OYky6tqU5LlwOYRa5AkbWO0EX9VvaaqDq2qFcALgE9V1anA+cDqtttq4LyxapAkbW8a9/GfATw1ybXAU9u6JGmejHmp559V1cXAxW35VuDE+TivJGl7fnNXkjpj8EtSZwx+SeqMwS9JnTH4JakzBr8kdcbgl6TOGPyS1BmDX5I6Y/BLUmcMfknqjMEvSZ0x+CWpMwa/JHXG4Jekzhj8ktQZg1+SOmPwS1JnDH5J6ozBL0mdMfglqTMGvyR1xuCXpM4Y/JLUGYNfkjpj8EtSZwx+SeqMwS9JnTH4JakzBr8kdcbgl6TOGPyS1BmDX5I6Y/BLUmdGC/4keyX5YpKvJLk6yetb+4FJLkpybXs+YKwaJEnbG3PE/xPghKr6FeBo4BlJngisAdZV1ZHAurYuSZonowV/DX7UVvdojwJWAWtb+1rg5LFqkCRtb9Rr/EmWJLkC2AxcVFWXAQdX1SaA9nzQLMeenmRDkg1btmwZs0xJ6sqowV9Vd1XV0cChwDFJHn0fjj27qlZW1cply5aNV6QkdWZe7uqpqu8DFwPPAG5OshygPW+ejxokSYMx7+pZlmT/tvxA4DeArwPnA6vbbquB88aqQZK0vd1HfO3lwNokSxh+wZxTVX+f5PPAOUlOA24EnjtiDZKkbYwW/FV1JfDYGdpvBU4c67ySpB3zm7uS1BmDX5I6Y/BLUmcMfknqzJyCP8m6ubRJkha+Hd7Vk2Qv4EHA0jaLZtqmfYGHjlybJGkE93Y750uAVzGE/OXcHfw/BN4+Yl2SpJHsMPir6i3AW5K8oqreNk81SZJGNKcvcFXV25L8GrBi8piqeu9IdUmSRjKn4E/yPuAI4ArgrtZcgMEvSbuYuU7ZsBI4qqpqzGIkSeOb6338VwEPGbMQSdL8mOuIfynwtSRfZPhbugBU1W+OUpUkaTRzDf7XjVmEJGn+zPWunvVjFyJJmh9zvavndoa7eAAeAOwB/Liq9h2rMEnSOOY64t9ncj3JycAxo1QkSRrV/Zqds6o+Dpywk2uRJM2DuV7qec7E6m4M9/V7T78k7YLmelfPv5pYvhPYCKza6dVIkkY312v8Lxy7EO36Vqz5xNTOvfGMk6Z2bmlXM9c/xHJoknOTbE5yc5KPJTl07OIkSTvfXD/c/RvgfIZ5+Q8B/mdrkyTtYuYa/Muq6m+q6s72eA+wbMS6JE
kjmWvw35Lk1CRL2uNU4NYxC5MkjWOuwf8i4HnAd4FNwG8BfuArSbugud7O+WfA6qq6DSDJgcCbGH4hSJJ2IXMd8T9ma+gDVNX3gMeOU5IkaUxzDf7dkhywdaWN+Of6rwVJ0gIy1/D+S+BzST7KMFXD84A3jFaVJGk0c/3m7nuTbGCYmC3Ac6rqa6NWJkkaxZwv17SgN+wlaRd3v6ZlliTtugx+SerMaMGf5LAkn05yTZKrk7yytR+Y5KIk17bnA+7ttSRJO8+YI/47gd+rql8Cngi8LMlRwBpgXVUdCaxr65KkeTJa8FfVpqr6clu+HbiGYWbPVcDattta4OSxapAkbW9ervEnWcHwTd/LgIOrahMMvxyAg+ajBknSYPTgT7I38DHgVVX1w/tw3OlJNiTZsGXLlvEKlKTOjBr8SfZgCP0PVNXfteabkyxv25cDm2c6tqrOrqqVVbVy2TKn/peknWXMu3oCvAu4pqrePLHpfGB1W14NnDdWDZKk7Y050dqxwG8DX01yRWt7LXAGcE6S04AbgeeOWIMkaRujBX9VXcowr89MThzrvJKkHfObu5LUGYNfkjpj8EtSZwx+SeqMwS9JnTH4JakzBr8kdcbgl6TOGPyS1BmDX5I6Y/BLUmcMfknqzJizc0qL3oo1n5jauTeecdLUzq1dmyN+SeqMwS9JnTH4JakzBr8kdcbgl6TOGPyS1BmDX5I64338WhSmeT+9tKtxxC9JnTH4JakzBr8kdcbgl6TOGPyS1BmDX5I6Y/BLUmcMfknqjMEvSZ0x+CWpMwa/JHXG4Jekzhj8ktSZ0YI/ybuTbE5y1UTbgUkuSnJtez5grPNLkmY25oj/PcAztmlbA6yrqiOBdW1dkjSPRgv+qroE+N42zauAtW15LXDyWOeXJM1svq/xH1xVmwDa80HzfH5J6t6C/XA3yelJNiTZsGXLlmmXI0mLxnwH/81JlgO0582z7VhVZ1fVyqpauWzZsnkrUJIWu/kO/vOB1W15NXDePJ9fkro35u2cHwI+DzwyybeTnAacATw1ybXAU9u6JGke7T7WC1fVKbNsOnGsc0qS7t2C/XBXkjQOg1+SOmPwS1JnDH5J6ozBL0mdMfglqTMGvyR1xuCXpM4Y/JLUGYNfkjpj8EtSZwx+SeqMwS9JnTH4JakzBr8kdcbgl6TOjPaHWCSNa8WaT0zlvBvPOGkq59XO44hfkjpj8EtSZwx+SeqMwS9JnfHDXUn3ybQ+VAY/WN5ZHPFLUmcMfknqjMEvSZ0x+CWpMwa/JHXG4Jekzhj8ktQZg1+SOmPwS1JnDH5J6ozBL0mdca4eSboXi21+Ikf8ktSZqQR/kmck+UaS65KsmUYNktSreQ/+JEuAtwPPBI4CTkly1HzXIUm9msaI/xjguqq6vqp+CnwYWDWFOiSpS9P4cPcQ4FsT698GnrDtTklOB05vqz9K8o2dcO6lwC074XUWI/tmdvbN7Oa1b3LmfJ1pp9gpffNz/swPn6lxGsGfGdpqu4aqs4Gzd+qJkw1VtXJnvuZiYd/Mzr6ZnX0zu4XcN9O41PNt4LCJ9UOBm6ZQhyR1aRrB/yXgyCSHJ3kA8ALg/CnUIUldmvdLPVV1Z5KXAxcAS4B3V9XV83T6nXrpaJGxb2Zn38zOvpndgu2bVG13eV2StIj5zV1J6ozBL0mdWbTBn2Rjkq8muSLJhtZ2YJKLklzbng+Ydp3zJcm7k2xOctVE26z9keQ1bUqNbyR5+nSqnh+z9M3rknynvX+uSPKsiW099c1hST6d5JokVyd5ZWvv/r2zg75Z+O+dqlqUD2AjsHSbtrOANW15DXDmtOucx/44DngccNW99QfDVBpfAfYEDgf+CVgy7Z9hnvvmdcCrZ9i3t75ZDjyuLe8D/GPrg+7fOzvomwX/3lm0I/5ZrALWtuW1wMlTrGVeVdUlwPe2aZ6tP1YBH66qn1TVN4HrGKbaWJRm6ZvZ9NY3m6rqy235duAahm/fd//e2U
HfzGbB9M1iDv4CLkxyeZv+AeDgqtoEw3804KCpVbcwzNYfM02rsaM39GL18iRXtktBWy9ldNs3SVYAjwUuw/fOPWzTN7DA3zuLOfiPrarHMcwC+rIkx027oF3InKbVWOT+O3AEcDSwCfjL1t5l3yTZG/gY8Kqq+uGOdp2hbVH3zwx9s+DfO4s2+Kvqpva8GTiX4Z9UNydZDtCeN0+vwgVhtv7oflqNqrq5qu6qqp8B7+Tuf5J31zdJ9mAItg9U1d+1Zt87zNw3u8J7Z1EGf5IHJ9ln6zLwNOAqhqkhVrfdVgPnTafCBWO2/jgfeEGSPZMcDhwJfHEK9U3N1lBr/jXD+wc665skAd4FXFNVb57Y1P17Z7a+2SXeO9P+ZHykT9sfwfDp+VeAq4E/au2/AKwDrm3PB0671nnskw8x/LPzDoaRx2k76g/gjxjuOvgG8Mxp1z+Fvnkf8FXgSob/YZd32jf/kuFyxJXAFe3xLN87O+ybBf/eccoGSerMorzUI0mancEvSZ0x+CWpMwa/JHXG4Jekzhj8WlSSrJicZVPS9gx+qUky73+K9P7YVerUwmXwazFakuSdbY70C5M8MMnRSb7QJs46d+vEWUkuTvLGJOuBVyZ5bpKrknwlySVtnyVJ/iLJl9rxL2ntxye5pL3e15K8I8lubdspGf4exFVJzmxtz0vy5rb8yiTXt+Ujklzalh+fZH2bXPCCiWkR7lHn/HanFhtHDlqMjgROqarfSXIO8G+APwBeUVXrk/wp8F+BV7X996+qpwAk+Srw9Kr6TpL92/bTgB9U1a8m2RP4bJIL27ZjGOZZvwH4JPCcJJ8DzgQeD9zGMEvsycAlwO+3454M3JrkEIZvgH6mzfvyNmBVVW1J8nzgDcCLtq1T+nkY/FqMvllVV7TlyxlmSty/qta3trXA307s/5GJ5c8C72m/MLZOSPY04DFJfqut78fwy+WnwBerauvI/UMMIX4HcHFVbWntHwCOq6qPJ9m7zSN1GPBBhj8C8+R2rkcCjwYuGqaBYQnDVBIz1Sndbwa/FqOfTCzfBew/247Nj7cuVNVLkzwBOAm4IsnRDNPpvqKqLpg8KMnxbD+tbjHz9LtbfR54IcNcLZ9hGM0/Cfg94GHA1VX1pHurU/p5eI1fPfgBcFuSJ7f13wbWz7RjkiOq6rKq+hPgFoaR+QXAf2yXYkjyi23WV4Bjkhzeru0/H7iU4Y9xPCXJ0iRLgFMmzncJ8Or2/A/ArwM/qaofMPwyWJbkSe08eyR51M7rBmngiF+9WA28I8mDgOsZRt0z+YskRzKM2tcxzPB6JbAC+HKbincLd/+pwc8DZwC/zBDm51bVz5K8Bvh0e53/VVVbpy3+DMMvk0uq6q4k3wK+DlBVP22Xk96aZD+G/z//mmGGWWmncXZO6X5ql3peXVXPnnYt0n3hpR5J6owjfknqjCN+SeqMwS9JnTH4JakzBr8kdcbgl6TO/H8l7wm9M0nm8AAAAABJRU5ErkJggg==\n", | |
"text/plain": [ | |
"<Figure size 432x288 with 1 Axes>" | |
] | |
}, | |
"metadata": { | |
"needs_background": "light" | |
}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"#Lets plot the histogram of horspower, to see what the distribution of horsepower looks like.\n", | |
"'''%matplotlib inline\n", | |
"import matplotlib as plt\n", | |
"from matplotlib import pyplot\n", | |
"plt.pyplot.hist(df[\"horsepower\"])\n", | |
"\n", | |
"# set x/y labels and plot title\n", | |
"plt.pyplot.xlabel(\"horsepower\")\n", | |
"plt.pyplot.ylabel(\"count\")\n", | |
"plt.pyplot.title(\"horsepower bins\")\n", | |
"\n", | |
"'''\n", | |
"%matplotlib inline\n", | |
"import matplotlib as plt\n", | |
"from matplotlib import pyplot\n", | |
"plt.pyplot.hist(df[\"horsepower\"])\n", | |
"# set x/y labels and plot title\n", | |
"plt.pyplot.xlabel('horsepower')\n", | |
"plt.pyplot.ylabel('count')\n", | |
"plt.pyplot.title('horsepower-bins')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 37, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>symboling</th>\n", | |
" <th>normalized-losses</th>\n", | |
" <th>make</th>\n", | |
" <th>fuel-type</th>\n", | |
" <th>aspiration</th>\n", | |
" <th>num-of-doors</th>\n", | |
" <th>body-style</th>\n", | |
" <th>drive-wheels</th>\n", | |
" <th>engine-location</th>\n", | |
" <th>wheel-base</th>\n", | |
" <th>...</th>\n", | |
" <th>engine-size</th>\n", | |
" <th>fuel-system</th>\n", | |
" <th>bore</th>\n", | |
" <th>stroke</th>\n", | |
" <th>compression-ratio</th>\n", | |
" <th>horsepower</th>\n", | |
" <th>peak-rpm</th>\n", | |
" <th>city-mpg</th>\n", | |
" <th>highway-mpg</th>\n", | |
" <th>price</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>3</td>\n", | |
" <td>122</td>\n", | |
" <td>alfa-romero</td>\n", | |
" <td>gas</td>\n", | |
" <td>std</td>\n", | |
" <td>two</td>\n", | |
" <td>convertible</td>\n", | |
" <td>rwd</td>\n", | |
" <td>front</td>\n", | |
" <td>88.6</td>\n", | |
" <td>...</td>\n", | |
" <td>130</td>\n", | |
" <td>mpfi</td>\n", | |
" <td>3.47</td>\n", | |
" <td>2.68</td>\n", | |
" <td>9.0</td>\n", | |
" <td>111</td>\n", | |
" <td>5000.0</td>\n", | |
" <td>21</td>\n", | |
" <td>27</td>\n", | |
" <td>13495.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>3</td>\n", | |
" <td>122</td>\n", | |
" <td>alfa-romero</td>\n", | |
" <td>gas</td>\n", | |
" <td>std</td>\n", | |
" <td>two</td>\n", | |
" <td>convertible</td>\n", | |
" <td>rwd</td>\n", | |
" <td>front</td>\n", | |
" <td>88.6</td>\n", | |
" <td>...</td>\n", | |
" <td>130</td>\n", | |
" <td>mpfi</td>\n", | |
" <td>3.47</td>\n", | |
" <td>2.68</td>\n", | |
" <td>9.0</td>\n", | |
" <td>111</td>\n", | |
" <td>5000.0</td>\n", | |
" <td>21</td>\n", | |
" <td>27</td>\n", | |
" <td>16500.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>1</td>\n", | |
" <td>122</td>\n", | |
" <td>alfa-romero</td>\n", | |
" <td>gas</td>\n", | |
" <td>std</td>\n", | |
" <td>two</td>\n", | |
" <td>hatchback</td>\n", | |
" <td>rwd</td>\n", | |
" <td>front</td>\n", | |
" <td>94.5</td>\n", | |
" <td>...</td>\n", | |
" <td>152</td>\n", | |
" <td>mpfi</td>\n", | |
" <td>2.68</td>\n", | |
" <td>3.47</td>\n", | |
" <td>9.0</td>\n", | |
" <td>154</td>\n", | |
" <td>5000.0</td>\n", | |
" <td>19</td>\n", | |
" <td>26</td>\n", | |
" <td>16500.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>2</td>\n", | |
" <td>164</td>\n", | |
" <td>audi</td>\n", | |
" <td>gas</td>\n", | |
" <td>std</td>\n", | |
" <td>four</td>\n", | |
" <td>sedan</td>\n", | |
" <td>fwd</td>\n", | |
" <td>front</td>\n", | |
" <td>99.8</td>\n", | |
" <td>...</td>\n", | |
" <td>109</td>\n", | |
" <td>mpfi</td>\n", | |
" <td>3.19</td>\n", | |
" <td>3.40</td>\n", | |
" <td>10.0</td>\n", | |
" <td>102</td>\n", | |
" <td>5500.0</td>\n", | |
" <td>24</td>\n", | |
" <td>30</td>\n", | |
" <td>13950.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>2</td>\n", | |
" <td>164</td>\n", | |
" <td>audi</td>\n", | |
" <td>gas</td>\n", | |
" <td>std</td>\n", | |
" <td>four</td>\n", | |
" <td>sedan</td>\n", | |
" <td>4wd</td>\n", | |
" <td>front</td>\n", | |
" <td>99.4</td>\n", | |
" <td>...</td>\n", | |
" <td>136</td>\n", | |
" <td>mpfi</td>\n", | |
" <td>3.19</td>\n", | |
" <td>3.40</td>\n", | |
" <td>8.0</td>\n", | |
" <td>115</td>\n", | |
" <td>5500.0</td>\n", | |
" <td>18</td>\n", | |
" <td>22</td>\n", | |
" <td>17450.0</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>5 rows × 26 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" symboling normalized-losses make fuel-type aspiration \\\n", | |
"0 3 122 alfa-romero gas std \n", | |
"1 3 122 alfa-romero gas std \n", | |
"2 1 122 alfa-romero gas std \n", | |
"3 2 164 audi gas std \n", | |
"4 2 164 audi gas std \n", | |
"\n", | |
" num-of-doors body-style drive-wheels engine-location wheel-base ... \\\n", | |
"0 two convertible rwd front 88.6 ... \n", | |
"1 two convertible rwd front 88.6 ... \n", | |
"2 two hatchback rwd front 94.5 ... \n", | |
"3 four sedan fwd front 99.8 ... \n", | |
"4 four sedan 4wd front 99.4 ... \n", | |
"\n", | |
" engine-size fuel-system bore stroke compression-ratio horsepower \\\n", | |
"0 130 mpfi 3.47 2.68 9.0 111 \n", | |
"1 130 mpfi 3.47 2.68 9.0 111 \n", | |
"2 152 mpfi 2.68 3.47 9.0 154 \n", | |
"3 109 mpfi 3.19 3.40 10.0 102 \n", | |
"4 136 mpfi 3.19 3.40 8.0 115 \n", | |
"\n", | |
" peak-rpm city-mpg highway-mpg price \n", | |
"0 5000.0 21 27 13495.0 \n", | |
"1 5000.0 21 27 16500.0 \n", | |
"2 5000.0 19 26 16500.0 \n", | |
"3 5500.0 24 30 13950.0 \n", | |
"4 5500.0 18 22 17450.0 \n", | |
"\n", | |
"[5 rows x 26 columns]" | |
] | |
}, | |
"execution_count": 37, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# step next \n", | |
"# Data Standardization of our data sets\n", | |
"\n", | |
"df.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 38, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>symboling</th>\n", | |
" <th>normalized-losses</th>\n", | |
" <th>make</th>\n", | |
" <th>fuel-type</th>\n", | |
" <th>aspiration</th>\n", | |
" <th>num-of-doors</th>\n", | |
" <th>body-style</th>\n", | |
" <th>drive-wheels</th>\n", | |
" <th>engine-location</th>\n", | |
" <th>wheel-base</th>\n", | |
" <th>...</th>\n", | |
" <th>fuel-system</th>\n", | |
" <th>bore</th>\n", | |
" <th>stroke</th>\n", | |
" <th>compression-ratio</th>\n", | |
" <th>horsepower</th>\n", | |
" <th>peak-rpm</th>\n", | |
" <th>city-mpg</th>\n", | |
" <th>highway-mpg</th>\n", | |
" <th>price</th>\n", | |
" <th>city-L/100km</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>3</td>\n", | |
" <td>122</td>\n", | |
" <td>alfa-romero</td>\n", | |
" <td>gas</td>\n", | |
" <td>std</td>\n", | |
" <td>two</td>\n", | |
" <td>convertible</td>\n", | |
" <td>rwd</td>\n", | |
" <td>front</td>\n", | |
" <td>88.6</td>\n", | |
" <td>...</td>\n", | |
" <td>mpfi</td>\n", | |
" <td>3.47</td>\n", | |
" <td>2.68</td>\n", | |
" <td>9.0</td>\n", | |
" <td>111</td>\n", | |
" <td>5000.0</td>\n", | |
" <td>21</td>\n", | |
" <td>27</td>\n", | |
" <td>13495.0</td>\n", | |
" <td>11.190476</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>3</td>\n", | |
" <td>122</td>\n", | |
" <td>alfa-romero</td>\n", | |
" <td>gas</td>\n", | |
" <td>std</td>\n", | |
" <td>two</td>\n", | |
" <td>convertible</td>\n", | |
" <td>rwd</td>\n", | |
" <td>front</td>\n", | |
" <td>88.6</td>\n", | |
" <td>...</td>\n", | |
" <td>mpfi</td>\n", | |
" <td>3.47</td>\n", | |
" <td>2.68</td>\n", | |
" <td>9.0</td>\n", | |
" <td>111</td>\n", | |
" <td>5000.0</td>\n", | |
" <td>21</td>\n", | |
" <td>27</td>\n", | |
" <td>16500.0</td>\n", | |
" <td>11.190476</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>1</td>\n", | |
" <td>122</td>\n", | |
" <td>alfa-romero</td>\n", | |
" <td>gas</td>\n", | |
" <td>std</td>\n", | |
" <td>two</td>\n", | |
" <td>hatchback</td>\n", | |
" <td>rwd</td>\n", | |
" <td>front</td>\n", | |
" <td>94.5</td>\n", | |
" <td>...</td>\n", | |
" <td>mpfi</td>\n", | |
" <td>2.68</td>\n", | |
" <td>3.47</td>\n", | |
" <td>9.0</td>\n", | |
" <td>154</td>\n", | |
" <td>5000.0</td>\n", | |
" <td>19</td>\n", | |
" <td>26</td>\n", | |
" <td>16500.0</td>\n", | |
" <td>12.368421</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>2</td>\n", | |
" <td>164</td>\n", | |
" <td>audi</td>\n", | |
" <td>gas</td>\n", | |
" <td>std</td>\n", | |
" <td>four</td>\n", | |
" <td>sedan</td>\n", | |
" <td>fwd</td>\n", | |
" <td>front</td>\n", | |
" <td>99.8</td>\n", | |
" <td>...</td>\n", | |
" <td>mpfi</td>\n", | |
" <td>3.19</td>\n", | |
" <td>3.40</td>\n", | |
" <td>10.0</td>\n", | |
" <td>102</td>\n", | |
" <td>5500.0</td>\n", | |
" <td>24</td>\n", | |
" <td>30</td>\n", | |
" <td>13950.0</td>\n", | |
" <td>9.791667</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>2</td>\n", | |
" <td>164</td>\n", | |
" <td>audi</td>\n", | |
" <td>gas</td>\n", | |
" <td>std</td>\n", | |
" <td>four</td>\n", | |
" <td>sedan</td>\n", | |
" <td>4wd</td>\n", | |
" <td>front</td>\n", | |
" <td>99.4</td>\n", | |
" <td>...</td>\n", | |
" <td>mpfi</td>\n", | |
" <td>3.19</td>\n", | |
" <td>3.40</td>\n", | |
" <td>8.0</td>\n", | |
" <td>115</td>\n", | |
" <td>5500.0</td>\n", | |
" <td>18</td>\n", | |
" <td>22</td>\n", | |
" <td>17450.0</td>\n", | |
" <td>13.055556</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>5 rows × 27 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" symboling normalized-losses make fuel-type aspiration \\\n", | |
"0 3 122 alfa-romero gas std \n", | |
"1 3 122 alfa-romero gas std \n", | |
"2 1 122 alfa-romero gas std \n", | |
"3 2 164 audi gas std \n", | |
"4 2 164 audi gas std \n", | |
"\n", | |
" num-of-doors body-style drive-wheels engine-location wheel-base ... \\\n", | |
"0 two convertible rwd front 88.6 ... \n", | |
"1 two convertible rwd front 88.6 ... \n", | |
"2 two hatchback rwd front 94.5 ... \n", | |
"3 four sedan fwd front 99.8 ... \n", | |
"4 four sedan 4wd front 99.4 ... \n", | |
"\n", | |
" fuel-system bore stroke compression-ratio horsepower peak-rpm city-mpg \\\n", | |
"0 mpfi 3.47 2.68 9.0 111 5000.0 21 \n", | |
"1 mpfi 3.47 2.68 9.0 111 5000.0 21 \n", | |
"2 mpfi 2.68 3.47 9.0 154 5000.0 19 \n", | |
"3 mpfi 3.19 3.40 10.0 102 5500.0 24 \n", | |
"4 mpfi 3.19 3.40 8.0 115 5500.0 18 \n", | |
"\n", | |
" highway-mpg price city-L/100km \n", | |
"0 27 13495.0 11.190476 \n", | |
"1 27 16500.0 11.190476 \n", | |
"2 26 16500.0 12.368421 \n", | |
"3 30 13950.0 9.791667 \n", | |
"4 22 17450.0 13.055556 \n", | |
"\n", | |
"[5 rows x 27 columns]" | |
] | |
}, | |
"execution_count": 38, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"'''What is Standardization?\n", | |
"\n", | |
"Standardization is the process of transforming data into a common format which allows the researcher to make the meaningful comparison.\n", | |
"\n", | |
"Example\n", | |
"\n", | |
"Transform mpg to L/100km:\n", | |
"\n", | |
"In our dataset, the fuel consumption columns \"city-mpg\" and \"highway-mpg\" are represented by mpg (miles per gallon) unit. Assume we are developing an application in a country that accept the fuel consumption with L/100km standard\n", | |
"\n", | |
"We will need to apply data transformation to transform mpg into L/100km?\n", | |
"\n", | |
"The formula for unit conversion is\n", | |
"\n", | |
"L/100km = 235 / mpg'''\n", | |
"\n", | |
"# Convert mpg to L/100km by mathematical operation (235 divided by mpg)\n", | |
"df['city-L/100km'] = 235/df[\"city-mpg\"]\n", | |
"\n", | |
"# check your transformed data \n", | |
"df.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 39, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>symboling</th>\n", | |
" <th>normalized-losses</th>\n", | |
" <th>make</th>\n", | |
" <th>fuel-type</th>\n", | |
" <th>aspiration</th>\n", | |
" <th>num-of-doors</th>\n", | |
" <th>body-style</th>\n", | |
" <th>drive-wheels</th>\n", | |
" <th>engine-location</th>\n", | |
" <th>wheel-base</th>\n", | |
" <th>...</th>\n", | |
" <th>bore</th>\n", | |
" <th>stroke</th>\n", | |
" <th>compression-ratio</th>\n", | |
" <th>horsepower</th>\n", | |
" <th>peak-rpm</th>\n", | |
" <th>city-mpg</th>\n", | |
" <th>highway-mpg</th>\n", | |
" <th>price</th>\n", | |
" <th>city-L/100km</th>\n", | |
" <th>highway-L/100km</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>3</td>\n", | |
" <td>122</td>\n", | |
" <td>alfa-romero</td>\n", | |
" <td>gas</td>\n", | |
" <td>std</td>\n", | |
" <td>two</td>\n", | |
" <td>convertible</td>\n", | |
" <td>rwd</td>\n", | |
" <td>front</td>\n", | |
" <td>88.6</td>\n", | |
" <td>...</td>\n", | |
" <td>3.47</td>\n", | |
" <td>2.68</td>\n", | |
" <td>9.0</td>\n", | |
" <td>111</td>\n", | |
" <td>5000.0</td>\n", | |
" <td>21</td>\n", | |
" <td>27</td>\n", | |
" <td>13495.0</td>\n", | |
" <td>11.190476</td>\n", | |
" <td>8.703704</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>3</td>\n", | |
" <td>122</td>\n", | |
" <td>alfa-romero</td>\n", | |
" <td>gas</td>\n", | |
" <td>std</td>\n", | |
" <td>two</td>\n", | |
" <td>convertible</td>\n", | |
" <td>rwd</td>\n", | |
" <td>front</td>\n", | |
" <td>88.6</td>\n", | |
" <td>...</td>\n", | |
" <td>3.47</td>\n", | |
" <td>2.68</td>\n", | |
" <td>9.0</td>\n", | |
" <td>111</td>\n", | |
" <td>5000.0</td>\n", | |
" <td>21</td>\n", | |
" <td>27</td>\n", | |
" <td>16500.0</td>\n", | |
" <td>11.190476</td>\n", | |
" <td>8.703704</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>1</td>\n", | |
" <td>122</td>\n", | |
" <td>alfa-romero</td>\n", | |
" <td>gas</td>\n", | |
" <td>std</td>\n", | |
" <td>two</td>\n", | |
" <td>hatchback</td>\n", | |
" <td>rwd</td>\n", | |
" <td>front</td>\n", | |
" <td>94.5</td>\n", | |
" <td>...</td>\n", | |
" <td>2.68</td>\n", | |
" <td>3.47</td>\n", | |
" <td>9.0</td>\n", | |
" <td>154</td>\n", | |
" <td>5000.0</td>\n", | |
" <td>19</td>\n", | |
" <td>26</td>\n", | |
" <td>16500.0</td>\n", | |
" <td>12.368421</td>\n", | |
" <td>9.038462</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>2</td>\n", | |
" <td>164</td>\n", | |
" <td>audi</td>\n", | |
" <td>gas</td>\n", | |
" <td>std</td>\n", | |
" <td>four</td>\n", | |
" <td>sedan</td>\n", | |
" <td>fwd</td>\n", | |
" <td>front</td>\n", | |
" <td>99.8</td>\n", | |
" <td>...</td>\n", | |
" <td>3.19</td>\n", | |
" <td>3.40</td>\n", | |
" <td>10.0</td>\n", | |
" <td>102</td>\n", | |
" <td>5500.0</td>\n", | |
" <td>24</td>\n", | |
" <td>30</td>\n", | |
" <td>13950.0</td>\n", | |
" <td>9.791667</td>\n", | |
" <td>7.833333</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>2</td>\n", | |
" <td>164</td>\n", | |
" <td>audi</td>\n", | |
" <td>gas</td>\n", | |
" <td>std</td>\n", | |
" <td>four</td>\n", | |
" <td>sedan</td>\n", | |
" <td>4wd</td>\n", | |
" <td>front</td>\n", | |
" <td>99.4</td>\n", | |
" <td>...</td>\n", | |
" <td>3.19</td>\n", | |
" <td>3.40</td>\n", | |
" <td>8.0</td>\n", | |
" <td>115</td>\n", | |
" <td>5500.0</td>\n", | |
" <td>18</td>\n", | |
" <td>22</td>\n", | |
" <td>17450.0</td>\n", | |
" <td>13.055556</td>\n", | |
" <td>10.681818</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>5 rows × 28 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" symboling normalized-losses make fuel-type aspiration \\\n", | |
"0 3 122 alfa-romero gas std \n", | |
"1 3 122 alfa-romero gas std \n", | |
"2 1 122 alfa-romero gas std \n", | |
"3 2 164 audi gas std \n", | |
"4 2 164 audi gas std \n", | |
"\n", | |
" num-of-doors body-style drive-wheels engine-location wheel-base ... \\\n", | |
"0 two convertible rwd front 88.6 ... \n", | |
"1 two convertible rwd front 88.6 ... \n", | |
"2 two hatchback rwd front 94.5 ... \n", | |
"3 four sedan fwd front 99.8 ... \n", | |
"4 four sedan 4wd front 99.4 ... \n", | |
"\n", | |
" bore stroke compression-ratio horsepower peak-rpm city-mpg highway-mpg \\\n", | |
"0 3.47 2.68 9.0 111 5000.0 21 27 \n", | |
"1 3.47 2.68 9.0 111 5000.0 21 27 \n", | |
"2 2.68 3.47 9.0 154 5000.0 19 26 \n", | |
"3 3.19 3.40 10.0 102 5500.0 24 30 \n", | |
"4 3.19 3.40 8.0 115 5500.0 18 22 \n", | |
"\n", | |
" price city-L/100km highway-L/100km \n", | |
"0 13495.0 11.190476 8.703704 \n", | |
"1 16500.0 11.190476 8.703704 \n", | |
"2 16500.0 12.368421 9.038462 \n", | |
"3 13950.0 9.791667 7.833333 \n", | |
"4 17450.0 13.055556 10.681818 \n", | |
"\n", | |
"[5 rows x 28 columns]" | |
] | |
}, | |
"execution_count": 39, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"'''Question #2: \n", | |
"According to the example above, transform mpg to L/100km in the column of \"highway-mpg\", and change the name of column to \"highway-L/100km\"'''\n", | |
"\n", | |
"df[\"highway-L/100km\"]=235/df[\"highway-mpg\"]\n", | |
"df.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 40, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# step : Data Normalization\n", | |
"\n", | |
"'''Why normalization?\n", | |
"\n", | |
"Normalization is the process of transforming values of several variables into a similar range. Typical normalizations include scaling the variable so the variable average is 0, scaling the variable so the variance is 1, or scaling variable so the variable values range from 0 to 1\n", | |
"\n", | |
"Example\n", | |
"\n", | |
"To demonstrate normalization, let's say we want to scale the columns \"length\", \"width\" and \"height\"\n", | |
"\n", | |
"Target:would like to Normalize those variables so their value ranges from 0 to 1.\n", | |
"\n", | |
"Approach: replace original value by (original value)/(maximum value)'''\n", | |
"\n", | |
"# replace (original value) by (original value)/(maximum value)\n", | |
"df['length'] = df['length']/df['length'].max()\n", | |
"df['width'] = df['width']/df['width'].max()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 41, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>symboling</th>\n", | |
" <th>normalized-losses</th>\n", | |
" <th>make</th>\n", | |
" <th>fuel-type</th>\n", | |
" <th>aspiration</th>\n", | |
" <th>num-of-doors</th>\n", | |
" <th>body-style</th>\n", | |
" <th>drive-wheels</th>\n", | |
" <th>engine-location</th>\n", | |
" <th>wheel-base</th>\n", | |
" <th>...</th>\n", | |
" <th>bore</th>\n", | |
" <th>stroke</th>\n", | |
" <th>compression-ratio</th>\n", | |
" <th>horsepower</th>\n", | |
" <th>peak-rpm</th>\n", | |
" <th>city-mpg</th>\n", | |
" <th>highway-mpg</th>\n", | |
" <th>price</th>\n", | |
" <th>city-L/100km</th>\n", | |
" <th>highway-L/100km</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>3</td>\n", | |
" <td>122</td>\n", | |
" <td>alfa-romero</td>\n", | |
" <td>gas</td>\n", | |
" <td>std</td>\n", | |
" <td>two</td>\n", | |
" <td>convertible</td>\n", | |
" <td>rwd</td>\n", | |
" <td>front</td>\n", | |
" <td>88.6</td>\n", | |
" <td>...</td>\n", | |
" <td>3.47</td>\n", | |
" <td>2.68</td>\n", | |
" <td>9.0</td>\n", | |
" <td>111</td>\n", | |
" <td>5000.0</td>\n", | |
" <td>21</td>\n", | |
" <td>27</td>\n", | |
" <td>13495.0</td>\n", | |
" <td>11.190476</td>\n", | |
" <td>8.703704</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>3</td>\n", | |
" <td>122</td>\n", | |
" <td>alfa-romero</td>\n", | |
" <td>gas</td>\n", | |
" <td>std</td>\n", | |
" <td>two</td>\n", | |
" <td>convertible</td>\n", | |
" <td>rwd</td>\n", | |
" <td>front</td>\n", | |
" <td>88.6</td>\n", | |
" <td>...</td>\n", | |
" <td>3.47</td>\n", | |
" <td>2.68</td>\n", | |
" <td>9.0</td>\n", | |
" <td>111</td>\n", | |
" <td>5000.0</td>\n", | |
" <td>21</td>\n", | |
" <td>27</td>\n", | |
" <td>16500.0</td>\n", | |
" <td>11.190476</td>\n", | |
" <td>8.703704</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>1</td>\n", | |
" <td>122</td>\n", | |
" <td>alfa-romero</td>\n", | |
" <td>gas</td>\n", | |
" <td>std</td>\n", | |
" <td>two</td>\n", | |
" <td>hatchback</td>\n", | |
" <td>rwd</td>\n", | |
" <td>front</td>\n", | |
" <td>94.5</td>\n", | |
" <td>...</td>\n", | |
" <td>2.68</td>\n", | |
" <td>3.47</td>\n", | |
" <td>9.0</td>\n", | |
" <td>154</td>\n", | |
" <td>5000.0</td>\n", | |
" <td>19</td>\n", | |
" <td>26</td>\n", | |
" <td>16500.0</td>\n", | |
" <td>12.368421</td>\n", | |
" <td>9.038462</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>2</td>\n", | |
" <td>164</td>\n", | |
" <td>audi</td>\n", | |
" <td>gas</td>\n", | |
" <td>std</td>\n", | |
" <td>four</td>\n", | |
" <td>sedan</td>\n", | |
" <td>fwd</td>\n", | |
" <td>front</td>\n", | |
" <td>99.8</td>\n", | |
" <td>...</td>\n", | |
" <td>3.19</td>\n", | |
" <td>3.40</td>\n", | |
" <td>10.0</td>\n", | |
" <td>102</td>\n", | |
" <td>5500.0</td>\n", | |
" <td>24</td>\n", | |
" <td>30</td>\n", | |
" <td>13950.0</td>\n", | |
" <td>9.791667</td>\n", | |
" <td>7.833333</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>2</td>\n", | |
" <td>164</td>\n", | |
" <td>audi</td>\n", | |
" <td>gas</td>\n", | |
" <td>std</td>\n", | |
" <td>four</td>\n", | |
" <td>sedan</td>\n", | |
" <td>4wd</td>\n", | |
" <td>front</td>\n", | |
" <td>99.4</td>\n", | |
" <td>...</td>\n", | |
" <td>3.19</td>\n", | |
" <td>3.40</td>\n", | |
" <td>8.0</td>\n", | |
" <td>115</td>\n", | |
" <td>5500.0</td>\n", | |
" <td>18</td>\n", | |
" <td>22</td>\n", | |
" <td>17450.0</td>\n", | |
" <td>13.055556</td>\n", | |
" <td>10.681818</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>5 rows × 28 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" symboling normalized-losses make fuel-type aspiration \\\n", | |
"0 3 122 alfa-romero gas std \n", | |
"1 3 122 alfa-romero gas std \n", | |
"2 1 122 alfa-romero gas std \n", | |
"3 2 164 audi gas std \n", | |
"4 2 164 audi gas std \n", | |
"\n", | |
" num-of-doors body-style drive-wheels engine-location wheel-base ... \\\n", | |
"0 two convertible rwd front 88.6 ... \n", | |
"1 two convertible rwd front 88.6 ... \n", | |
"2 two hatchback rwd front 94.5 ... \n", | |
"3 four sedan fwd front 99.8 ... \n", | |
"4 four sedan 4wd front 99.4 ... \n", | |
"\n", | |
" bore stroke compression-ratio horsepower peak-rpm city-mpg highway-mpg \\\n", | |
"0 3.47 2.68 9.0 111 5000.0 21 27 \n", | |
"1 3.47 2.68 9.0 111 5000.0 21 27 \n", | |
"2 2.68 3.47 9.0 154 5000.0 19 26 \n", | |
"3 3.19 3.40 10.0 102 5500.0 24 30 \n", | |
"4 3.19 3.40 8.0 115 5500.0 18 22 \n", | |
"\n", | |
" price city-L/100km highway-L/100km \n", | |
"0 13495.0 11.190476 8.703704 \n", | |
"1 16500.0 11.190476 8.703704 \n", | |
"2 16500.0 12.368421 9.038462 \n", | |
"3 13950.0 9.791667 7.833333 \n", | |
"4 17450.0 13.055556 10.681818 \n", | |
"\n", | |
"[5 rows x 28 columns]" | |
] | |
}, | |
"execution_count": 41, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 43, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>symboling</th>\n", | |
" <th>normalized-losses</th>\n", | |
" <th>make</th>\n", | |
" <th>fuel-type</th>\n", | |
" <th>aspiration</th>\n", | |
" <th>num-of-doors</th>\n", | |
" <th>body-style</th>\n", | |
" <th>drive-wheels</th>\n", | |
" <th>engine-location</th>\n", | |
" <th>wheel-base</th>\n", | |
" <th>...</th>\n", | |
" <th>bore</th>\n", | |
" <th>stroke</th>\n", | |
" <th>compression-ratio</th>\n", | |
" <th>horsepower</th>\n", | |
" <th>peak-rpm</th>\n", | |
" <th>city-mpg</th>\n", | |
" <th>highway-mpg</th>\n", | |
" <th>price</th>\n", | |
" <th>city-L/100km</th>\n", | |
" <th>highway-L/100km</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>3</td>\n", | |
" <td>122</td>\n", | |
" <td>alfa-romero</td>\n", | |
" <td>gas</td>\n", | |
" <td>std</td>\n", | |
" <td>two</td>\n", | |
" <td>convertible</td>\n", | |
" <td>rwd</td>\n", | |
" <td>front</td>\n", | |
" <td>88.6</td>\n", | |
" <td>...</td>\n", | |
" <td>3.47</td>\n", | |
" <td>2.68</td>\n", | |
" <td>9.0</td>\n", | |
" <td>111</td>\n", | |
" <td>5000.0</td>\n", | |
" <td>21</td>\n", | |
" <td>27</td>\n", | |
" <td>13495.0</td>\n", | |
" <td>11.190476</td>\n", | |
" <td>8.703704</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>3</td>\n", | |
" <td>122</td>\n", | |
" <td>alfa-romero</td>\n", | |
" <td>gas</td>\n", | |
" <td>std</td>\n", | |
" <td>two</td>\n", | |
" <td>convertible</td>\n", | |
" <td>rwd</td>\n", | |
" <td>front</td>\n", | |
" <td>88.6</td>\n", | |
" <td>...</td>\n", | |
" <td>3.47</td>\n", | |
" <td>2.68</td>\n", | |
" <td>9.0</td>\n", | |
" <td>111</td>\n", | |
" <td>5000.0</td>\n", | |
" <td>21</td>\n", | |
" <td>27</td>\n", | |
" <td>16500.0</td>\n", | |
" <td>11.190476</td>\n", | |
" <td>8.703704</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>1</td>\n", | |
" <td>122</td>\n", | |
" <td>alfa-romero</td>\n", | |
" <td>gas</td>\n", | |
" <td>std</td>\n", | |
" <td>two</td>\n", | |
" <td>hatchback</td>\n", | |
" <td>rwd</td>\n", | |
" <td>front</td>\n", | |
" <td>94.5</td>\n", | |
" <td>...</td>\n", | |
" <td>2.68</td>\n", | |
" <td>3.47</td>\n", | |
" <td>9.0</td>\n", | |
" <td>154</td>\n", | |
" <td>5000.0</td>\n", | |
" <td>19</td>\n", | |
" <td>26</td>\n", | |
" <td>16500.0</td>\n", | |
" <td>12.368421</td>\n", | |
" <td>9.038462</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>2</td>\n", | |
" <td>164</td>\n", | |
" <td>audi</td>\n", | |
" <td>gas</td>\n", | |
" <td>std</td>\n", | |
" <td>four</td>\n", | |
" <td>sedan</td>\n", | |
" <td>fwd</td>\n", | |
" <td>front</td>\n", | |
" <td>99.8</td>\n", | |
" <td>...</td>\n", | |
" <td>3.19</td>\n", | |
" <td>3.40</td>\n", | |
" <td>10.0</td>\n", | |
" <td>102</td>\n", | |
" <td>5500.0</td>\n", | |
" <td>24</td>\n", | |
" <td>30</td>\n", | |
" <td>13950.0</td>\n", | |
" <td>9.791667</td>\n", | |
" <td>7.833333</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>2</td>\n", | |
" <td>164</td>\n", | |
" <td>audi</td>\n", | |
" <td>gas</td>\n", | |
" <td>std</td>\n", | |
" <td>four</td>\n", | |
" <td>sedan</td>\n", | |
" <td>4wd</td>\n", | |
" <td>front</td>\n", | |
" <td>99.4</td>\n", | |
" <td>...</td>\n", | |
" <td>3.19</td>\n", | |
" <td>3.40</td>\n", | |
" <td>8.0</td>\n", | |
" <td>115</td>\n", | |
" <td>5500.0</td>\n", | |
" <td>18</td>\n", | |
" <td>22</td>\n", | |
" <td>17450.0</td>\n", | |
" <td>13.055556</td>\n", | |
" <td>10.681818</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>5 rows × 28 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" symboling normalized-losses make fuel-type aspiration \\\n", | |
"0 3 122 alfa-romero gas std \n", | |
"1 3 122 alfa-romero gas std \n", | |
"2 1 122 alfa-romero gas std \n", | |
"3 2 164 audi gas std \n", | |
"4 2 164 audi gas std \n", | |
"\n", | |
" num-of-doors body-style drive-wheels engine-location wheel-base ... \\\n", | |
"0 two convertible rwd front 88.6 ... \n", | |
"1 two convertible rwd front 88.6 ... \n", | |
"2 two hatchback rwd front 94.5 ... \n", | |
"3 four sedan fwd front 99.8 ... \n", | |
"4 four sedan 4wd front 99.4 ... \n", | |
"\n", | |
" bore stroke compression-ratio horsepower peak-rpm city-mpg highway-mpg \\\n", | |
"0 3.47 2.68 9.0 111 5000.0 21 27 \n", | |
"1 3.47 2.68 9.0 111 5000.0 21 27 \n", | |
"2 2.68 3.47 9.0 154 5000.0 19 26 \n", | |
"3 3.19 3.40 10.0 102 5500.0 24 30 \n", | |
"4 3.19 3.40 8.0 115 5500.0 18 22 \n", | |
"\n", | |
" price city-L/100km highway-L/100km \n", | |
"0 13495.0 11.190476 8.703704 \n", | |
"1 16500.0 11.190476 8.703704 \n", | |
"2 16500.0 12.368421 9.038462 \n", | |
"3 13950.0 9.791667 7.833333 \n", | |
"4 17450.0 13.055556 10.681818 \n", | |
"\n", | |
"[5 rows x 28 columns]" | |
] | |
}, | |
"execution_count": 43, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"#Question #3: \n", | |
"#According to the example above, normalize the column \"height\".\n", | |
"\n", | |
"df[\"height\"]=df[\"height\"]/df[\"height\"].max()\n", | |
"df.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 44, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# step : Binning\n", | |
"'''Why binning?\n", | |
"Binning is a process of transforming continuous numerical variables into discrete categorical 'bins', for grouped analysis.\n", | |
"\n", | |
"Example:\n", | |
"\n", | |
"In our dataset, \"horsepower\" is a real-valued variable ranging from 48 to 262, and it has 57 unique values. What if we only care about the price difference between cars with high horsepower, medium horsepower, and little horsepower (3 types)? Can we rearrange them into three 'bins' to simplify analysis?\n", | |
"\n", | |
"We will use the Pandas method 'cut' to segment the 'horsepower' column into 3 bins\n", | |
"\n", | |
"Example of Binning Data In Pandas\n", | |
"Convert data to correct format'''\n", | |
"\n", | |
"df[\"horsepower\"]=df[\"horsepower\"].astype(int, copy=True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 47, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>symboling</th>\n", | |
" <th>normalized-losses</th>\n", | |
" <th>make</th>\n", | |
" <th>fuel-type</th>\n", | |
" <th>aspiration</th>\n", | |
" <th>num-of-doors</th>\n", | |
" <th>body-style</th>\n", | |
" <th>drive-wheels</th>\n", | |
" <th>engine-location</th>\n", | |
" <th>wheel-base</th>\n", | |
" <th>...</th>\n", | |
" <th>bore</th>\n", | |
" <th>stroke</th>\n", | |
" <th>compression-ratio</th>\n", | |
" <th>horsepower</th>\n", | |
" <th>peak-rpm</th>\n", | |
" <th>city-mpg</th>\n", | |
" <th>highway-mpg</th>\n", | |
" <th>price</th>\n", | |
" <th>city-L/100km</th>\n", | |
" <th>highway-L/100km</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>3</td>\n", | |
" <td>122</td>\n", | |
" <td>alfa-romero</td>\n", | |
" <td>gas</td>\n", | |
" <td>std</td>\n", | |
" <td>two</td>\n", | |
" <td>convertible</td>\n", | |
" <td>rwd</td>\n", | |
" <td>front</td>\n", | |
" <td>88.6</td>\n", | |
" <td>...</td>\n", | |
" <td>3.47</td>\n", | |
" <td>2.68</td>\n", | |
" <td>9.0</td>\n", | |
" <td>111</td>\n", | |
" <td>5000.0</td>\n", | |
" <td>21</td>\n", | |
" <td>27</td>\n", | |
" <td>13495.0</td>\n", | |
" <td>11.190476</td>\n", | |
" <td>8.703704</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>3</td>\n", | |
" <td>122</td>\n", | |
" <td>alfa-romero</td>\n", | |
" <td>gas</td>\n", | |
" <td>std</td>\n", | |
" <td>two</td>\n", | |
" <td>convertible</td>\n", | |
" <td>rwd</td>\n", | |
" <td>front</td>\n", | |
" <td>88.6</td>\n", | |
" <td>...</td>\n", | |
" <td>3.47</td>\n", | |
" <td>2.68</td>\n", | |
" <td>9.0</td>\n", | |
" <td>111</td>\n", | |
" <td>5000.0</td>\n", | |
" <td>21</td>\n", | |
" <td>27</td>\n", | |
" <td>16500.0</td>\n", | |
" <td>11.190476</td>\n", | |
" <td>8.703704</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>1</td>\n", | |
" <td>122</td>\n", | |
" <td>alfa-romero</td>\n", | |
" <td>gas</td>\n", | |
" <td>std</td>\n", | |
" <td>two</td>\n", | |
" <td>hatchback</td>\n", | |
" <td>rwd</td>\n", | |
" <td>front</td>\n", | |
" <td>94.5</td>\n", | |
" <td>...</td>\n", | |
" <td>2.68</td>\n", | |
" <td>3.47</td>\n", | |
" <td>9.0</td>\n", | |
" <td>154</td>\n", | |
" <td>5000.0</td>\n", | |
" <td>19</td>\n", | |
" <td>26</td>\n", | |
" <td>16500.0</td>\n", | |
" <td>12.368421</td>\n", | |
" <td>9.038462</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>2</td>\n", | |
" <td>164</td>\n", | |
" <td>audi</td>\n", | |
" <td>gas</td>\n", | |
" <td>std</td>\n", | |
" <td>four</td>\n", | |
" <td>sedan</td>\n", | |
" <td>fwd</td>\n", | |
" <td>front</td>\n", | |
" <td>99.8</td>\n", | |
" <td>...</td>\n", | |
" <td>3.19</td>\n", | |
" <td>3.40</td>\n", | |
" <td>10.0</td>\n", | |
" <td>102</td>\n", | |
" <td>5500.0</td>\n", | |
" <td>24</td>\n", | |
" <td>30</td>\n", | |
" <td>13950.0</td>\n", | |
" <td>9.791667</td>\n", | |
" <td>7.833333</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>2</td>\n", | |
" <td>164</td>\n", | |
" <td>audi</td>\n", | |
" <td>gas</td>\n", | |
" <td>std</td>\n", | |
" <td>four</td>\n", | |
" <td>sedan</td>\n", | |
" <td>4wd</td>\n", | |
" <td>front</td>\n", | |
" <td>99.4</td>\n", | |
" <td>...</td>\n", | |
" <td>3.19</td>\n", | |
" <td>3.40</td>\n", | |
" <td>8.0</td>\n", | |
" <td>115</td>\n", | |
" <td>5500.0</td>\n", | |
" <td>18</td>\n", | |
" <td>22</td>\n", | |
" <td>17450.0</td>\n", | |
" <td>13.055556</td>\n", | |
" <td>10.681818</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>5 rows × 28 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" symboling normalized-losses make fuel-type aspiration \\\n", | |
"0 3 122 alfa-romero gas std \n", | |
"1 3 122 alfa-romero gas std \n", | |
"2 1 122 alfa-romero gas std \n", | |
"3 2 164 audi gas std \n", | |
"4 2 164 audi gas std \n", | |
"\n", | |
" num-of-doors body-style drive-wheels engine-location wheel-base ... \\\n", | |
"0 two convertible rwd front 88.6 ... \n", | |
"1 two convertible rwd front 88.6 ... \n", | |
"2 two hatchback rwd front 94.5 ... \n", | |
"3 four sedan fwd front 99.8 ... \n", | |
"4 four sedan 4wd front 99.4 ... \n", | |
"\n", | |
" bore stroke compression-ratio horsepower peak-rpm city-mpg highway-mpg \\\n", | |
"0 3.47 2.68 9.0 111 5000.0 21 27 \n", | |
"1 3.47 2.68 9.0 111 5000.0 21 27 \n", | |
"2 2.68 3.47 9.0 154 5000.0 19 26 \n", | |
"3 3.19 3.40 10.0 102 5500.0 24 30 \n", | |
"4 3.19 3.40 8.0 115 5500.0 18 22 \n", | |
"\n", | |
" price city-L/100km highway-L/100km \n", | |
"0 13495.0 11.190476 8.703704 \n", | |
"1 16500.0 11.190476 8.703704 \n", | |
"2 16500.0 12.368421 9.038462 \n", | |
"3 13950.0 9.791667 7.833333 \n", | |
"4 17450.0 13.055556 10.681818 \n", | |
"\n", | |
"[5 rows x 28 columns]" | |
] | |
}, | |
"execution_count": 47, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 48, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"Text(0.5, 1.0, 'horsepower bins')" | |
] | |
}, | |
"execution_count": 48, | |
"metadata": {}, | |
"output_type": "execute_result" | |
}, | |
{ | |
"data": { | |
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAX4AAAEWCAYAAABhffzLAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAVC0lEQVR4nO3de7SldX3f8feHAUXlHgYcwThIiStoDeoEtRSkEBXBZqiJF1awU0OCdqnRVU0yJk1q2piCMSbGusrCahjjldQg09il0lEGSRQdDCKIBoKDF0ZmuChoWxT89o/nN2Uzc87hAPOcfeb83q+19trPdT/f/Zs9n/07z372b6eqkCT1Y49pFyBJWlgGvyR1xuCXpM4Y/JLUGYNfkjpj8EtSZwx+TVWSzUl+Ydp1LHZJTkzy7TnWn5fk9xayJu2+9px2AZIevqp61bRr0O7DHr+WhCRLphOzlJ6LFieDX4vBMUmuTvL9JB9Jsvf2FUl+PckNSW5Psj7J4ybWVZJXJ7keuD6DP02ytT3W1Ume0rZ9ZJK3JflmklvaqZFHtXUnJvl2kt9Jcms7/fQrE8fZP8n7kmxLclOSf59kj7bupiTPaNNntpqObvO/luRjbXqPJGuT/GOS25JcmOSgtm5l2++sJN8EPj1bQ81R4wVJ/nCH5/OG1hZbkrxiYttTk3w1yV1JvpPkjQ/nH0+7H4Nfi8FLgFOAI4CnAv8GIMlJwH9u61cANwEf3mHf04FnAkcDzwNOAH4GOAB4KXBb2+7ctvwY4J8AhwG/P/E4jwUObsvXAOcneVJb905gf+CJwHOAfw1sD9KNwIlt+gTgxrbN9vmNbfo3Wq3PAR4H3AG8a4fn8hzgZ4Hnz9BGD1TjTNvu37Y9C3hXkgPbuvcAr6yqfYGnMMcbjZaoqvLmbWo3YDNw5sT8W4Hz2vR7gLdOrNsH+DGwss0XcNLE+pOAfwCeBewxsTzAD4EjJ5Y9G/hGmz4RuAd4zMT6C4HfA5YBdwNHT6x7JXBpmz4LWN+mrwN+Dfhwm78JePrEupMnHmNFey57Aivbc3niHO00a41t+gLgDye2/T/AnhPbbgWe1aa/2Z7DftP+9/c2nZs9fi0G352Y/t8MAQ9Dz/im7Suq6gcMPfjDJrb/1sT6TwP/haEnfUuS85PsBywHHg1cmeR7Sb4HfKIt3+6OqvrhxPxN7fgHA4+YrKNNb69hI3B8kscyvEl8BDguyUqGHvdVbbsnABdNHP864F7g0Jmeyyxmq3Emt1XVPRPzk+36S8CpwE1JNiZ59gMcV0uMwa/F7GaGwAQgyWOAnwK+M7HN/YaXrao/r6pnAE9mOLXzm8CtDD3gJ1fVAe22f1XtM7Hrge3xt/vpdvxbGXrmT9hh3Xfa8W5gCNXfAC6rqrsY3sjOBi6vqp+0fb4FvGDi+AdU1d5VNetzmcFsNT4oVfXFqloNHAJ8jOEvB3XE4Ndi9kHgFUmOSfJI4I+AK6pq80wbJ/n5JM9MshfDqZ3/C9zbwvfdwJ8mOaRte1iSHc+l/0GSRyQ5Hngh8FdVdS9DML4lyb5JngD8O+D9E/ttBF7DfefzL91hHuC89hhPaMdfnmT1Q2iTnWp8MDu3fX8lyf5V9WPgToa/PNQRg1+LVlVtYDjP/lFgC3Ak8LI5dtmPIeDvYDgNchvwtrbut4EbgM8nuRP4X8DkB6PfbfvdDHwAeFVVfa2tey3DG8mNwOUMb0jvndh3I7AvcNks8wDvANYDn0pyF/B5hg+lH4y5anwwXg5sbu3wKuDMh/AY2o2lyh9iUd+SnAi8v6oOn3Yt0kKwxy9JnTH4JakznuqRpM6MOiZIks3AXQxXDdxTVava19Q/wvCllc3AS6rqjjHrkCTdZ9Qefwv+VVV168SytwK3V9U5SdYCB1bVb8/1OAcffHCtXLlytDolaSm68sorb62q5Tsun8YogKu5b2yTdQzXPM8Z/CtXrm
TTpk3jViVJS0ySm2ZaPvaHu8Vw3fKVSc5uyw6tqi0A7f6QmXZMcnaSTUk2bdu2beQyJakfY/f4j6uqm9u3JS9JMu8vm1TV+cD5AKtWrfITaEnaRUbt8VfVze1+K3ARcCzD4FkrANr91jFrkCTd32jBn+QxSfbdPs0wVvo1DF9bX9M2WwNcPFYNkqSdjXmq51CGYWi3H+eDVfWJJF8ELkxyFsO44C8esQZJ0g5GC/6quhH4uRmW3wacPNZxJUlzc8gGSeqMwS9JnTH4Jakz0/jmrka2cu3Hp3LczeecNpXjSnpw7PFLUmcMfknqjMEvSZ0x+CWpMwa/JHXG4Jekzhj8ktQZg1+SOmPwS1JnDH5J6ozBL0mdMfglqTMO0jaiaQ2WJklzsccvSZ0x+CWpMwa/JHVmyZ/j9zy7JN2fPX5J6ozBL0mdMfglqTMGvyR1xuCXpM4Y/JLUGYNfkjpj8EtSZwx+SeqMwS9JnTH4JakzBr8kdcbgl6TOjB78SZYl+fskf9PmD0pySZLr2/2BY9cgSbrPQvT4XwdcNzG/FthQVUcBG9q8JGmBjBr8SQ4HTgP+28Ti1cC6Nr0OOH3MGiRJ9zd2j//PgN8CfjKx7NCq2gLQ7g+ZacckZyfZlGTTtm3bRi5TkvoxWvAneSGwtaqufCj7V9X5VbWqqlYtX758F1cnSf0a86cXjwN+McmpwN7AfkneD9ySZEVVbUmyAtg6Yg2SpB2M1uOvqjdV1eFVtRJ4GfDpqjoTWA+saZutAS4eqwZJ0s6mcR3/OcBzk1wPPLfNS5IWyJinev6/qroUuLRN3wacvBDHlSTtzG/uSlJnDH5J6ozBL0mdMfglqTMGvyR1xuCXpM4Y/JLUGYNfkjpj8EtSZwx+SeqMwS9JnTH4JakzBr8kdcbgl6TOGPyS1BmDX5I6Y/BLUmcMfknqjMEvSZ0x+CWpMwa/JHXG4Jekzhj8ktQZg1+SOmPwS1JnDH5J6ozBL0mdMfglqTMGvyR1xuCXpM4Y/JLUGYNfkjpj8EtSZwx+SerMaMGfZO8kX0jy5STXJvmDtvygJJckub7dHzhWDZKknY3Z478bOKmqfg44BjglybOAtcCGqjoK2NDmJUkLZLTgr8EP2uxe7VbAamBdW74OOH2sGiRJOxv1HH+SZUmuArYCl1TVFcChVbUFoN0fMsu+ZyfZlGTTtm3bxixTkroyavBX1b1VdQxwOHBskqc8iH3Pr6pVVbVq+fLl4xUpSZ1ZkKt6qup7wKXAKcAtSVYAtPutC1GDJGkw5lU9y5Mc0KYfBfwC8DVgPbCmbbYGuHisGiRJO9tzxMdeAaxLsozhDebCqvqbJJ8DLkxyFvBN4MUj1iBJ2sFowV9VVwNPm2H5bcDJYx1XkjQ3v7krSZ0x+CWpMwa/JHXG4Jekzswr+JNsmM8ySdLiN+dVPUn2Bh4NHNxG0UxbtR/wuJFrkySN4IEu53wl8HqGkL+S+4L/TuBdI9YlSRrJnMFfVe8A3pHktVX1zgWqSZI0onl9gauq3pnknwErJ/epqveNVJckaSTzCv4kfwkcCVwF3NsWF2DwS9JuZr5DNqwCjq6qGrMYSdL45nsd/zXAY8csRJK0MObb4z8Y+GqSLzD8li4AVfWLo1QlSRrNfIP/zWMWIUlaOPO9qmfj2IVIkhbGfK/quYvhKh6ARwB7AT+sqv3GKkySNI759vj3nZxPcjpw7CgVSZJG9ZBG56yqjwEn7eJaJEkLYL6nel40MbsHw3X9XtMvSbuh+V7V8y8npu8BNgOrd3k1kqTRzfcc/yvGLkS7v5VrPz61Y28+57SpHVva3cz3h1gOT3JRkq1Jbkny0SSHj12cJGnXm++Hu38BrGcYl/8w4H+0ZZKk3cx8g395Vf1FVd3TbhcAy0esS5I0kvkG/61JzkyyrN3OBG4bszBJ0jjmG/y/CrwE+C6wBfhlwA98JWk3NN/LOf8TsKaq7gBIchDwNoY3BEnSbmS+Pf6nbg99gK
q6HXjaOCVJksY03+DfI8mB22daj3++fy1IkhaR+Yb3nwB/l+S/MwzV8BLgLaNVJUkazXy/ufu+JJsYBmYL8KKq+uqolUmSRjHv0zUt6A17SdrNPaRhmSVJuy+DX5I6M1rwJ3l8ks8kuS7JtUle15YflOSSJNe3+wMf6LEkSbvOmD3+e4A3VNXPAs8CXp3kaGAtsKGqjgI2tHlJ0gIZLfiraktVfalN3wVcxzCy52pgXdtsHXD6WDVIkna2IOf4k6xk+KbvFcChVbUFhjcH4JCFqEGSNBg9+JPsA3wUeH1V3fkg9js7yaYkm7Zt2zZegZLUmVGDP8leDKH/gar667b4liQr2voVwNaZ9q2q86tqVVWtWr7cof8laVcZ86qeAO8Brquqt0+sWg+sadNrgIvHqkGStLMxB1o7Dng58JUkV7VlvwOcA1yY5Czgm8CLR6xBkrSD0YK/qi5nGNdnJiePdVxJ0tz85q4kdcbgl6TOGPyS1BmDX5I6Y/BLUmcMfknqjMEvSZ0x+CWpMwa/JHXG4Jekzhj8ktQZg1+SOjPm6JzSkrdy7cenduzN55w2tWNr92aPX5I6Y/BLUmcMfknqjMEvSZ0x+CWpMwa/JHXG4Jekzngdv5aEaV5PL+1u7PFLUmcMfknqjMEvSZ0x+CWpMwa/JHXG4Jekzhj8ktQZg1+SOmPwS1JnDH5J6ozBL0mdMfglqTMGvyR1ZrTgT/LeJFuTXDOx7KAklyS5vt0fONbxJUkzG7PHfwFwyg7L1gIbquooYEOblyQtoNGCv6ouA27fYfFqYF2bXgecPtbxJUkzW+hz/IdW1RaAdn/IAh9fkrq3aD/cTXJ2kk1JNm3btm3a5UjSkrHQwX9LkhUA7X7rbBtW1flVtaqqVi1fvnzBCpSkpW6hg389sKZNrwEuXuDjS1L3xryc80PA54AnJfl2krOAc4DnJrkeeG6blyQtoD3HeuCqOmOWVSePdUxJ0gNbtB/uSpLGYfBLUmcMfknqjMEvSZ0x+CWpMwa/JHXG4Jekzhj8ktQZg1+SOmPwS1JnDH5J6ozBL0mdMfglqTMGvyR1xuCXpM4Y/JLUmdF+iEXSuFau/fhUjrv5nNOmclztOvb4JakzBr8kdcbgl6TOGPyS1Bk/3JX0oEzrQ2Xwg+VdxR6/JHXG4Jekzhj8ktQZg1+SOmPwS1JnDH5J6ozBL0mdMfglqTMGvyR1xuCXpM4Y/JLUGcfqkaQHsNTGJ7LHL0mdmUrwJzklydeT3JBk7TRqkKReLXjwJ1kGvAt4AXA0cEaSoxe6Dknq1TR6/McCN1TVjVX1I+DDwOop1CFJXZrGh7uHAd+amP828MwdN0pyNnB2m/1Bkq/vgmMfDNy6Cx5nKbJtZmfbzG5B2ybnLtSRdold0jYP8zk/YaaF0wj+zLCsdlpQdT5w/i49cLKpqlbtysdcKmyb2dk2s7NtZreY22Yap3q+DTx+Yv5w4OYp1CFJXZpG8H8ROCrJEUkeAbwMWD+FOiSpSwt+qqeq7knyGuCTwDLgvVV17QIdfpeeOlpibJvZ2Tazs21mt2jbJlU7nV6XJC1hfnNXkjpj8EtSZ5Zs8CfZnOQrSa5KsqktOyjJJUmub/cHTrvOhZLkvUm2JrlmYtms7ZHkTW1Ija8nef50ql4Ys7TNm5N8p71+rkpy6sS6ntrm8Uk+k+S6JNcmeV1b3v1rZ462WfyvnapakjdgM3DwDsveCqxt02uBc6dd5wK2xwnA04FrHqg9GIbS+DLwSOAI4B+BZdN+DgvcNm8G3jjDtr21zQrg6W16X+AfWht0/9qZo20W/Wtnyfb4Z7EaWNem1wGnT7GWBVVVlwG377B4tvZYDXy4qu6uqm8ANzAMtbEkzdI2s+mtbbZU1Zfa9F3AdQzfvu/+tTNH28xm0bTNUg7+Aj6V5Mo2/APAoVW1BYZ/NOCQqVW3OMzWHjMNqzHXC3qpek2Sq9upoO2nMrptmyQrgacBV+Br5352aBtY5K
+dpRz8x1XV0xlGAX11khOmXdBuZF7Daixx/xU4EjgG2AL8SVveZdsk2Qf4KPD6qrpzrk1nWLak22eGtln0r50lG/xVdXO73wpcxPAn1S1JVgC0+63Tq3BRmK09uh9Wo6puqap7q+onwLu570/y7tomyV4MwfaBqvrrttjXDjO3ze7w2lmSwZ/kMUn23T4NPA+4hmFoiDVtszXAxdOpcNGYrT3WAy9L8sgkRwBHAV+YQn1Tsz3Umn/F8PqBztomSYD3ANdV1dsnVnX/2pmtbXaL1860Pxkf6dP2JzJ8ev5l4Frgd9vynwI2ANe3+4OmXesCtsmHGP7s/DFDz+OsudoD+F2Gqw6+Drxg2vVPoW3+EvgKcDXDf9gVnbbNP2c4HXE1cFW7neprZ862WfSvHYdskKTOLMlTPZKk2Rn8ktQZg1+SOmPwS1JnDH5J6ozBryUlycrJUTYl7czgl5okC/5TpA/F7lKnFi+DX0vRsiTvbmOkfyrJo5Ick+TzbeCsi7YPnJXk0iR/lGQj8LokL05yTZIvJ7msbbMsyR8n+WLb/5Vt+YlJLmuP99Uk5yXZo607I8PvQVyT5Ny27CVJ3t6mX5fkxjZ9ZJLL2/Qzkmxsgwt+cmJYhPvVubDNqaXGnoOWoqOAM6rq15NcCPwS8FvAa6tqY5L/CPwH4PVt+wOq6jkASb4CPL+qvpPkgLb+LOD7VfXzSR4J/G2ST7V1xzKMs34T8AngRUn+DjgXeAZwB8MosacDlwG/2fY7HrgtyWEM3wD9bBv35Z3A6qraluSlwFuAX92xTunhMPi1FH2jqq5q01cyjJR4QFVtbMvWAX81sf1HJqb/FrigvWFsH5DsecBTk/xym9+f4c3lR8AXqmp7z/1DDCH+Y+DSqtrWln8AOKGqPpZknzaO1OOBDzL8CMzx7VhPAp4CXDIMA8MyhqEkZqpTesgMfi1Fd09M3wscMNuGzQ+3T1TVq5I8EzgNuCrJMQzD6b62qj45uVOSE9l5WN1i5uF3t/sc8AqGsVo+y9CbfzbwBuCngWur6tkPVKf0cHiOXz34PnBHkuPb/MuBjTNtmOTIqrqiqn4fuJWhZ/5J4N+2UzEk+Zk26ivAsUmOaOf2XwpczvBjHM9JcnCSZcAZE8e7DHhju/974F8Ad1fV9xneDJYneXY7zl5JnrzrmkEa2ONXL9YA5yV5NHAjQ697Jn+c5CiGXvsGhhFerwZWAl9qQ/Fu476fGvwccA7wTxnC/KKq+kmSNwGfaY/zP6tq+7DFn2V4M7msqu5N8i3gawBV9aN2OunPk+zP8P/zzxhGmJV2GUfnlB6idqrnjVX1wmnXIj0YnuqRpM7Y45ekztjjl6TOGPyS1BmDX5I6Y/BLUmcMfknqzP8DYRcBLtUxGX8AAAAASUVORK5CYII=\n", | |
"text/plain": [ | |
"<Figure size 432x288 with 1 Axes>" | |
] | |
}, | |
"metadata": { | |
"needs_background": "light" | |
}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"# Let's plot the histogram of horsepower to see what its distribution looks like.\n", | |
"%matplotlib inline\n", | |
"# Bind `plt` to pyplot itself. `import matplotlib as plt` would rebind the notebook-wide\n", | |
"# `plt` alias to the top-level package, which does not expose hist/xlabel/title directly.\n", | |
"import matplotlib.pyplot as plt\n", | |
"\n", | |
"plt.hist(df[\"horsepower\"])\n", | |
"\n", | |
"# set x/y labels and plot title\n", | |
"plt.xlabel(\"horsepower\")\n", | |
"plt.ylabel(\"count\")\n", | |
"plt.title(\"horsepower bins\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 49, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([ 48. , 119.33333333, 190.66666667, 262. ])" | |
] | |
}, | |
"execution_count": 49, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"'''We would like 3 bins of equal bandwidth, so we use numpy's linspace(start_value, end_value, numbers_generated) function.\n", | |
"\n", | |
"Since we want to include the minimum value of horsepower we want to set start_value=min(df[\"horsepower\"]).\n", | |
"\n", | |
"Since we want to include the maximum value of horsepower we want to set end_value=max(df[\"horsepower\"]).\n", | |
"\n", | |
"Since we are building 3 bins of equal length, there should be 4 dividers, so numbers_generated=4.\n", | |
"\n", | |
"We build a bin array, with a minimum value to a maximum value, with bandwidth calculated above. The bins will be\n", | |
"values used to determine when one bin ends and another begins.'''\n", | |
"\n", | |
"\n", | |
"bins = np.linspace(min(df[\"horsepower\"]), max(df[\"horsepower\"]), 4)\n", | |
"bins" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 50, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"group_names = ['Low', 'Medium', 'High']" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 51, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>horsepower</th>\n", | |
" <th>horsepower-binned</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>111</td>\n", | |
" <td>Low</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>111</td>\n", | |
" <td>Low</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>154</td>\n", | |
" <td>Medium</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>102</td>\n", | |
" <td>Low</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>115</td>\n", | |
" <td>Low</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>110</td>\n", | |
" <td>Low</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6</th>\n", | |
" <td>110</td>\n", | |
" <td>Low</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>7</th>\n", | |
" <td>110</td>\n", | |
" <td>Low</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>8</th>\n", | |
" <td>140</td>\n", | |
" <td>Medium</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>9</th>\n", | |
" <td>101</td>\n", | |
" <td>Low</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>10</th>\n", | |
" <td>101</td>\n", | |
" <td>Low</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>11</th>\n", | |
" <td>121</td>\n", | |
" <td>Medium</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>12</th>\n", | |
" <td>121</td>\n", | |
" <td>Medium</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>13</th>\n", | |
" <td>121</td>\n", | |
" <td>Medium</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>14</th>\n", | |
" <td>182</td>\n", | |
" <td>Medium</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>15</th>\n", | |
" <td>182</td>\n", | |
" <td>Medium</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>16</th>\n", | |
" <td>182</td>\n", | |
" <td>Medium</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>17</th>\n", | |
" <td>48</td>\n", | |
" <td>Low</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>18</th>\n", | |
" <td>70</td>\n", | |
" <td>Low</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>19</th>\n", | |
" <td>70</td>\n", | |
" <td>Low</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" horsepower horsepower-binned\n", | |
"0 111 Low\n", | |
"1 111 Low\n", | |
"2 154 Medium\n", | |
"3 102 Low\n", | |
"4 115 Low\n", | |
"5 110 Low\n", | |
"6 110 Low\n", | |
"7 110 Low\n", | |
"8 140 Medium\n", | |
"9 101 Low\n", | |
"10 101 Low\n", | |
"11 121 Medium\n", | |
"12 121 Medium\n", | |
"13 121 Medium\n", | |
"14 182 Medium\n", | |
"15 182 Medium\n", | |
"16 182 Medium\n", | |
"17 48 Low\n", | |
"18 70 Low\n", | |
"19 70 Low" | |
] | |
}, | |
"execution_count": 51, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# We apply the function \"cut\" to determine which bin each value of \"df['horsepower']\" belongs to.\n", | |
"\n", | |
"df['horsepower-binned'] = pd.cut(df['horsepower'], bins, labels=group_names, include_lowest=True )\n", | |
"df[['horsepower','horsepower-binned']].head(20) " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 52, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"Low 153\n", | |
"Medium 43\n", | |
"High 5\n", | |
"Name: horsepower-binned, dtype: int64" | |
] | |
}, | |
"execution_count": 52, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"#Let's see the number of vehicles in each bin.\n", | |
"\n", | |
"df[\"horsepower-binned\"].value_counts()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 54, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"Text(0.5, 1.0, 'horsepower bins')" | |
] | |
}, | |
"execution_count": 54, | |
"metadata": {}, | |
"output_type": "execute_result" | |
}, | |
{ | |
"data": { | |
"image/png": "\n", | |
"text/plain": [ | |
"<Figure size 432x288 with 1 Axes>" | |
] | |
}, | |
"metadata": { | |
"needs_background": "light" | |
}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"# Let's plot the number of vehicles in each bin.\n", | |
"\n", | |
"%matplotlib inline\n", | |
"# Bind `plt` to pyplot itself rather than to the top-level matplotlib package.\n", | |
"import matplotlib.pyplot as plt\n", | |
"\n", | |
"# value_counts() orders its result by frequency, not by bin. Align the counts with\n", | |
"# group_names explicitly so each bar is always drawn under its own label.\n", | |
"bin_counts = df[\"horsepower-binned\"].value_counts().reindex(group_names)\n", | |
"plt.bar(group_names, bin_counts)\n", | |
"\n", | |
"# set x/y labels and plot title\n", | |
"plt.xlabel(\"horsepower\")\n", | |
"plt.ylabel(\"count\")\n", | |
"plt.title(\"horsepower bins\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 55, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>symboling</th>\n", | |
" <th>normalized-losses</th>\n", | |
" <th>make</th>\n", | |
" <th>fuel-type</th>\n", | |
" <th>aspiration</th>\n", | |
" <th>num-of-doors</th>\n", | |
" <th>body-style</th>\n", | |
" <th>drive-wheels</th>\n", | |
" <th>engine-location</th>\n", | |
" <th>wheel-base</th>\n", | |
" <th>...</th>\n", | |
" <th>stroke</th>\n", | |
" <th>compression-ratio</th>\n", | |
" <th>horsepower</th>\n", | |
" <th>peak-rpm</th>\n", | |
" <th>city-mpg</th>\n", | |
" <th>highway-mpg</th>\n", | |
" <th>price</th>\n", | |
" <th>city-L/100km</th>\n", | |
" <th>highway-L/100km</th>\n", | |
" <th>horsepower-binned</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>3</td>\n", | |
" <td>122</td>\n", | |
" <td>alfa-romero</td>\n", | |
" <td>gas</td>\n", | |
" <td>std</td>\n", | |
" <td>two</td>\n", | |
" <td>convertible</td>\n", | |
" <td>rwd</td>\n", | |
" <td>front</td>\n", | |
" <td>88.6</td>\n", | |
" <td>...</td>\n", | |
" <td>2.68</td>\n", | |
" <td>9.0</td>\n", | |
" <td>111</td>\n", | |
" <td>5000.0</td>\n", | |
" <td>21</td>\n", | |
" <td>27</td>\n", | |
" <td>13495.0</td>\n", | |
" <td>11.190476</td>\n", | |
" <td>8.703704</td>\n", | |
" <td>Low</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>3</td>\n", | |
" <td>122</td>\n", | |
" <td>alfa-romero</td>\n", | |
" <td>gas</td>\n", | |
" <td>std</td>\n", | |
" <td>two</td>\n", | |
" <td>convertible</td>\n", | |
" <td>rwd</td>\n", | |
" <td>front</td>\n", | |
" <td>88.6</td>\n", | |
" <td>...</td>\n", | |
" <td>2.68</td>\n", | |
" <td>9.0</td>\n", | |
" <td>111</td>\n", | |
" <td>5000.0</td>\n", | |
" <td>21</td>\n", | |
" <td>27</td>\n", | |
" <td>16500.0</td>\n", | |
" <td>11.190476</td>\n", | |
" <td>8.703704</td>\n", | |
" <td>Low</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>1</td>\n", | |
" <td>122</td>\n", | |
" <td>alfa-romero</td>\n", | |
" <td>gas</td>\n", | |
" <td>std</td>\n", | |
" <td>two</td>\n", | |
" <td>hatchback</td>\n", | |
" <td>rwd</td>\n", | |
" <td>front</td>\n", | |
" <td>94.5</td>\n", | |
" <td>...</td>\n", | |
" <td>3.47</td>\n", | |
" <td>9.0</td>\n", | |
" <td>154</td>\n", | |
" <td>5000.0</td>\n", | |
" <td>19</td>\n", | |
" <td>26</td>\n", | |
" <td>16500.0</td>\n", | |
" <td>12.368421</td>\n", | |
" <td>9.038462</td>\n", | |
" <td>Medium</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>2</td>\n", | |
" <td>164</td>\n", | |
" <td>audi</td>\n", | |
" <td>gas</td>\n", | |
" <td>std</td>\n", | |
" <td>four</td>\n", | |
" <td>sedan</td>\n", | |
" <td>fwd</td>\n", | |
" <td>front</td>\n", | |
" <td>99.8</td>\n", | |
" <td>...</td>\n", | |
" <td>3.40</td>\n", | |
" <td>10.0</td>\n", | |
" <td>102</td>\n", | |
" <td>5500.0</td>\n", | |
" <td>24</td>\n", | |
" <td>30</td>\n", | |
" <td>13950.0</td>\n", | |
" <td>9.791667</td>\n", | |
" <td>7.833333</td>\n", | |
" <td>Low</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>2</td>\n", | |
" <td>164</td>\n", | |
" <td>audi</td>\n", | |
" <td>gas</td>\n", | |
" <td>std</td>\n", | |
" <td>four</td>\n", | |
" <td>sedan</td>\n", | |
" <td>4wd</td>\n", | |
" <td>front</td>\n", | |
" <td>99.4</td>\n", | |
" <td>...</td>\n", | |
" <td>3.40</td>\n", | |
" <td>8.0</td>\n", | |
" <td>115</td>\n", | |
" <td>5500.0</td>\n", | |
" <td>18</td>\n", | |
" <td>22</td>\n", | |
" <td>17450.0</td>\n", | |
" <td>13.055556</td>\n", | |
" <td>10.681818</td>\n", | |
" <td>Low</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>5 rows × 29 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" symboling normalized-losses make fuel-type aspiration \\\n", | |
"0 3 122 alfa-romero gas std \n", | |
"1 3 122 alfa-romero gas std \n", | |
"2 1 122 alfa-romero gas std \n", | |
"3 2 164 audi gas std \n", | |
"4 2 164 audi gas std \n", | |
"\n", | |
" num-of-doors body-style drive-wheels engine-location wheel-base ... \\\n", | |
"0 two convertible rwd front 88.6 ... \n", | |
"1 two convertible rwd front 88.6 ... \n", | |
"2 two hatchback rwd front 94.5 ... \n", | |
"3 four sedan fwd front 99.8 ... \n", | |
"4 four sedan 4wd front 99.4 ... \n", | |
"\n", | |
" stroke compression-ratio horsepower peak-rpm city-mpg highway-mpg \\\n", | |
"0 2.68 9.0 111 5000.0 21 27 \n", | |
"1 2.68 9.0 111 5000.0 21 27 \n", | |
"2 3.47 9.0 154 5000.0 19 26 \n", | |
"3 3.40 10.0 102 5500.0 24 30 \n", | |
"4 3.40 8.0 115 5500.0 18 22 \n", | |
"\n", | |
" price city-L/100km highway-L/100km horsepower-binned \n", | |
"0 13495.0 11.190476 8.703704 Low \n", | |
"1 16500.0 11.190476 8.703704 Low \n", | |
"2 16500.0 12.368421 9.038462 Medium \n", | |
"3 13950.0 9.791667 7.833333 Low \n", | |
"4 17450.0 13.055556 10.681818 Low \n", | |
"\n", | |
"[5 rows x 29 columns]" | |
] | |
}, | |
"execution_count": 55, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"#Check the dataframe below carefully; you will find the last column provides the bins for \"horsepower\" with 3 categories (\"Low\", \"Medium\" and \"High\").\n", | |
"\n", | |
"#We successfully narrowed the intervals from 57 to 3.\n", | |
"\n", | |
"df.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 56, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"Text(0.5, 1.0, 'horsepower bins')" | |
] | |
}, | |
"execution_count": 56, | |
"metadata": {}, | |
"output_type": "execute_result" | |
}, | |
{ | |
"data": { | |
"image/png": "\n", | |
"text/plain": [ | |
"<Figure size 432x288 with 1 Axes>" | |
] | |
}, | |
"metadata": { | |
"needs_background": "light" | |
}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"#Bins visualization\n", | |
"#Normally, a histogram is used to visualize the distribution of the bins we created above.\n", | |
"\n", | |
"%matplotlib inline\n", | |
"# Bind `plt` to pyplot itself rather than to the top-level matplotlib package.\n", | |
"import matplotlib.pyplot as plt\n", | |
"\n", | |
"# draw histogram of attribute \"horsepower\" with bins = 3\n", | |
"plt.hist(df[\"horsepower\"], bins=3)\n", | |
"\n", | |
"# set x/y labels and plot title\n", | |
"plt.xlabel(\"horsepower\")\n", | |
"plt.ylabel(\"count\")\n", | |
"plt.title(\"horsepower bins\")\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 57, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# The plot above shows the binning result for attribute \"horsepower\"" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 58, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"Index(['symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration',\n", | |
" 'num-of-doors', 'body-style', 'drive-wheels', 'engine-location',\n", | |
" 'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-type',\n", | |
" 'num-of-cylinders', 'engine-size', 'fuel-system', 'bore', 'stroke',\n", | |
" 'compression-ratio', 'horsepower', 'peak-rpm', 'city-mpg',\n", | |
" 'highway-mpg', 'price', 'city-L/100km', 'highway-L/100km',\n", | |
" 'horsepower-binned'],\n", | |
" dtype='object')" | |
] | |
}, | |
"execution_count": 58, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"'''Indicator variable (or dummy variable)\n", | |
"What is an indicator variable?\n", | |
"An indicator variable (or dummy variable) is a numerical variable used to label categories. They are called 'dummies' because the numbers themselves don't have inherent meaning.\n", | |
"\n", | |
"Why we use indicator variables?\n", | |
"\n", | |
"So we can use categorical variables for regression analysis in the later modules.\n", | |
"\n", | |
"Example\n", | |
"We see the column \"fuel-type\" has two unique values, \"gas\" or \"diesel\". Regression doesn't understand words, only numbers. To use this attribute in regression analysis, we convert \"fuel-type\" into indicator variables.\n", | |
"\n", | |
"We will use the panda's method 'get_dummies' to assign numerical values to different categories of fuel type.'''\n", | |
"\n", | |
"\n", | |
"df.columns" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 59, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#get indicator variables and assign it to data frame \"dummy_variable_1\"" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 61, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>diesel</th>\n", | |
" <th>gas</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" diesel gas\n", | |
"0 0 1\n", | |
"1 0 1\n", | |
"2 0 1\n", | |
"3 0 1\n", | |
"4 0 1" | |
] | |
}, | |
"execution_count": 61, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"dummy_variable_1 = pd.get_dummies(df[\"fuel-type\"])\n", | |
"dummy_variable_1.head()\n", | |
"#change column names for clarity" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 62, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>diesel</th>\n", | |
" <th>gas</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" diesel gas\n", | |
"0 0 1\n", | |
"1 0 1\n", | |
"2 0 1\n", | |
"3 0 1\n", | |
"4 0 1" | |
] | |
}, | |
"execution_count": 62, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"dummy_variable_1.rename(columns={'fuel-type-diesel':'gas', 'fuel-type-diesel':'diesel'}, inplace=True)\n", | |
"dummy_variable_1.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 63, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>symboling</th>\n", | |
" <th>normalized-losses</th>\n", | |
" <th>make</th>\n", | |
" <th>fuel-type</th>\n", | |
" <th>aspiration</th>\n", | |
" <th>num-of-doors</th>\n", | |
" <th>body-style</th>\n", | |
" <th>drive-wheels</th>\n", | |
" <th>engine-location</th>\n", | |
" <th>wheel-base</th>\n", | |
" <th>...</th>\n", | |
" <th>stroke</th>\n", | |
" <th>compression-ratio</th>\n", | |
" <th>horsepower</th>\n", | |
" <th>peak-rpm</th>\n", | |
" <th>city-mpg</th>\n", | |
" <th>highway-mpg</th>\n", | |
" <th>price</th>\n", | |
" <th>city-L/100km</th>\n", | |
" <th>highway-L/100km</th>\n", | |
" <th>horsepower-binned</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>3</td>\n", | |
" <td>122</td>\n", | |
" <td>alfa-romero</td>\n", | |
" <td>gas</td>\n", | |
" <td>std</td>\n", | |
" <td>two</td>\n", | |
" <td>convertible</td>\n", | |
" <td>rwd</td>\n", | |
" <td>front</td>\n", | |
" <td>88.6</td>\n", | |
" <td>...</td>\n", | |
" <td>2.68</td>\n", | |
" <td>9.0</td>\n", | |
" <td>111</td>\n", | |
" <td>5000.0</td>\n", | |
" <td>21</td>\n", | |
" <td>27</td>\n", | |
" <td>13495.0</td>\n", | |
" <td>11.190476</td>\n", | |
" <td>8.703704</td>\n", | |
" <td>Low</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>3</td>\n", | |
" <td>122</td>\n", | |
" <td>alfa-romero</td>\n", | |
" <td>gas</td>\n", | |
" <td>std</td>\n", | |
" <td>two</td>\n", | |
" <td>convertible</td>\n", | |
" <td>rwd</td>\n", | |
" <td>front</td>\n", | |
" <td>88.6</td>\n", | |
" <td>...</td>\n", | |
" <td>2.68</td>\n", | |
" <td>9.0</td>\n", | |
" <td>111</td>\n", | |
" <td>5000.0</td>\n", | |
" <td>21</td>\n", | |
" <td>27</td>\n", | |
" <td>16500.0</td>\n", | |
" <td>11.190476</td>\n", | |
" <td>8.703704</td>\n", | |
" <td>Low</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>1</td>\n", | |
" <td>122</td>\n", | |
" <td>alfa-romero</td>\n", | |
" <td>gas</td>\n", | |
" <td>std</td>\n", | |
" <td>two</td>\n", | |
" <td>hatchback</td>\n", | |
" <td>rwd</td>\n", | |
" <td>front</td>\n", | |
" <td>94.5</td>\n", | |
" <td>...</td>\n", | |
" <td>3.47</td>\n", | |
" <td>9.0</td>\n", | |
" <td>154</td>\n", | |
" <td>5000.0</td>\n", | |
" <td>19</td>\n", | |
" <td>26</td>\n", | |
" <td>16500.0</td>\n", | |
" <td>12.368421</td>\n", | |
" <td>9.038462</td>\n", | |
" <td>Medium</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>2</td>\n", | |
" <td>164</td>\n", | |
" <td>audi</td>\n", | |
" <td>gas</td>\n", | |
" <td>std</td>\n", | |
" <td>four</td>\n", | |
" <td>sedan</td>\n", | |
" <td>fwd</td>\n", | |
" <td>front</td>\n", | |
" <td>99.8</td>\n", | |
" <td>...</td>\n", | |
" <td>3.40</td>\n", | |
" <td>10.0</td>\n", | |
" <td>102</td>\n", | |
" <td>5500.0</td>\n", | |
" <td>24</td>\n", | |
" <td>30</td>\n", | |
" <td>13950.0</td>\n", | |
" <td>9.791667</td>\n", | |
" <td>7.833333</td>\n", | |
" <td>Low</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>2</td>\n", | |
" <td>164</td>\n", | |
" <td>audi</td>\n", | |
" <td>gas</td>\n", | |
" <td>std</td>\n", | |
" <td>four</td>\n", | |
" <td>sedan</td>\n", | |
" <td>4wd</td>\n", | |
" <td>front</td>\n", | |
" <td>99.4</td>\n", | |
" <td>...</td>\n", | |
" <td>3.40</td>\n", | |
" <td>8.0</td>\n", | |
" <td>115</td>\n", | |
" <td>5500.0</td>\n", | |
" <td>18</td>\n", | |
" <td>22</td>\n", | |
" <td>17450.0</td>\n", | |
" <td>13.055556</td>\n", | |
" <td>10.681818</td>\n", | |
" <td>Low</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>5 rows × 29 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" symboling normalized-losses make fuel-type aspiration \\\n", | |
"0 3 122 alfa-romero gas std \n", | |
"1 3 122 alfa-romero gas std \n", | |
"2 1 122 alfa-romero gas std \n", | |
"3 2 164 audi gas std \n", | |
"4 2 164 audi gas std \n", | |
"\n", | |
" num-of-doors body-style drive-wheels engine-location wheel-base ... \\\n", | |
"0 two convertible rwd front 88.6 ... \n", | |
"1 two convertible rwd front 88.6 ... \n", | |
"2 two hatchback rwd front 94.5 ... \n", | |
"3 four sedan fwd front 99.8 ... \n", | |
"4 four sedan 4wd front 99.4 ... \n", | |
"\n", | |
" stroke compression-ratio horsepower peak-rpm city-mpg highway-mpg \\\n", | |
"0 2.68 9.0 111 5000.0 21 27 \n", | |
"1 2.68 9.0 111 5000.0 21 27 \n", | |
"2 3.47 9.0 154 5000.0 19 26 \n", | |
"3 3.40 10.0 102 5500.0 24 30 \n", | |
"4 3.40 8.0 115 5500.0 18 22 \n", | |
"\n", | |
" price city-L/100km highway-L/100km horsepower-binned \n", | |
"0 13495.0 11.190476 8.703704 Low \n", | |
"1 16500.0 11.190476 8.703704 Low \n", | |
"2 16500.0 12.368421 9.038462 Medium \n", | |
"3 13950.0 9.791667 7.833333 Low \n", | |
"4 17450.0 13.055556 10.681818 Low \n", | |
"\n", | |
"[5 rows x 29 columns]" | |
] | |
}, | |
"execution_count": 63, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"#We now have the value 0 to represent \"gas\" and 1 to represent \"diesel\" in the column \"fuel-type\".\n", | |
"#We will now insert this column back into our original dataset.\n", | |
"\n", | |
"df.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 64, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# sav new csv file \n", | |
"df.to_csv('clean_df.csv')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python", | |
"language": "python", | |
"name": "conda-env-python-py" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.10" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 4 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment