Last active
January 6, 2023 08:58
-
-
Save erap129/7a9042f903dc0e65a6b1046789cc7682 to your computer and use it in GitHub Desktop.
Vehicles Chaining Demonstration
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"## Goal - Clean data for creation of miles_per_gallon predictor" | |
], | |
"metadata": { | |
"collapsed": false | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 25, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd\n", | |
"import numpy as np" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 26, | |
"outputs": [], | |
"source": [ | |
"# download dataset here:\n", | |
"# https://gist.github.com/erap129/a1f0fd46a4caf59d4ed470aeb4032f55/archive/42b1298300e7ca44f6ebbea6158e749aa24d03a2.zip\n", | |
"# original dataset URL:\n", | |
"# https://www.fueleconomy.gov/feg/download.shtml\n", | |
"autos = pd.read_csv('vehicles_mini.csv')" | |
], | |
"metadata": { | |
"collapsed": false | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 27, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": " make model year drive \\\n0 Alfa Romeo Spider Veloce 2000 1985 Rear-Wheel Drive \n1 Ferrari Testarossa 1985 Rear-Wheel Drive \n2 Dodge Charger 1985 Front-Wheel Drive \n3 Ford Focus Electric 2016 Front-Wheel Drive \n4 Tesla Model X AWD - 90D 2016 All-Wheel Drive \n... ... ... ... ... \n41139 Subaru Legacy 1993 Front-Wheel Drive \n41140 Subaru Legacy 1993 Front-Wheel Drive \n41141 Subaru Legacy AWD 1993 4-Wheel or All-Wheel Drive \n41142 Subaru Legacy AWD 1993 4-Wheel or All-Wheel Drive \n41143 Subaru Legacy AWD Turbo 1993 4-Wheel or All-Wheel Drive \n\n transmission cylinders fuelType miles_per_gallon \n0 Manual 5-spd 4.0 Regular 19 \n1 Manual 5-spd 12.0 Regular 9 \n2 Manual 5-spd 4.0 Regular 23 \n3 Automatic (A1) NaN Electricity 110 \n4 Automatic (A1) NaN Electricity 90 \n... ... ... ... ... \n41139 Automatic 4-spd 4.0 Regular 19 \n41140 Manual 5-spd 4.0 Regular 20 \n41141 Automatic 4-spd 4.0 Regular 18 \n41142 Manual 5-spd 4.0 Regular 18 \n41143 Automatic 4-spd 4.0 Premium 16 \n\n[41144 rows x 8 columns]", | |
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>make</th>\n <th>model</th>\n <th>year</th>\n <th>drive</th>\n <th>transmission</th>\n <th>cylinders</th>\n <th>fuelType</th>\n <th>miles_per_gallon</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>Alfa Romeo</td>\n <td>Spider Veloce 2000</td>\n <td>1985</td>\n <td>Rear-Wheel Drive</td>\n <td>Manual 5-spd</td>\n <td>4.0</td>\n <td>Regular</td>\n <td>19</td>\n </tr>\n <tr>\n <th>1</th>\n <td>Ferrari</td>\n <td>Testarossa</td>\n <td>1985</td>\n <td>Rear-Wheel Drive</td>\n <td>Manual 5-spd</td>\n <td>12.0</td>\n <td>Regular</td>\n <td>9</td>\n </tr>\n <tr>\n <th>2</th>\n <td>Dodge</td>\n <td>Charger</td>\n <td>1985</td>\n <td>Front-Wheel Drive</td>\n <td>Manual 5-spd</td>\n <td>4.0</td>\n <td>Regular</td>\n <td>23</td>\n </tr>\n <tr>\n <th>3</th>\n <td>Ford</td>\n <td>Focus Electric</td>\n <td>2016</td>\n <td>Front-Wheel Drive</td>\n <td>Automatic (A1)</td>\n <td>NaN</td>\n <td>Electricity</td>\n <td>110</td>\n </tr>\n <tr>\n <th>4</th>\n <td>Tesla</td>\n <td>Model X AWD - 90D</td>\n <td>2016</td>\n <td>All-Wheel Drive</td>\n <td>Automatic (A1)</td>\n <td>NaN</td>\n <td>Electricity</td>\n <td>90</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>41139</th>\n <td>Subaru</td>\n <td>Legacy</td>\n <td>1993</td>\n <td>Front-Wheel Drive</td>\n <td>Automatic 4-spd</td>\n <td>4.0</td>\n <td>Regular</td>\n <td>19</td>\n </tr>\n <tr>\n <th>41140</th>\n <td>Subaru</td>\n <td>Legacy</td>\n <td>1993</td>\n <td>Front-Wheel Drive</td>\n <td>Manual 5-spd</td>\n <td>4.0</td>\n <td>Regular</td>\n <td>20</td>\n </tr>\n <tr>\n <th>41141</th>\n <td>Subaru</td>\n <td>Legacy AWD</td>\n <td>1993</td>\n <td>4-Wheel or All-Wheel Drive</td>\n <td>Automatic 4-spd</td>\n <td>4.0</td>\n <td>Regular</td>\n <td>18</td>\n </tr>\n <tr>\n <th>41142</th>\n <td>Subaru</td>\n <td>Legacy AWD</td>\n <td>1993</td>\n <td>4-Wheel or All-Wheel Drive</td>\n <td>Manual 5-spd</td>\n <td>4.0</td>\n <td>Regular</td>\n <td>18</td>\n </tr>\n <tr>\n <th>41143</th>\n <td>Subaru</td>\n <td>Legacy AWD Turbo</td>\n <td>1993</td>\n <td>4-Wheel or All-Wheel Drive</td>\n <td>Automatic 4-spd</td>\n <td>4.0</td>\n <td>Premium</td>\n <td>16</td>\n </tr>\n </tbody>\n</table>\n<p>41144 rows × 8 columns</p>\n</div>" | |
}, | |
"execution_count": 27, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"autos" | |
], | |
"metadata": { | |
"collapsed": false | |
} | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"### Step 1. Remove Electric cars, they are irrelevant" | |
], | |
"metadata": { | |
"collapsed": false | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 28, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": " make model year drive \\\n0 Alfa Romeo Spider Veloce 2000 1985 Rear-Wheel Drive \n1 Ferrari Testarossa 1985 Rear-Wheel Drive \n2 Dodge Charger 1985 Front-Wheel Drive \n5 Dodge B150/B250 Wagon 2WD 1985 Rear-Wheel Drive \n6 Subaru Legacy AWD Turbo 1993 4-Wheel or All-Wheel Drive \n... ... ... ... ... \n41139 Subaru Legacy 1993 Front-Wheel Drive \n41140 Subaru Legacy 1993 Front-Wheel Drive \n41141 Subaru Legacy AWD 1993 4-Wheel or All-Wheel Drive \n41142 Subaru Legacy AWD 1993 4-Wheel or All-Wheel Drive \n41143 Subaru Legacy AWD Turbo 1993 4-Wheel or All-Wheel Drive \n\n transmission cylinders fuelType miles_per_gallon \n0 Manual 5-spd 4.0 Regular 19 \n1 Manual 5-spd 12.0 Regular 9 \n2 Manual 5-spd 4.0 Regular 23 \n5 Automatic 3-spd 8.0 Regular 10 \n6 Manual 5-spd 4.0 Premium 17 \n... ... ... ... ... \n41139 Automatic 4-spd 4.0 Regular 19 \n41140 Manual 5-spd 4.0 Regular 20 \n41141 Automatic 4-spd 4.0 Regular 18 \n41142 Manual 5-spd 4.0 Regular 18 \n41143 Automatic 4-spd 4.0 Premium 16 \n\n[40941 rows x 8 columns]", | |
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>make</th>\n <th>model</th>\n <th>year</th>\n <th>drive</th>\n <th>transmission</th>\n <th>cylinders</th>\n <th>fuelType</th>\n <th>miles_per_gallon</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>Alfa Romeo</td>\n <td>Spider Veloce 2000</td>\n <td>1985</td>\n <td>Rear-Wheel Drive</td>\n <td>Manual 5-spd</td>\n <td>4.0</td>\n <td>Regular</td>\n <td>19</td>\n </tr>\n <tr>\n <th>1</th>\n <td>Ferrari</td>\n <td>Testarossa</td>\n <td>1985</td>\n <td>Rear-Wheel Drive</td>\n <td>Manual 5-spd</td>\n <td>12.0</td>\n <td>Regular</td>\n <td>9</td>\n </tr>\n <tr>\n <th>2</th>\n <td>Dodge</td>\n <td>Charger</td>\n <td>1985</td>\n <td>Front-Wheel Drive</td>\n <td>Manual 5-spd</td>\n <td>4.0</td>\n <td>Regular</td>\n <td>23</td>\n </tr>\n <tr>\n <th>5</th>\n <td>Dodge</td>\n <td>B150/B250 Wagon 2WD</td>\n <td>1985</td>\n <td>Rear-Wheel Drive</td>\n <td>Automatic 3-spd</td>\n <td>8.0</td>\n <td>Regular</td>\n <td>10</td>\n </tr>\n <tr>\n <th>6</th>\n <td>Subaru</td>\n <td>Legacy AWD Turbo</td>\n <td>1993</td>\n <td>4-Wheel or All-Wheel Drive</td>\n <td>Manual 5-spd</td>\n <td>4.0</td>\n <td>Premium</td>\n <td>17</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>41139</th>\n <td>Subaru</td>\n <td>Legacy</td>\n <td>1993</td>\n <td>Front-Wheel Drive</td>\n <td>Automatic 4-spd</td>\n <td>4.0</td>\n <td>Regular</td>\n <td>19</td>\n </tr>\n <tr>\n <th>41140</th>\n <td>Subaru</td>\n <td>Legacy</td>\n <td>1993</td>\n <td>Front-Wheel Drive</td>\n <td>Manual 5-spd</td>\n <td>4.0</td>\n <td>Regular</td>\n <td>20</td>\n </tr>\n <tr>\n <th>41141</th>\n <td>Subaru</td>\n <td>Legacy AWD</td>\n <td>1993</td>\n <td>4-Wheel or All-Wheel Drive</td>\n <td>Automatic 4-spd</td>\n <td>4.0</td>\n <td>Regular</td>\n <td>18</td>\n </tr>\n <tr>\n <th>41142</th>\n <td>Subaru</td>\n <td>Legacy AWD</td>\n <td>1993</td>\n <td>4-Wheel or All-Wheel Drive</td>\n <td>Manual 5-spd</td>\n <td>4.0</td>\n <td>Regular</td>\n <td>18</td>\n </tr>\n <tr>\n <th>41143</th>\n <td>Subaru</td>\n <td>Legacy AWD Turbo</td>\n <td>1993</td>\n <td>4-Wheel or All-Wheel Drive</td>\n <td>Automatic 4-spd</td>\n <td>4.0</td>\n <td>Premium</td>\n <td>16</td>\n </tr>\n </tbody>\n</table>\n<p>40941 rows × 8 columns</p>\n</div>" | |
}, | |
"execution_count": 28, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# \"Traditional\" pandas\n", | |
"autos_no_electric = autos[autos['fuelType'] != \"Electricity\"]\n", | |
"autos_no_electric" | |
], | |
"metadata": { | |
"collapsed": false | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 29, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": " make model year drive \\\n0 Alfa Romeo Spider Veloce 2000 1985 Rear-Wheel Drive \n1 Ferrari Testarossa 1985 Rear-Wheel Drive \n2 Dodge Charger 1985 Front-Wheel Drive \n5 Dodge B150/B250 Wagon 2WD 1985 Rear-Wheel Drive \n6 Subaru Legacy AWD Turbo 1993 4-Wheel or All-Wheel Drive \n... ... ... ... ... \n41139 Subaru Legacy 1993 Front-Wheel Drive \n41140 Subaru Legacy 1993 Front-Wheel Drive \n41141 Subaru Legacy AWD 1993 4-Wheel or All-Wheel Drive \n41142 Subaru Legacy AWD 1993 4-Wheel or All-Wheel Drive \n41143 Subaru Legacy AWD Turbo 1993 4-Wheel or All-Wheel Drive \n\n transmission cylinders fuelType miles_per_gallon \n0 Manual 5-spd 4.0 Regular 19 \n1 Manual 5-spd 12.0 Regular 9 \n2 Manual 5-spd 4.0 Regular 23 \n5 Automatic 3-spd 8.0 Regular 10 \n6 Manual 5-spd 4.0 Premium 17 \n... ... ... ... ... \n41139 Automatic 4-spd 4.0 Regular 19 \n41140 Manual 5-spd 4.0 Regular 20 \n41141 Automatic 4-spd 4.0 Regular 18 \n41142 Manual 5-spd 4.0 Regular 18 \n41143 Automatic 4-spd 4.0 Premium 16 \n\n[40941 rows x 8 columns]", | |
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>make</th>\n <th>model</th>\n <th>year</th>\n <th>drive</th>\n <th>transmission</th>\n <th>cylinders</th>\n <th>fuelType</th>\n <th>miles_per_gallon</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>Alfa Romeo</td>\n <td>Spider Veloce 2000</td>\n <td>1985</td>\n <td>Rear-Wheel Drive</td>\n <td>Manual 5-spd</td>\n <td>4.0</td>\n <td>Regular</td>\n <td>19</td>\n </tr>\n <tr>\n <th>1</th>\n <td>Ferrari</td>\n <td>Testarossa</td>\n <td>1985</td>\n <td>Rear-Wheel Drive</td>\n <td>Manual 5-spd</td>\n <td>12.0</td>\n <td>Regular</td>\n <td>9</td>\n </tr>\n <tr>\n <th>2</th>\n <td>Dodge</td>\n <td>Charger</td>\n <td>1985</td>\n <td>Front-Wheel Drive</td>\n <td>Manual 5-spd</td>\n <td>4.0</td>\n <td>Regular</td>\n <td>23</td>\n </tr>\n <tr>\n <th>5</th>\n <td>Dodge</td>\n <td>B150/B250 Wagon 2WD</td>\n <td>1985</td>\n <td>Rear-Wheel Drive</td>\n <td>Automatic 3-spd</td>\n <td>8.0</td>\n <td>Regular</td>\n <td>10</td>\n </tr>\n <tr>\n <th>6</th>\n <td>Subaru</td>\n <td>Legacy AWD Turbo</td>\n <td>1993</td>\n <td>4-Wheel or All-Wheel Drive</td>\n <td>Manual 5-spd</td>\n <td>4.0</td>\n <td>Premium</td>\n <td>17</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>41139</th>\n <td>Subaru</td>\n <td>Legacy</td>\n <td>1993</td>\n <td>Front-Wheel Drive</td>\n <td>Automatic 4-spd</td>\n <td>4.0</td>\n <td>Regular</td>\n <td>19</td>\n </tr>\n <tr>\n <th>41140</th>\n <td>Subaru</td>\n <td>Legacy</td>\n <td>1993</td>\n <td>Front-Wheel Drive</td>\n <td>Manual 5-spd</td>\n <td>4.0</td>\n <td>Regular</td>\n <td>20</td>\n </tr>\n <tr>\n <th>41141</th>\n <td>Subaru</td>\n <td>Legacy AWD</td>\n <td>1993</td>\n <td>4-Wheel or All-Wheel Drive</td>\n <td>Automatic 4-spd</td>\n <td>4.0</td>\n <td>Regular</td>\n <td>18</td>\n </tr>\n <tr>\n <th>41142</th>\n <td>Subaru</td>\n <td>Legacy AWD</td>\n <td>1993</td>\n <td>4-Wheel or All-Wheel Drive</td>\n <td>Manual 5-spd</td>\n <td>4.0</td>\n <td>Regular</td>\n <td>18</td>\n </tr>\n <tr>\n <th>41143</th>\n <td>Subaru</td>\n <td>Legacy AWD Turbo</td>\n <td>1993</td>\n <td>4-Wheel or All-Wheel Drive</td>\n <td>Automatic 4-spd</td>\n <td>4.0</td>\n <td>Premium</td>\n <td>16</td>\n </tr>\n </tbody>\n</table>\n<p>40941 rows × 8 columns</p>\n</div>" | |
}, | |
"execution_count": 29, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Chaining\n", | |
"(autos\n", | |
" .query('fuelType != \"Electricity\"')\n", | |
" )" | |
], | |
"metadata": { | |
"collapsed": false | |
} | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"### Step 2. It seems that \"transmission\" should be split into two features - transmission type, and n_speeds" | |
], | |
"metadata": { | |
"collapsed": false | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 30, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"C:\\Users\\eladrapa\\AppData\\Local\\Temp/ipykernel_9212/2045024650.py:5: SettingWithCopyWarning: \n", | |
"A value is trying to be set on a copy of a slice from a DataFrame.\n", | |
"Try using .loc[row_indexer,col_indexer] = value instead\n", | |
"\n", | |
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", | |
" autos_no_electric['speeds'] = speeds\n", | |
"C:\\Users\\eladrapa\\AppData\\Local\\Temp/ipykernel_9212/2045024650.py:6: SettingWithCopyWarning: \n", | |
"A value is trying to be set on a copy of a slice from a DataFrame.\n", | |
"Try using .loc[row_indexer,col_indexer] = value instead\n", | |
"\n", | |
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", | |
" autos_no_electric['transmission_type'] = transmission_type\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": " make model year drive \\\n0 Alfa Romeo Spider Veloce 2000 1985 Rear-Wheel Drive \n1 Ferrari Testarossa 1985 Rear-Wheel Drive \n2 Dodge Charger 1985 Front-Wheel Drive \n5 Dodge B150/B250 Wagon 2WD 1985 Rear-Wheel Drive \n6 Subaru Legacy AWD Turbo 1993 4-Wheel or All-Wheel Drive \n... ... ... ... ... \n41139 Subaru Legacy 1993 Front-Wheel Drive \n41140 Subaru Legacy 1993 Front-Wheel Drive \n41141 Subaru Legacy AWD 1993 4-Wheel or All-Wheel Drive \n41142 Subaru Legacy AWD 1993 4-Wheel or All-Wheel Drive \n41143 Subaru Legacy AWD Turbo 1993 4-Wheel or All-Wheel Drive \n\n transmission cylinders fuelType miles_per_gallon speeds \\\n0 Manual 5-spd 4.0 Regular 19 5 \n1 Manual 5-spd 12.0 Regular 9 5 \n2 Manual 5-spd 4.0 Regular 23 5 \n5 Automatic 3-spd 8.0 Regular 10 3 \n6 Manual 5-spd 4.0 Premium 17 5 \n... ... ... ... ... ... \n41139 Automatic 4-spd 4.0 Regular 19 4 \n41140 Manual 5-spd 4.0 Regular 20 5 \n41141 Automatic 4-spd 4.0 Regular 18 4 \n41142 Manual 5-spd 4.0 Regular 18 5 \n41143 Automatic 4-spd 4.0 Premium 16 4 \n\n transmission_type \n0 Manual \n1 Manual \n2 Manual \n5 Automatic \n6 Manual \n... ... \n41139 Automatic \n41140 Manual \n41141 Automatic \n41142 Manual \n41143 Automatic \n\n[40941 rows x 10 columns]", | |
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>make</th>\n <th>model</th>\n <th>year</th>\n <th>drive</th>\n <th>transmission</th>\n <th>cylinders</th>\n <th>fuelType</th>\n <th>miles_per_gallon</th>\n <th>speeds</th>\n <th>transmission_type</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>Alfa Romeo</td>\n <td>Spider Veloce 2000</td>\n <td>1985</td>\n <td>Rear-Wheel Drive</td>\n <td>Manual 5-spd</td>\n <td>4.0</td>\n <td>Regular</td>\n <td>19</td>\n <td>5</td>\n <td>Manual</td>\n </tr>\n <tr>\n <th>1</th>\n <td>Ferrari</td>\n <td>Testarossa</td>\n <td>1985</td>\n <td>Rear-Wheel Drive</td>\n <td>Manual 5-spd</td>\n <td>12.0</td>\n <td>Regular</td>\n <td>9</td>\n <td>5</td>\n <td>Manual</td>\n </tr>\n <tr>\n <th>2</th>\n <td>Dodge</td>\n <td>Charger</td>\n <td>1985</td>\n <td>Front-Wheel Drive</td>\n <td>Manual 5-spd</td>\n <td>4.0</td>\n <td>Regular</td>\n <td>23</td>\n <td>5</td>\n <td>Manual</td>\n </tr>\n <tr>\n <th>5</th>\n <td>Dodge</td>\n <td>B150/B250 Wagon 2WD</td>\n <td>1985</td>\n <td>Rear-Wheel Drive</td>\n <td>Automatic 3-spd</td>\n <td>8.0</td>\n <td>Regular</td>\n <td>10</td>\n <td>3</td>\n <td>Automatic</td>\n </tr>\n <tr>\n <th>6</th>\n <td>Subaru</td>\n <td>Legacy AWD Turbo</td>\n <td>1993</td>\n <td>4-Wheel or All-Wheel Drive</td>\n <td>Manual 5-spd</td>\n <td>4.0</td>\n <td>Premium</td>\n <td>17</td>\n <td>5</td>\n <td>Manual</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>41139</th>\n <td>Subaru</td>\n <td>Legacy</td>\n <td>1993</td>\n <td>Front-Wheel Drive</td>\n <td>Automatic 4-spd</td>\n <td>4.0</td>\n <td>Regular</td>\n <td>19</td>\n <td>4</td>\n <td>Automatic</td>\n </tr>\n <tr>\n <th>41140</th>\n <td>Subaru</td>\n <td>Legacy</td>\n <td>1993</td>\n <td>Front-Wheel Drive</td>\n <td>Manual 5-spd</td>\n <td>4.0</td>\n <td>Regular</td>\n <td>20</td>\n <td>5</td>\n <td>Manual</td>\n </tr>\n <tr>\n <th>41141</th>\n <td>Subaru</td>\n <td>Legacy AWD</td>\n <td>1993</td>\n <td>4-Wheel or All-Wheel Drive</td>\n <td>Automatic 4-spd</td>\n <td>4.0</td>\n <td>Regular</td>\n <td>18</td>\n <td>4</td>\n <td>Automatic</td>\n </tr>\n <tr>\n <th>41142</th>\n <td>Subaru</td>\n <td>Legacy AWD</td>\n <td>1993</td>\n <td>4-Wheel or All-Wheel Drive</td>\n <td>Manual 5-spd</td>\n <td>4.0</td>\n <td>Regular</td>\n <td>18</td>\n <td>5</td>\n <td>Manual</td>\n </tr>\n <tr>\n <th>41143</th>\n <td>Subaru</td>\n <td>Legacy AWD Turbo</td>\n <td>1993</td>\n <td>4-Wheel or All-Wheel Drive</td>\n <td>Automatic 4-spd</td>\n <td>4.0</td>\n <td>Premium</td>\n <td>16</td>\n <td>4</td>\n <td>Automatic</td>\n </tr>\n </tbody>\n</table>\n<p>40941 rows × 10 columns</p>\n</div>" | |
}, | |
"execution_count": 30, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# \"Traditional\" pandas\n", | |
"autos_no_electric = autos[autos['fuelType'] != \"Electricity\"]\n", | |
"speeds = autos.transmission.str.extract(r'(\\d)+')\n", | |
"transmission_type = autos.transmission.str.split(' ').str[0]\n", | |
"autos_no_electric['speeds'] = speeds\n", | |
"autos_no_electric['transmission_type'] = transmission_type\n", | |
"autos_no_electric" | |
], | |
"metadata": { | |
"collapsed": false | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 31, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": " make model year drive \\\n0 Alfa Romeo Spider Veloce 2000 1985 Rear-Wheel Drive \n1 Ferrari Testarossa 1985 Rear-Wheel Drive \n2 Dodge Charger 1985 Front-Wheel Drive \n5 Dodge B150/B250 Wagon 2WD 1985 Rear-Wheel Drive \n6 Subaru Legacy AWD Turbo 1993 4-Wheel or All-Wheel Drive \n... ... ... ... ... \n41139 Subaru Legacy 1993 Front-Wheel Drive \n41140 Subaru Legacy 1993 Front-Wheel Drive \n41141 Subaru Legacy AWD 1993 4-Wheel or All-Wheel Drive \n41142 Subaru Legacy AWD 1993 4-Wheel or All-Wheel Drive \n41143 Subaru Legacy AWD Turbo 1993 4-Wheel or All-Wheel Drive \n\n transmission cylinders fuelType miles_per_gallon speeds \\\n0 Manual 5-spd 4.0 Regular 19 5 \n1 Manual 5-spd 12.0 Regular 9 5 \n2 Manual 5-spd 4.0 Regular 23 5 \n5 Automatic 3-spd 8.0 Regular 10 3 \n6 Manual 5-spd 4.0 Premium 17 5 \n... ... ... ... ... ... \n41139 Automatic 4-spd 4.0 Regular 19 4 \n41140 Manual 5-spd 4.0 Regular 20 5 \n41141 Automatic 4-spd 4.0 Regular 18 4 \n41142 Manual 5-spd 4.0 Regular 18 5 \n41143 Automatic 4-spd 4.0 Premium 16 4 \n\n transmission_type \n0 Manual \n1 Manual \n2 Manual \n5 Automatic \n6 Manual \n... ... \n41139 Automatic \n41140 Manual \n41141 Automatic \n41142 Manual \n41143 Automatic \n\n[40941 rows x 10 columns]", | |
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>make</th>\n <th>model</th>\n <th>year</th>\n <th>drive</th>\n <th>transmission</th>\n <th>cylinders</th>\n <th>fuelType</th>\n <th>miles_per_gallon</th>\n <th>speeds</th>\n <th>transmission_type</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>Alfa Romeo</td>\n <td>Spider Veloce 2000</td>\n <td>1985</td>\n <td>Rear-Wheel Drive</td>\n <td>Manual 5-spd</td>\n <td>4.0</td>\n <td>Regular</td>\n <td>19</td>\n <td>5</td>\n <td>Manual</td>\n </tr>\n <tr>\n <th>1</th>\n <td>Ferrari</td>\n <td>Testarossa</td>\n <td>1985</td>\n <td>Rear-Wheel Drive</td>\n <td>Manual 5-spd</td>\n <td>12.0</td>\n <td>Regular</td>\n <td>9</td>\n <td>5</td>\n <td>Manual</td>\n </tr>\n <tr>\n <th>2</th>\n <td>Dodge</td>\n <td>Charger</td>\n <td>1985</td>\n <td>Front-Wheel Drive</td>\n <td>Manual 5-spd</td>\n <td>4.0</td>\n <td>Regular</td>\n <td>23</td>\n <td>5</td>\n <td>Manual</td>\n </tr>\n <tr>\n <th>5</th>\n <td>Dodge</td>\n <td>B150/B250 Wagon 2WD</td>\n <td>1985</td>\n <td>Rear-Wheel Drive</td>\n <td>Automatic 3-spd</td>\n <td>8.0</td>\n <td>Regular</td>\n <td>10</td>\n <td>3</td>\n <td>Automatic</td>\n </tr>\n <tr>\n <th>6</th>\n <td>Subaru</td>\n <td>Legacy AWD Turbo</td>\n <td>1993</td>\n <td>4-Wheel or All-Wheel Drive</td>\n <td>Manual 5-spd</td>\n <td>4.0</td>\n <td>Premium</td>\n <td>17</td>\n <td>5</td>\n <td>Manual</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>41139</th>\n <td>Subaru</td>\n <td>Legacy</td>\n <td>1993</td>\n <td>Front-Wheel Drive</td>\n <td>Automatic 4-spd</td>\n <td>4.0</td>\n <td>Regular</td>\n <td>19</td>\n <td>4</td>\n <td>Automatic</td>\n </tr>\n <tr>\n <th>41140</th>\n <td>Subaru</td>\n <td>Legacy</td>\n <td>1993</td>\n <td>Front-Wheel Drive</td>\n <td>Manual 5-spd</td>\n <td>4.0</td>\n <td>Regular</td>\n <td>20</td>\n <td>5</td>\n <td>Manual</td>\n </tr>\n <tr>\n <th>41141</th>\n <td>Subaru</td>\n <td>Legacy AWD</td>\n <td>1993</td>\n <td>4-Wheel or All-Wheel Drive</td>\n <td>Automatic 4-spd</td>\n <td>4.0</td>\n <td>Regular</td>\n <td>18</td>\n <td>4</td>\n <td>Automatic</td>\n </tr>\n <tr>\n <th>41142</th>\n <td>Subaru</td>\n <td>Legacy AWD</td>\n <td>1993</td>\n <td>4-Wheel or All-Wheel Drive</td>\n <td>Manual 5-spd</td>\n <td>4.0</td>\n <td>Regular</td>\n <td>18</td>\n <td>5</td>\n <td>Manual</td>\n </tr>\n <tr>\n <th>41143</th>\n <td>Subaru</td>\n <td>Legacy AWD Turbo</td>\n <td>1993</td>\n <td>4-Wheel or All-Wheel Drive</td>\n <td>Automatic 4-spd</td>\n <td>4.0</td>\n <td>Premium</td>\n <td>16</td>\n <td>4</td>\n <td>Automatic</td>\n </tr>\n </tbody>\n</table>\n<p>40941 rows × 10 columns</p>\n</div>" | |
}, | |
"execution_count": 31, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Chaining\n", | |
"(autos\n", | |
" .query('fuelType != \"Electricity\"')\n", | |
" .assign(speeds=autos.transmission.str.extract(r'(\\d)+'),\n", | |
" transmission_type=autos.transmission.str.split(' ').str[0])\n", | |
" )" | |
], | |
"metadata": { | |
"collapsed": false | |
} | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"### Step 3. Our numeric columns have different scales, let's normalize them!" | |
], | |
"metadata": { | |
"collapsed": false | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 32, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"C:\\Users\\eladrapa\\AppData\\Local\\Temp/ipykernel_9212/2453856917.py:5: SettingWithCopyWarning: \n", | |
"A value is trying to be set on a copy of a slice from a DataFrame.\n", | |
"Try using .loc[row_indexer,col_indexer] = value instead\n", | |
"\n", | |
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", | |
" autos_no_electric['speeds'] = speeds\n", | |
"C:\\Users\\eladrapa\\AppData\\Local\\Temp/ipykernel_9212/2453856917.py:6: SettingWithCopyWarning: \n", | |
"A value is trying to be set on a copy of a slice from a DataFrame.\n", | |
"Try using .loc[row_indexer,col_indexer] = value instead\n", | |
"\n", | |
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", | |
" autos_no_electric['transmission_type'] = transmission_type\n", | |
"C:\\Users\\eladrapa\\AppData\\Local\\Temp/ipykernel_9212/2453856917.py:10: SettingWithCopyWarning: \n", | |
"A value is trying to be set on a copy of a slice from a DataFrame.\n", | |
"Try using .loc[row_indexer,col_indexer] = value instead\n", | |
"\n", | |
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", | |
" autos_no_electric[col_name] = scaled_numeric_columns[col_name]\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": " make model year drive \\\n0 Alfa Romeo Spider Veloce 2000 0.982673 Rear-Wheel Drive \n1 Ferrari Testarossa 0.982673 Rear-Wheel Drive \n2 Dodge Charger 0.982673 Front-Wheel Drive \n5 Dodge B150/B250 Wagon 2WD 0.982673 Rear-Wheel Drive \n6 Subaru Legacy AWD Turbo 0.986634 4-Wheel or All-Wheel Drive \n... ... ... ... ... \n41139 Subaru Legacy 0.986634 Front-Wheel Drive \n41140 Subaru Legacy 0.986634 Front-Wheel Drive \n41141 Subaru Legacy AWD 0.986634 4-Wheel or All-Wheel Drive \n41142 Subaru Legacy AWD 0.986634 4-Wheel or All-Wheel Drive \n41143 Subaru Legacy AWD Turbo 0.986634 4-Wheel or All-Wheel Drive \n\n transmission cylinders fuelType miles_per_gallon speeds \\\n0 Manual 5-spd 0.25 Regular 0.327586 5 \n1 Manual 5-spd 0.75 Regular 0.155172 5 \n2 Manual 5-spd 0.25 Regular 0.396552 5 \n5 Automatic 3-spd 0.50 Regular 0.172414 3 \n6 Manual 5-spd 0.25 Premium 0.293103 5 \n... ... ... ... ... ... \n41139 Automatic 4-spd 0.25 Regular 0.327586 4 \n41140 Manual 5-spd 0.25 Regular 0.344828 5 \n41141 Automatic 4-spd 0.25 Regular 0.310345 4 \n41142 Manual 5-spd 0.25 Regular 0.310345 5 \n41143 Automatic 4-spd 0.25 Premium 0.275862 4 \n\n transmission_type \n0 Manual \n1 Manual \n2 Manual \n5 Automatic \n6 Manual \n... ... \n41139 Automatic \n41140 Manual \n41141 Automatic \n41142 Manual \n41143 Automatic \n\n[40941 rows x 10 columns]", | |
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>make</th>\n <th>model</th>\n <th>year</th>\n <th>drive</th>\n <th>transmission</th>\n <th>cylinders</th>\n <th>fuelType</th>\n <th>miles_per_gallon</th>\n <th>speeds</th>\n <th>transmission_type</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>Alfa Romeo</td>\n <td>Spider Veloce 2000</td>\n <td>0.982673</td>\n <td>Rear-Wheel Drive</td>\n <td>Manual 5-spd</td>\n <td>0.25</td>\n <td>Regular</td>\n <td>0.327586</td>\n <td>5</td>\n <td>Manual</td>\n </tr>\n <tr>\n <th>1</th>\n <td>Ferrari</td>\n <td>Testarossa</td>\n <td>0.982673</td>\n <td>Rear-Wheel Drive</td>\n <td>Manual 5-spd</td>\n <td>0.75</td>\n <td>Regular</td>\n <td>0.155172</td>\n <td>5</td>\n <td>Manual</td>\n </tr>\n <tr>\n <th>2</th>\n <td>Dodge</td>\n <td>Charger</td>\n <td>0.982673</td>\n <td>Front-Wheel Drive</td>\n <td>Manual 5-spd</td>\n <td>0.25</td>\n <td>Regular</td>\n <td>0.396552</td>\n <td>5</td>\n <td>Manual</td>\n </tr>\n <tr>\n <th>5</th>\n <td>Dodge</td>\n <td>B150/B250 Wagon 2WD</td>\n <td>0.982673</td>\n <td>Rear-Wheel Drive</td>\n <td>Automatic 3-spd</td>\n <td>0.50</td>\n <td>Regular</td>\n <td>0.172414</td>\n <td>3</td>\n <td>Automatic</td>\n </tr>\n <tr>\n <th>6</th>\n <td>Subaru</td>\n <td>Legacy AWD Turbo</td>\n <td>0.986634</td>\n <td>4-Wheel or All-Wheel Drive</td>\n <td>Manual 5-spd</td>\n <td>0.25</td>\n <td>Premium</td>\n <td>0.293103</td>\n <td>5</td>\n <td>Manual</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>41139</th>\n <td>Subaru</td>\n <td>Legacy</td>\n <td>0.986634</td>\n <td>Front-Wheel Drive</td>\n <td>Automatic 4-spd</td>\n <td>0.25</td>\n <td>Regular</td>\n <td>0.327586</td>\n <td>4</td>\n <td>Automatic</td>\n </tr>\n <tr>\n <th>41140</th>\n <td>Subaru</td>\n <td>Legacy</td>\n <td>0.986634</td>\n <td>Front-Wheel Drive</td>\n <td>Manual 5-spd</td>\n <td>0.25</td>\n <td>Regular</td>\n <td>0.344828</td>\n <td>5</td>\n <td>Manual</td>\n </tr>\n <tr>\n <th>41141</th>\n <td>Subaru</td>\n <td>Legacy AWD</td>\n <td>0.986634</td>\n <td>4-Wheel or All-Wheel Drive</td>\n <td>Automatic 4-spd</td>\n <td>0.25</td>\n <td>Regular</td>\n <td>0.310345</td>\n <td>4</td>\n <td>Automatic</td>\n </tr>\n <tr>\n <th>41142</th>\n <td>Subaru</td>\n <td>Legacy AWD</td>\n <td>0.986634</td>\n <td>4-Wheel or All-Wheel Drive</td>\n <td>Manual 5-spd</td>\n <td>0.25</td>\n <td>Regular</td>\n <td>0.310345</td>\n <td>5</td>\n <td>Manual</td>\n </tr>\n <tr>\n <th>41143</th>\n <td>Subaru</td>\n <td>Legacy AWD Turbo</td>\n <td>0.986634</td>\n <td>4-Wheel or All-Wheel Drive</td>\n <td>Automatic 4-spd</td>\n <td>0.25</td>\n <td>Premium</td>\n <td>0.275862</td>\n <td>4</td>\n <td>Automatic</td>\n </tr>\n </tbody>\n</table>\n<p>40941 rows × 10 columns</p>\n</div>" | |
}, | |
"execution_count": 32, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# \"Traditional\" pandas\n", | |
"autos_no_electric = autos[autos['fuelType'] != \"Electricity\"]\n", | |
"speeds = autos.transmission.str.extract(r'(\\d)+')\n", | |
"transmission_type = autos.transmission.str.split(' ').str[0]\n", | |
"autos_no_electric['speeds'] = speeds\n", | |
"autos_no_electric['transmission_type'] = transmission_type\n", | |
"numeric_columns = autos_no_electric.select_dtypes(include=np.number)\n", | |
"scaled_numeric_columns = numeric_columns / numeric_columns.max()\n", | |
"for col_name in scaled_numeric_columns.columns:\n", | |
" autos_no_electric[col_name] = scaled_numeric_columns[col_name]\n", | |
"autos_no_electric" | |
], | |
"metadata": { | |
"collapsed": false | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 33, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": " make model year drive \\\n0 Alfa Romeo Spider Veloce 2000 0.982673 Rear-Wheel Drive \n1 Ferrari Testarossa 0.982673 Rear-Wheel Drive \n2 Dodge Charger 0.982673 Front-Wheel Drive \n5 Dodge B150/B250 Wagon 2WD 0.982673 Rear-Wheel Drive \n6 Subaru Legacy AWD Turbo 0.986634 4-Wheel or All-Wheel Drive \n... ... ... ... ... \n41139 Subaru Legacy 0.986634 Front-Wheel Drive \n41140 Subaru Legacy 0.986634 Front-Wheel Drive \n41141 Subaru Legacy AWD 0.986634 4-Wheel or All-Wheel Drive \n41142 Subaru Legacy AWD 0.986634 4-Wheel or All-Wheel Drive \n41143 Subaru Legacy AWD Turbo 0.986634 4-Wheel or All-Wheel Drive \n\n transmission cylinders fuelType miles_per_gallon speeds \\\n0 Manual 5-spd 0.25 Regular 0.327586 0.555556 \n1 Manual 5-spd 0.75 Regular 0.155172 0.555556 \n2 Manual 5-spd 0.25 Regular 0.396552 0.555556 \n5 Automatic 3-spd 0.50 Regular 0.172414 0.333333 \n6 Manual 5-spd 0.25 Premium 0.293103 0.555556 \n... ... ... ... ... ... \n41139 Automatic 4-spd 0.25 Regular 0.327586 0.444444 \n41140 Manual 5-spd 0.25 Regular 0.344828 0.555556 \n41141 Automatic 4-spd 0.25 Regular 0.310345 0.444444 \n41142 Manual 5-spd 0.25 Regular 0.310345 0.555556 \n41143 Automatic 4-spd 0.25 Premium 0.275862 0.444444 \n\n transmission_type \n0 Manual \n1 Manual \n2 Manual \n5 Automatic \n6 Manual \n... ... \n41139 Automatic \n41140 Manual \n41141 Automatic \n41142 Manual \n41143 Automatic \n\n[40941 rows x 10 columns]", | |
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>make</th>\n <th>model</th>\n <th>year</th>\n <th>drive</th>\n <th>transmission</th>\n <th>cylinders</th>\n <th>fuelType</th>\n <th>miles_per_gallon</th>\n <th>speeds</th>\n <th>transmission_type</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>Alfa Romeo</td>\n <td>Spider Veloce 2000</td>\n <td>0.982673</td>\n <td>Rear-Wheel Drive</td>\n <td>Manual 5-spd</td>\n <td>0.25</td>\n <td>Regular</td>\n <td>0.327586</td>\n <td>0.555556</td>\n <td>Manual</td>\n </tr>\n <tr>\n <th>1</th>\n <td>Ferrari</td>\n <td>Testarossa</td>\n <td>0.982673</td>\n <td>Rear-Wheel Drive</td>\n <td>Manual 5-spd</td>\n <td>0.75</td>\n <td>Regular</td>\n <td>0.155172</td>\n <td>0.555556</td>\n <td>Manual</td>\n </tr>\n <tr>\n <th>2</th>\n <td>Dodge</td>\n <td>Charger</td>\n <td>0.982673</td>\n <td>Front-Wheel Drive</td>\n <td>Manual 5-spd</td>\n <td>0.25</td>\n <td>Regular</td>\n <td>0.396552</td>\n <td>0.555556</td>\n <td>Manual</td>\n </tr>\n <tr>\n <th>5</th>\n <td>Dodge</td>\n <td>B150/B250 Wagon 2WD</td>\n <td>0.982673</td>\n <td>Rear-Wheel Drive</td>\n <td>Automatic 3-spd</td>\n <td>0.50</td>\n <td>Regular</td>\n <td>0.172414</td>\n <td>0.333333</td>\n <td>Automatic</td>\n </tr>\n <tr>\n <th>6</th>\n <td>Subaru</td>\n <td>Legacy AWD Turbo</td>\n <td>0.986634</td>\n <td>4-Wheel or All-Wheel Drive</td>\n <td>Manual 5-spd</td>\n <td>0.25</td>\n <td>Premium</td>\n <td>0.293103</td>\n <td>0.555556</td>\n <td>Manual</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>41139</th>\n <td>Subaru</td>\n <td>Legacy</td>\n <td>0.986634</td>\n <td>Front-Wheel Drive</td>\n <td>Automatic 4-spd</td>\n <td>0.25</td>\n <td>Regular</td>\n <td>0.327586</td>\n <td>0.444444</td>\n <td>Automatic</td>\n </tr>\n <tr>\n <th>41140</th>\n <td>Subaru</td>\n <td>Legacy</td>\n <td>0.986634</td>\n <td>Front-Wheel Drive</td>\n <td>Manual 5-spd</td>\n <td>0.25</td>\n <td>Regular</td>\n <td>0.344828</td>\n <td>0.555556</td>\n <td>Manual</td>\n </tr>\n <tr>\n <th>41141</th>\n <td>Subaru</td>\n <td>Legacy AWD</td>\n <td>0.986634</td>\n <td>4-Wheel or All-Wheel Drive</td>\n <td>Automatic 4-spd</td>\n <td>0.25</td>\n <td>Regular</td>\n <td>0.310345</td>\n <td>0.444444</td>\n <td>Automatic</td>\n </tr>\n <tr>\n <th>41142</th>\n <td>Subaru</td>\n <td>Legacy AWD</td>\n <td>0.986634</td>\n <td>4-Wheel or All-Wheel Drive</td>\n <td>Manual 5-spd</td>\n <td>0.25</td>\n <td>Regular</td>\n <td>0.310345</td>\n <td>0.555556</td>\n <td>Manual</td>\n </tr>\n <tr>\n <th>41143</th>\n <td>Subaru</td>\n <td>Legacy AWD Turbo</td>\n <td>0.986634</td>\n <td>4-Wheel or All-Wheel Drive</td>\n <td>Automatic 4-spd</td>\n <td>0.25</td>\n <td>Premium</td>\n <td>0.275862</td>\n <td>0.444444</td>\n <td>Automatic</td>\n </tr>\n </tbody>\n</table>\n<p>40941 rows × 10 columns</p>\n</div>" | |
}, | |
"execution_count": 33, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Chaining\n", | |
"def scale_numeric_columns(df):\n", | |
" numeric_columns = df.select_dtypes(include=np.number)\n", | |
" scaled_numeric_columns = numeric_columns / numeric_columns.max()\n", | |
" return df.assign(**{col: scaled_numeric_columns[col] for col in scaled_numeric_columns})\n", | |
"\n", | |
"(autos\n", | |
" .query('fuelType != \"Electricity\"')\n", | |
" .assign(speeds=autos.transmission.str.extract(r'(\\d)+').fillna(3).astype(int),\n", | |
" transmission_type=autos.transmission.str.split(' ').str[0])\n", | |
" .pipe(scale_numeric_columns)\n", | |
")\n" | |
], | |
"metadata": { | |
"collapsed": false | |
} | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"### Step 4. Let's drop some columns we won't be needing" | |
], | |
"metadata": { | |
"collapsed": false | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 34, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"C:\\Users\\eladrapa\\AppData\\Local\\Temp/ipykernel_9212/2275487360.py:5: SettingWithCopyWarning: \n", | |
"A value is trying to be set on a copy of a slice from a DataFrame.\n", | |
"Try using .loc[row_indexer,col_indexer] = value instead\n", | |
"\n", | |
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", | |
" autos_no_electric['speeds'] = speeds\n", | |
"C:\\Users\\eladrapa\\AppData\\Local\\Temp/ipykernel_9212/2275487360.py:6: SettingWithCopyWarning: \n", | |
"A value is trying to be set on a copy of a slice from a DataFrame.\n", | |
"Try using .loc[row_indexer,col_indexer] = value instead\n", | |
"\n", | |
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", | |
" autos_no_electric['transmission_type'] = transmission_type\n", | |
"C:\\Users\\eladrapa\\AppData\\Local\\Temp/ipykernel_9212/2275487360.py:10: SettingWithCopyWarning: \n", | |
"A value is trying to be set on a copy of a slice from a DataFrame.\n", | |
"Try using .loc[row_indexer,col_indexer] = value instead\n", | |
"\n", | |
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", | |
" autos_no_electric[col_name] = scaled_numeric_columns[col_name]\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": " make model year drive \\\n0 Alfa Romeo Spider Veloce 2000 0.982673 Rear-Wheel Drive \n1 Ferrari Testarossa 0.982673 Rear-Wheel Drive \n2 Dodge Charger 0.982673 Front-Wheel Drive \n5 Dodge B150/B250 Wagon 2WD 0.982673 Rear-Wheel Drive \n6 Subaru Legacy AWD Turbo 0.986634 4-Wheel or All-Wheel Drive \n... ... ... ... ... \n41139 Subaru Legacy 0.986634 Front-Wheel Drive \n41140 Subaru Legacy 0.986634 Front-Wheel Drive \n41141 Subaru Legacy AWD 0.986634 4-Wheel or All-Wheel Drive \n41142 Subaru Legacy AWD 0.986634 4-Wheel or All-Wheel Drive \n41143 Subaru Legacy AWD Turbo 0.986634 4-Wheel or All-Wheel Drive \n\n cylinders fuelType speeds transmission_type \n0 0.25 Regular 5 Manual \n1 0.75 Regular 5 Manual \n2 0.25 Regular 5 Manual \n5 0.50 Regular 3 Automatic \n6 0.25 Premium 5 Manual \n... ... ... ... ... \n41139 0.25 Regular 4 Automatic \n41140 0.25 Regular 5 Manual \n41141 0.25 Regular 4 Automatic \n41142 0.25 Regular 5 Manual \n41143 0.25 Premium 4 Automatic \n\n[40941 rows x 8 columns]", | |
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>make</th>\n <th>model</th>\n <th>year</th>\n <th>drive</th>\n <th>cylinders</th>\n <th>fuelType</th>\n <th>speeds</th>\n <th>transmission_type</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>Alfa Romeo</td>\n <td>Spider Veloce 2000</td>\n <td>0.982673</td>\n <td>Rear-Wheel Drive</td>\n <td>0.25</td>\n <td>Regular</td>\n <td>5</td>\n <td>Manual</td>\n </tr>\n <tr>\n <th>1</th>\n <td>Ferrari</td>\n <td>Testarossa</td>\n <td>0.982673</td>\n <td>Rear-Wheel Drive</td>\n <td>0.75</td>\n <td>Regular</td>\n <td>5</td>\n <td>Manual</td>\n </tr>\n <tr>\n <th>2</th>\n <td>Dodge</td>\n <td>Charger</td>\n <td>0.982673</td>\n <td>Front-Wheel Drive</td>\n <td>0.25</td>\n <td>Regular</td>\n <td>5</td>\n <td>Manual</td>\n </tr>\n <tr>\n <th>5</th>\n <td>Dodge</td>\n <td>B150/B250 Wagon 2WD</td>\n <td>0.982673</td>\n <td>Rear-Wheel Drive</td>\n <td>0.50</td>\n <td>Regular</td>\n <td>3</td>\n <td>Automatic</td>\n </tr>\n <tr>\n <th>6</th>\n <td>Subaru</td>\n <td>Legacy AWD Turbo</td>\n <td>0.986634</td>\n <td>4-Wheel or All-Wheel Drive</td>\n <td>0.25</td>\n <td>Premium</td>\n <td>5</td>\n <td>Manual</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>41139</th>\n <td>Subaru</td>\n <td>Legacy</td>\n <td>0.986634</td>\n <td>Front-Wheel Drive</td>\n <td>0.25</td>\n <td>Regular</td>\n <td>4</td>\n <td>Automatic</td>\n </tr>\n <tr>\n <th>41140</th>\n <td>Subaru</td>\n <td>Legacy</td>\n <td>0.986634</td>\n <td>Front-Wheel Drive</td>\n <td>0.25</td>\n <td>Regular</td>\n <td>5</td>\n <td>Manual</td>\n </tr>\n <tr>\n <th>41141</th>\n <td>Subaru</td>\n <td>Legacy AWD</td>\n <td>0.986634</td>\n <td>4-Wheel or All-Wheel Drive</td>\n <td>0.25</td>\n <td>Regular</td>\n <td>4</td>\n <td>Automatic</td>\n </tr>\n <tr>\n <th>41142</th>\n <td>Subaru</td>\n <td>Legacy AWD</td>\n <td>0.986634</td>\n <td>4-Wheel or All-Wheel Drive</td>\n <td>0.25</td>\n <td>Regular</td>\n <td>5</td>\n <td>Manual</td>\n </tr>\n <tr>\n <th>41143</th>\n <td>Subaru</td>\n <td>Legacy AWD Turbo</td>\n <td>0.986634</td>\n <td>4-Wheel or All-Wheel Drive</td>\n <td>0.25</td>\n <td>Premium</td>\n <td>4</td>\n <td>Automatic</td>\n </tr>\n </tbody>\n</table>\n<p>40941 rows × 8 columns</p>\n</div>" | |
}, | |
"execution_count": 34, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# \"Traditional\" pandas\n", | |
"autos_no_electric = autos[autos['fuelType'] != \"Electricity\"]\n", | |
"speeds = autos.transmission.str.extract(r'(\\d)+')\n", | |
"transmission_type = autos.transmission.str.split(' ').str[0]\n", | |
"autos_no_electric['speeds'] = speeds\n", | |
"autos_no_electric['transmission_type'] = transmission_type\n", | |
"numeric_columns = autos_no_electric.select_dtypes(include=np.number)\n", | |
"scaled_numeric_columns = numeric_columns / numeric_columns.max()\n", | |
"for col_name in scaled_numeric_columns.columns:\n", | |
" autos_no_electric[col_name] = scaled_numeric_columns[col_name]\n", | |
"autos_no_electric_dropped = autos_no_electric.drop(\n", | |
" columns=['miles_per_gallon', 'transmission'])\n", | |
"autos_no_electric_dropped" | |
], | |
"metadata": { | |
"collapsed": false | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 35, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": " make model year drive \\\n0 Alfa Romeo Spider Veloce 2000 0.982673 Rear-Wheel Drive \n1 Ferrari Testarossa 0.982673 Rear-Wheel Drive \n2 Dodge Charger 0.982673 Front-Wheel Drive \n5 Dodge B150/B250 Wagon 2WD 0.982673 Rear-Wheel Drive \n6 Subaru Legacy AWD Turbo 0.986634 4-Wheel or All-Wheel Drive \n... ... ... ... ... \n41139 Subaru Legacy 0.986634 Front-Wheel Drive \n41140 Subaru Legacy 0.986634 Front-Wheel Drive \n41141 Subaru Legacy AWD 0.986634 4-Wheel or All-Wheel Drive \n41142 Subaru Legacy AWD 0.986634 4-Wheel or All-Wheel Drive \n41143 Subaru Legacy AWD Turbo 0.986634 4-Wheel or All-Wheel Drive \n\n cylinders fuelType speeds transmission_type \n0 0.25 Regular 0.555556 Manual \n1 0.75 Regular 0.555556 Manual \n2 0.25 Regular 0.555556 Manual \n5 0.50 Regular 0.333333 Automatic \n6 0.25 Premium 0.555556 Manual \n... ... ... ... ... \n41139 0.25 Regular 0.444444 Automatic \n41140 0.25 Regular 0.555556 Manual \n41141 0.25 Regular 0.444444 Automatic \n41142 0.25 Regular 0.555556 Manual \n41143 0.25 Premium 0.444444 Automatic \n\n[40941 rows x 8 columns]", | |
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>make</th>\n <th>model</th>\n <th>year</th>\n <th>drive</th>\n <th>cylinders</th>\n <th>fuelType</th>\n <th>speeds</th>\n <th>transmission_type</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>Alfa Romeo</td>\n <td>Spider Veloce 2000</td>\n <td>0.982673</td>\n <td>Rear-Wheel Drive</td>\n <td>0.25</td>\n <td>Regular</td>\n <td>0.555556</td>\n <td>Manual</td>\n </tr>\n <tr>\n <th>1</th>\n <td>Ferrari</td>\n <td>Testarossa</td>\n <td>0.982673</td>\n <td>Rear-Wheel Drive</td>\n <td>0.75</td>\n <td>Regular</td>\n <td>0.555556</td>\n <td>Manual</td>\n </tr>\n <tr>\n <th>2</th>\n <td>Dodge</td>\n <td>Charger</td>\n <td>0.982673</td>\n <td>Front-Wheel Drive</td>\n <td>0.25</td>\n <td>Regular</td>\n <td>0.555556</td>\n <td>Manual</td>\n </tr>\n <tr>\n <th>5</th>\n <td>Dodge</td>\n <td>B150/B250 Wagon 2WD</td>\n <td>0.982673</td>\n <td>Rear-Wheel Drive</td>\n <td>0.50</td>\n <td>Regular</td>\n <td>0.333333</td>\n <td>Automatic</td>\n </tr>\n <tr>\n <th>6</th>\n <td>Subaru</td>\n <td>Legacy AWD Turbo</td>\n <td>0.986634</td>\n <td>4-Wheel or All-Wheel Drive</td>\n <td>0.25</td>\n <td>Premium</td>\n <td>0.555556</td>\n <td>Manual</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>41139</th>\n <td>Subaru</td>\n <td>Legacy</td>\n <td>0.986634</td>\n <td>Front-Wheel Drive</td>\n <td>0.25</td>\n <td>Regular</td>\n <td>0.444444</td>\n <td>Automatic</td>\n </tr>\n <tr>\n <th>41140</th>\n <td>Subaru</td>\n <td>Legacy</td>\n <td>0.986634</td>\n <td>Front-Wheel Drive</td>\n <td>0.25</td>\n <td>Regular</td>\n <td>0.555556</td>\n <td>Manual</td>\n </tr>\n <tr>\n <th>41141</th>\n <td>Subaru</td>\n <td>Legacy AWD</td>\n <td>0.986634</td>\n <td>4-Wheel or All-Wheel Drive</td>\n <td>0.25</td>\n <td>Regular</td>\n <td>0.444444</td>\n <td>Automatic</td>\n </tr>\n <tr>\n <th>41142</th>\n <td>Subaru</td>\n <td>Legacy AWD</td>\n <td>0.986634</td>\n <td>4-Wheel or All-Wheel Drive</td>\n <td>0.25</td>\n <td>Regular</td>\n <td>0.555556</td>\n <td>Manual</td>\n </tr>\n <tr>\n <th>41143</th>\n <td>Subaru</td>\n <td>Legacy AWD Turbo</td>\n <td>0.986634</td>\n <td>4-Wheel or All-Wheel Drive</td>\n <td>0.25</td>\n <td>Premium</td>\n <td>0.444444</td>\n <td>Automatic</td>\n </tr>\n </tbody>\n</table>\n<p>40941 rows × 8 columns</p>\n</div>" | |
}, | |
"execution_count": 35, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Chaining\n", | |
"def scale_numeric_columns(df):\n", | |
" numeric_columns = df.select_dtypes(include=np.number)\n", | |
" scaled_numeric_columns = numeric_columns / numeric_columns.max()\n", | |
" return df.assign(**{col: scaled_numeric_columns[col] for col in scaled_numeric_columns})\n", | |
"\n", | |
"(autos\n", | |
" .query('fuelType != \"Electricity\"')\n", | |
" .assign(speeds=autos.transmission.str.extract(r'(\\d)+').fillna(3).astype(int),\n", | |
" transmission_type=autos.transmission.str.split(' ').str[0])\n", | |
" .pipe(scale_numeric_columns)\n", | |
" .drop(columns=['miles_per_gallon', 'transmission'])\n", | |
")" | |
], | |
"metadata": { | |
"collapsed": false | |
} | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"### Step 5. Finally, let's apply one-hot encoding to the categorical variables" | |
], | |
"metadata": { | |
"collapsed": false | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 36, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"C:\\Users\\eladrapa\\AppData\\Local\\Temp/ipykernel_9212/1577708602.py:5: SettingWithCopyWarning: \n", | |
"A value is trying to be set on a copy of a slice from a DataFrame.\n", | |
"Try using .loc[row_indexer,col_indexer] = value instead\n", | |
"\n", | |
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", | |
" autos_no_electric['speeds'] = speeds\n", | |
"C:\\Users\\eladrapa\\AppData\\Local\\Temp/ipykernel_9212/1577708602.py:6: SettingWithCopyWarning: \n", | |
"A value is trying to be set on a copy of a slice from a DataFrame.\n", | |
"Try using .loc[row_indexer,col_indexer] = value instead\n", | |
"\n", | |
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", | |
" autos_no_electric['transmission_type'] = transmission_type\n", | |
"C:\\Users\\eladrapa\\AppData\\Local\\Temp/ipykernel_9212/1577708602.py:10: SettingWithCopyWarning: \n", | |
"A value is trying to be set on a copy of a slice from a DataFrame.\n", | |
"Try using .loc[row_indexer,col_indexer] = value instead\n", | |
"\n", | |
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", | |
" autos_no_electric[col_name] = scaled_numeric_columns[col_name]\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": " year cylinders speeds make_AM General make_ASC Incorporated \\\n0 0.982673 0.25 5 0 0 \n1 0.982673 0.75 5 0 0 \n2 0.982673 0.25 5 0 0 \n5 0.982673 0.50 3 0 0 \n6 0.986634 0.25 5 0 0 \n... ... ... ... ... ... \n41139 0.986634 0.25 4 0 0 \n41140 0.986634 0.25 5 0 0 \n41141 0.986634 0.25 4 0 0 \n41142 0.986634 0.25 5 0 0 \n41143 0.986634 0.25 4 0 0 \n\n make_Acura make_Alfa Romeo make_American Motors Corporation \\\n0 0 1 0 \n1 0 0 0 \n2 0 0 0 \n5 0 0 0 \n6 0 0 0 \n... ... ... ... \n41139 0 0 0 \n41140 0 0 0 \n41141 0 0 0 \n41142 0 0 0 \n41143 0 0 0 \n\n make_Aston Martin make_Audi ... fuelType_Midgrade fuelType_Premium \\\n0 0 0 ... 0 0 \n1 0 0 ... 0 0 \n2 0 0 ... 0 0 \n5 0 0 ... 0 0 \n6 0 0 ... 0 1 \n... ... ... ... ... ... \n41139 0 0 ... 0 0 \n41140 0 0 ... 0 0 \n41141 0 0 ... 0 0 \n41142 0 0 ... 0 0 \n41143 0 0 ... 0 1 \n\n fuelType_Premium Gas or Electricity fuelType_Premium and Electricity \\\n0 0 0 \n1 0 0 \n2 0 0 \n5 0 0 \n6 0 0 \n... ... ... \n41139 0 0 \n41140 0 0 \n41141 0 0 \n41142 0 0 \n41143 0 0 \n\n fuelType_Premium or E85 fuelType_Regular \\\n0 0 1 \n1 0 1 \n2 0 1 \n5 0 1 \n6 0 0 \n... ... ... \n41139 0 1 \n41140 0 1 \n41141 0 1 \n41142 0 1 \n41143 0 0 \n\n fuelType_Regular Gas and Electricity \\\n0 0 \n1 0 \n2 0 \n5 0 \n6 0 \n... ... \n41139 0 \n41140 0 \n41141 0 \n41142 0 \n41143 0 \n\n fuelType_Regular Gas or Electricity transmission_type_Automatic \\\n0 0 0 \n1 0 0 \n2 0 0 \n5 0 1 \n6 0 0 \n... ... ... \n41139 0 1 \n41140 0 0 \n41141 0 1 \n41142 0 0 \n41143 0 1 \n\n transmission_type_Manual \n0 1 \n1 1 \n2 1 \n5 0 \n6 1 \n... ... \n41139 0 \n41140 1 \n41141 0 \n41142 1 \n41143 0 \n\n[40941 rows x 4125 columns]", | |
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>year</th>\n <th>cylinders</th>\n <th>speeds</th>\n <th>make_AM General</th>\n <th>make_ASC Incorporated</th>\n <th>make_Acura</th>\n <th>make_Alfa Romeo</th>\n <th>make_American Motors Corporation</th>\n <th>make_Aston Martin</th>\n <th>make_Audi</th>\n <th>...</th>\n <th>fuelType_Midgrade</th>\n <th>fuelType_Premium</th>\n <th>fuelType_Premium Gas or Electricity</th>\n <th>fuelType_Premium and Electricity</th>\n <th>fuelType_Premium or E85</th>\n <th>fuelType_Regular</th>\n <th>fuelType_Regular Gas and Electricity</th>\n <th>fuelType_Regular Gas or Electricity</th>\n <th>transmission_type_Automatic</th>\n <th>transmission_type_Manual</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>0.982673</td>\n <td>0.25</td>\n <td>5</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>1</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>...</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>1</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>1</td>\n </tr>\n <tr>\n <th>1</th>\n <td>0.982673</td>\n <td>0.75</td>\n <td>5</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>...</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>1</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>1</td>\n </tr>\n <tr>\n <th>2</th>\n <td>0.982673</td>\n <td>0.25</td>\n <td>5</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>...</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>1</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>1</td>\n </tr>\n <tr>\n <th>5</th>\n <td>0.982673</td>\n <td>0.50</td>\n <td>3</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>...</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>1</td>\n <td>0</td>\n <td>0</td>\n <td>1</td>\n <td>0</td>\n </tr>\n <tr>\n <th>6</th>\n <td>0.986634</td>\n <td>0.25</td>\n <td>5</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>...</td>\n <td>0</td>\n <td>1</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>1</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>41139</th>\n <td>0.986634</td>\n <td>0.25</td>\n <td>4</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>...</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>1</td>\n <td>0</td>\n <td>0</td>\n <td>1</td>\n <td>0</td>\n </tr>\n <tr>\n <th>41140</th>\n <td>0.986634</td>\n <td>0.25</td>\n <td>5</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>...</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>1</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>1</td>\n </tr>\n <tr>\n <th>41141</th>\n <td>0.986634</td>\n <td>0.25</td>\n <td>4</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>...</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>1</td>\n <td>0</td>\n <td>0</td>\n <td>1</td>\n <td>0</td>\n </tr>\n <tr>\n <th>41142</th>\n <td>0.986634</td>\n <td>0.25</td>\n <td>5</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>...</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>1</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>1</td>\n </tr>\n <tr>\n <th>41143</th>\n <td>0.986634</td>\n <td>0.25</td>\n <td>4</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>...</td>\n <td>0</td>\n <td>1</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>1</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>\n<p>40941 rows × 4125 columns</p>\n</div>" | |
}, | |
"execution_count": 36, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# \"Traditional\" pandas\n", | |
"autos_no_electric = autos[autos['fuelType'] != \"Electricity\"]\n", | |
"speeds = autos.transmission.str.extract(r'(\\d)+')\n", | |
"transmission_type = autos.transmission.str.split(' ').str[0]\n", | |
"autos_no_electric['speeds'] = speeds\n", | |
"autos_no_electric['transmission_type'] = transmission_type\n", | |
"numeric_columns = autos_no_electric.select_dtypes(include=np.number)\n", | |
"scaled_numeric_columns = numeric_columns / numeric_columns.max()\n", | |
"for col_name in scaled_numeric_columns.columns:\n", | |
" autos_no_electric[col_name] = scaled_numeric_columns[col_name]\n", | |
"autos_no_electric_dropped = autos_no_electric.drop(\n", | |
" columns=['miles_per_gallon', 'transmission'])\n", | |
"autos_one_hot_encoded = pd.get_dummies(autos_no_electric_dropped,\n", | |
" columns=['make', 'model',\n", | |
" 'drive', 'fuelType',\n", | |
" 'transmission_type'])\n", | |
"autos_one_hot_encoded" | |
], | |
"metadata": { | |
"collapsed": false | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 37, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": " year cylinders speeds make_AM General make_ASC Incorporated \\\n0 0.982673 0.25 0.555556 0 0 \n1 0.982673 0.75 0.555556 0 0 \n2 0.982673 0.25 0.555556 0 0 \n5 0.982673 0.50 0.333333 0 0 \n6 0.986634 0.25 0.555556 0 0 \n... ... ... ... ... ... \n41139 0.986634 0.25 0.444444 0 0 \n41140 0.986634 0.25 0.555556 0 0 \n41141 0.986634 0.25 0.444444 0 0 \n41142 0.986634 0.25 0.555556 0 0 \n41143 0.986634 0.25 0.444444 0 0 \n\n make_Acura make_Alfa Romeo make_American Motors Corporation \\\n0 0 1 0 \n1 0 0 0 \n2 0 0 0 \n5 0 0 0 \n6 0 0 0 \n... ... ... ... \n41139 0 0 0 \n41140 0 0 0 \n41141 0 0 0 \n41142 0 0 0 \n41143 0 0 0 \n\n make_Aston Martin make_Audi ... fuelType_Midgrade fuelType_Premium \\\n0 0 0 ... 0 0 \n1 0 0 ... 0 0 \n2 0 0 ... 0 0 \n5 0 0 ... 0 0 \n6 0 0 ... 0 1 \n... ... ... ... ... ... \n41139 0 0 ... 0 0 \n41140 0 0 ... 0 0 \n41141 0 0 ... 0 0 \n41142 0 0 ... 0 0 \n41143 0 0 ... 0 1 \n\n fuelType_Premium Gas or Electricity fuelType_Premium and Electricity \\\n0 0 0 \n1 0 0 \n2 0 0 \n5 0 0 \n6 0 0 \n... ... ... \n41139 0 0 \n41140 0 0 \n41141 0 0 \n41142 0 0 \n41143 0 0 \n\n fuelType_Premium or E85 fuelType_Regular \\\n0 0 1 \n1 0 1 \n2 0 1 \n5 0 1 \n6 0 0 \n... ... ... \n41139 0 1 \n41140 0 1 \n41141 0 1 \n41142 0 1 \n41143 0 0 \n\n fuelType_Regular Gas and Electricity \\\n0 0 \n1 0 \n2 0 \n5 0 \n6 0 \n... ... \n41139 0 \n41140 0 \n41141 0 \n41142 0 \n41143 0 \n\n fuelType_Regular Gas or Electricity transmission_type_Automatic \\\n0 0 0 \n1 0 0 \n2 0 0 \n5 0 1 \n6 0 0 \n... ... ... \n41139 0 1 \n41140 0 0 \n41141 0 1 \n41142 0 0 \n41143 0 1 \n\n transmission_type_Manual \n0 1 \n1 1 \n2 1 \n5 0 \n6 1 \n... ... \n41139 0 \n41140 1 \n41141 0 \n41142 1 \n41143 0 \n\n[40941 rows x 4125 columns]", | |
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>year</th>\n <th>cylinders</th>\n <th>speeds</th>\n <th>make_AM General</th>\n <th>make_ASC Incorporated</th>\n <th>make_Acura</th>\n <th>make_Alfa Romeo</th>\n <th>make_American Motors Corporation</th>\n <th>make_Aston Martin</th>\n <th>make_Audi</th>\n <th>...</th>\n <th>fuelType_Midgrade</th>\n <th>fuelType_Premium</th>\n <th>fuelType_Premium Gas or Electricity</th>\n <th>fuelType_Premium and Electricity</th>\n <th>fuelType_Premium or E85</th>\n <th>fuelType_Regular</th>\n <th>fuelType_Regular Gas and Electricity</th>\n <th>fuelType_Regular Gas or Electricity</th>\n <th>transmission_type_Automatic</th>\n <th>transmission_type_Manual</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>0.982673</td>\n <td>0.25</td>\n <td>0.555556</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>1</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>...</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>1</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>1</td>\n </tr>\n <tr>\n <th>1</th>\n <td>0.982673</td>\n <td>0.75</td>\n <td>0.555556</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>...</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>1</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>1</td>\n </tr>\n <tr>\n <th>2</th>\n <td>0.982673</td>\n <td>0.25</td>\n <td>0.555556</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>...</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>1</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>1</td>\n </tr>\n <tr>\n <th>5</th>\n <td>0.982673</td>\n <td>0.50</td>\n <td>0.333333</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>...</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>1</td>\n <td>0</td>\n <td>0</td>\n <td>1</td>\n <td>0</td>\n </tr>\n <tr>\n <th>6</th>\n <td>0.986634</td>\n <td>0.25</td>\n <td>0.555556</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>...</td>\n <td>0</td>\n <td>1</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>1</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>41139</th>\n <td>0.986634</td>\n <td>0.25</td>\n <td>0.444444</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>...</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>1</td>\n <td>0</td>\n <td>0</td>\n <td>1</td>\n <td>0</td>\n </tr>\n <tr>\n <th>41140</th>\n <td>0.986634</td>\n <td>0.25</td>\n <td>0.555556</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>...</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>1</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>1</td>\n </tr>\n <tr>\n <th>41141</th>\n <td>0.986634</td>\n <td>0.25</td>\n <td>0.444444</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>...</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>1</td>\n <td>0</td>\n <td>0</td>\n <td>1</td>\n <td>0</td>\n </tr>\n <tr>\n <th>41142</th>\n <td>0.986634</td>\n <td>0.25</td>\n <td>0.555556</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>...</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>1</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>1</td>\n </tr>\n <tr>\n <th>41143</th>\n <td>0.986634</td>\n <td>0.25</td>\n <td>0.444444</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>...</td>\n <td>0</td>\n <td>1</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>1</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>\n<p>40941 rows × 4125 columns</p>\n</div>" | |
}, | |
"execution_count": 37, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Chaining\n", | |
"def scale_numeric_columns(df):\n", | |
" numeric_columns = df.select_dtypes(include=np.number)\n", | |
" scaled_numeric_columns = numeric_columns / numeric_columns.max()\n", | |
" return df.assign(**{col: scaled_numeric_columns[col] for col in scaled_numeric_columns})\n", | |
"\n", | |
"(autos\n", | |
" .query('fuelType != \"Electricity\"')\n", | |
" .assign(speeds=autos.transmission.str.extract(r'(\\d)+').fillna(3).astype(int),\n", | |
" transmission_type=autos.transmission.str.split(' ').str[0])\n", | |
" .pipe(scale_numeric_columns)\n", | |
" .drop(columns=['miles_per_gallon', 'transmission'])\n", | |
" .pipe(pd.get_dummies, columns=['make', 'model', 'drive',\n", | |
" 'fuelType', 'transmission_type'])\n", | |
")" | |
], | |
"metadata": { | |
"collapsed": false | |
} | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.6" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment