Skip to content

Instantly share code, notes, and snippets.

@kshirsagarsiddharth
Created December 4, 2019 19:15
Show Gist options
  • Select an option

  • Save kshirsagarsiddharth/1a6d5c981ea567dd06bc464a7badbbf9 to your computer and use it in GitHub Desktop.

Select an option

Save kshirsagarsiddharth/1a6d5c981ea567dd06bc464a7badbbf9 to your computer and use it in GitHub Desktop.
Created on Cognitive Class Labs
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.impute import SimpleImputer #used for handeling missing data\n",
"from sklearn.preprocessing import LabelEncoder,OneHotEncoder #used for encoding categorical data\n",
"from sklearn.model_selection import train_test_split#used for splitting training and testing data\n",
"from sklearn.preprocessing import StandardScaler #used for feature scaling"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Region</th>\n",
" <th>Age</th>\n",
" <th>Income</th>\n",
" <th>Online Shopper</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>India</td>\n",
" <td>49.0</td>\n",
" <td>86400.0</td>\n",
" <td>No</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Brazil</td>\n",
" <td>32.0</td>\n",
" <td>57600.0</td>\n",
" <td>Yes</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>USA</td>\n",
" <td>35.0</td>\n",
" <td>64800.0</td>\n",
" <td>No</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Brazil</td>\n",
" <td>43.0</td>\n",
" <td>73200.0</td>\n",
" <td>No</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>USA</td>\n",
" <td>45.0</td>\n",
" <td>NaN</td>\n",
" <td>Yes</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>India</td>\n",
" <td>40.0</td>\n",
" <td>69600.0</td>\n",
" <td>Yes</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>Brazil</td>\n",
" <td>NaN</td>\n",
" <td>62400.0</td>\n",
" <td>No</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>India</td>\n",
" <td>53.0</td>\n",
" <td>94800.0</td>\n",
" <td>Yes</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>USA</td>\n",
" <td>55.0</td>\n",
" <td>99600.0</td>\n",
" <td>No</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>India</td>\n",
" <td>42.0</td>\n",
" <td>80400.0</td>\n",
" <td>Yes</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Region Age Income Online Shopper\n",
"0 India 49.0 86400.0 No\n",
"1 Brazil 32.0 57600.0 Yes\n",
"2 USA 35.0 64800.0 No\n",
"3 Brazil 43.0 73200.0 No\n",
"4 USA 45.0 NaN Yes\n",
"5 India 40.0 69600.0 Yes\n",
"6 Brazil NaN 62400.0 No\n",
"7 India 53.0 94800.0 Yes\n",
"8 USA 55.0 99600.0 No\n",
"9 India 42.0 80400.0 Yes"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataset = pd.read_csv(\"https://raw.githubusercontent.com/tarunlnmiit/machine_learning/master/DataPreprocessing.csv\")\n",
"dataset\n",
"dataset2 = pd.read_csv(\"https://raw.githubusercontent.com/tarunlnmiit/machine_learning/master/DataPreprocessing.csv\")\n",
"dataset2"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>India</td>\n",
" <td>49</td>\n",
" <td>86400</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Brazil</td>\n",
" <td>32</td>\n",
" <td>57600</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>USA</td>\n",
" <td>35</td>\n",
" <td>64800</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Brazil</td>\n",
" <td>43</td>\n",
" <td>73200</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>USA</td>\n",
" <td>45</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>India</td>\n",
" <td>40</td>\n",
" <td>69600</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>Brazil</td>\n",
" <td>NaN</td>\n",
" <td>62400</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>India</td>\n",
" <td>53</td>\n",
" <td>94800</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>USA</td>\n",
" <td>55</td>\n",
" <td>99600</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>India</td>\n",
" <td>42</td>\n",
" <td>80400</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 0 1 2\n",
"0 India 49 86400\n",
"1 Brazil 32 57600\n",
"2 USA 35 64800\n",
"3 Brazil 43 73200\n",
"4 USA 45 NaN\n",
"5 India 40 69600\n",
"6 Brazil NaN 62400\n",
"7 India 53 94800\n",
"8 USA 55 99600\n",
"9 India 42 80400"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#splitting the attributes into independent and dependent variables\n",
"X = dataset.loc[:,'Region':'Income'].values #independent variables\n",
"Y = dataset.loc[:,'Online Shopper'].values #dependent variables\n",
"df23 = pd.DataFrame(X)\n",
"df23"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"#handeling the missing data and replace missing value with nan from numpy and replace with mean of all the other values\n",
"imputer = SimpleImputer(missing_values = np.nan,strategy = 'mean')\n",
"imputer = imputer.fit(X[:,1:])\n",
"X[:,1:] = imputer.transform(X[:,1:])"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/jupyterlab/conda/envs/python/lib/python3.6/site-packages/sklearn/preprocessing/_encoders.py:368: FutureWarning: The handling of integer data will change in version 0.22. Currently, the categories are determined based on the range [0, max(values)], while in the future they will be determined based on the unique values.\n",
"If you want the future behaviour and silence this warning, you can specify \"categories='auto'\".\n",
"In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.\n",
" warnings.warn(msg, FutureWarning)\n",
"/home/jupyterlab/conda/envs/python/lib/python3.6/site-packages/sklearn/preprocessing/_encoders.py:390: DeprecationWarning: The 'categorical_features' keyword is deprecated in version 0.20 and will be removed in 0.22. You can use the ColumnTransformer instead.\n",
" \"use the ColumnTransformer instead.\", DeprecationWarning)\n"
]
}
],
"source": [
"#encode categorical data\n",
"#in this we can observe that first column in training dataset is \n",
"#is categorical data hence we convert this into numerical value\n",
"#label encoder converts the catogerical data into numbers 0,1,2 but the algorithm will confuse \n",
"#hence we use one hot encoder to encode this data this converts this data into diffrent columns\n",
"from sklearn.preprocessing import LabelEncoder,OneHotEncoder\n",
"labelencoder_X = LabelEncoder()\n",
"X[:,0] = labelencoder_X.fit_transform(X[:,0])# usa india brazil into numerical labels\n",
"onehotencoder = OneHotEncoder(categorical_features=[0])\n",
"X = onehotencoder.fit_transform(X).toarray()\n",
"\n",
"labelencoder_Y = LabelEncoder()\n",
"Y = labelencoder_Y.fit_transform(Y)\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"#splitting the data into train and test set\n",
"X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size = 0.2,random_state = 0)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'Standardscaler' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-9-e9765f50f06a>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 2\u001b[0m from 32 to 55 and the salaries going from 57.6 K to like 99.6 K.\"\"\"\n\u001b[1;32m 3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0msc_X\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mStandardscaler\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5\u001b[0m \u001b[0mX_train\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msc_X\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit_transform\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_train\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0mX_test\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msc_X\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtransform\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_test\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mNameError\u001b[0m: name 'Standardscaler' is not defined"
]
}
],
"source": [
"\"\"\"As you can see we have these two columns age and income that contains numerical numbers. You notice that the variables are not on the same scale because the age are going\n",
"from 32 to 55 and the salaries going from 57.6 K to like 99.6 K.\"\"\"\n",
"\n",
"sc_X = Standardscaler()\n",
"X_train = sc_X.fit_transform(X_train)\n",
"X_test = sc_X.transform(X_test)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python",
"language": "python",
"name": "conda-env-python-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.7"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment