kshirsagarsiddharth · December 4, 2019 19:15
diff --git a/sikitlearn_preprocessing.ipynb b/sikitlearn_preprocessing.ipynb
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.impute import SimpleImputer #used for handeling missing data\n",
    "from sklearn.preprocessing import LabelEncoder,OneHotEncoder #used for encoding categorical data\n",
    "from sklearn.model_selection import train_test_split#used for splitting training and testing data\n",
    "from sklearn.preprocessing import StandardScaler #used for feature scaling"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Region</th>\n",
       "      <th>Age</th>\n",
       "      <th>Income</th>\n",
       "      <th>Online Shopper</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>India</td>\n",
       "      <td>49.0</td>\n",
       "      <td>86400.0</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Brazil</td>\n",
       "      <td>32.0</td>\n",
       "      <td>57600.0</td>\n",
       "      <td>Yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>USA</td>\n",
       "      <td>35.0</td>\n",
       "      <td>64800.0</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Brazil</td>\n",
       "      <td>43.0</td>\n",
       "      <td>73200.0</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>USA</td>\n",
       "      <td>45.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>India</td>\n",
       "      <td>40.0</td>\n",
       "      <td>69600.0</td>\n",
       "      <td>Yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>Brazil</td>\n",
       "      <td>NaN</td>\n",
       "      <td>62400.0</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>India</td>\n",
       "      <td>53.0</td>\n",
       "      <td>94800.0</td>\n",
       "      <td>Yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>USA</td>\n",
       "      <td>55.0</td>\n",
       "      <td>99600.0</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>India</td>\n",
       "      <td>42.0</td>\n",
       "      <td>80400.0</td>\n",
       "      <td>Yes</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Region   Age   Income Online Shopper\n",
       "0   India  49.0  86400.0             No\n",
       "1  Brazil  32.0  57600.0            Yes\n",
       "2     USA  35.0  64800.0             No\n",
       "3  Brazil  43.0  73200.0             No\n",
       "4     USA  45.0      NaN            Yes\n",
       "5   India  40.0  69600.0            Yes\n",
       "6  Brazil   NaN  62400.0             No\n",
       "7   India  53.0  94800.0            Yes\n",
       "8     USA  55.0  99600.0             No\n",
       "9   India  42.0  80400.0            Yes"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset = pd.read_csv(\"https://raw.githubusercontent.com/tarunlnmiit/machine_learning/master/DataPreprocessing.csv\")\n",
    "dataset\n",
    "dataset2 = pd.read_csv(\"https://raw.githubusercontent.com/tarunlnmiit/machine_learning/master/DataPreprocessing.csv\")\n",
    "dataset2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>India</td>\n",
       "      <td>49</td>\n",
       "      <td>86400</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Brazil</td>\n",
       "      <td>32</td>\n",
       "      <td>57600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>USA</td>\n",
       "      <td>35</td>\n",
       "      <td>64800</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Brazil</td>\n",
       "      <td>43</td>\n",
       "      <td>73200</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>USA</td>\n",
       "      <td>45</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>India</td>\n",
       "      <td>40</td>\n",
       "      <td>69600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>Brazil</td>\n",
       "      <td>NaN</td>\n",
       "      <td>62400</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>India</td>\n",
       "      <td>53</td>\n",
       "      <td>94800</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>USA</td>\n",
       "      <td>55</td>\n",
       "      <td>99600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>India</td>\n",
       "      <td>42</td>\n",
       "      <td>80400</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        0    1      2\n",
       "0   India   49  86400\n",
       "1  Brazil   32  57600\n",
       "2     USA   35  64800\n",
       "3  Brazil   43  73200\n",
       "4     USA   45    NaN\n",
       "5   India   40  69600\n",
       "6  Brazil  NaN  62400\n",
       "7   India   53  94800\n",
       "8     USA   55  99600\n",
       "9   India   42  80400"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#splitting the attributes into independent and dependent variables\n",
    "X = dataset.loc[:,'Region':'Income'].values #independent variables\n",
    "Y = dataset.loc[:,'Online Shopper'].values #dependent variables\n",
    "df23 = pd.DataFrame(X)\n",
    "df23"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "#handeling the missing data and replace missing value with nan from numpy and replace with mean of all the other values\n",
    "imputer = SimpleImputer(missing_values = np.nan,strategy = 'mean')\n",
    "imputer = imputer.fit(X[:,1:])\n",
    "X[:,1:] = imputer.transform(X[:,1:])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/jupyterlab/conda/envs/python/lib/python3.6/site-packages/sklearn/preprocessing/_encoders.py:368: FutureWarning: The handling of integer data will change in version 0.22. Currently, the categories are determined based on the range [0, max(values)], while in the future they will be determined based on the unique values.\n",
      "If you want the future behaviour and silence this warning, you can specify \"categories='auto'\".\n",
      "In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.\n",
      "  warnings.warn(msg, FutureWarning)\n",
      "/home/jupyterlab/conda/envs/python/lib/python3.6/site-packages/sklearn/preprocessing/_encoders.py:390: DeprecationWarning: The 'categorical_features' keyword is deprecated in version 0.20 and will be removed in 0.22. You can use the ColumnTransformer instead.\n",
      "  \"use the ColumnTransformer instead.\", DeprecationWarning)\n"
     ]
    }
   ],
   "source": [
    "#encode categorical data\n",
    "#in this we can observe that first column in training dataset is \n",
    "#is categorical data hence we convert this into numerical  value\n",
    "#label encoder converts the catogerical data into numbers 0,1,2 but the algorithm will confuse \n",
    "#hence we use one hot encoder to encode this data this converts this data into diffrent columns\n",
    "from sklearn.preprocessing import LabelEncoder,OneHotEncoder\n",
    "labelencoder_X = LabelEncoder()\n",
    "X[:,0] = labelencoder_X.fit_transform(X[:,0])# usa india brazil into numerical labels\n",
    "onehotencoder = OneHotEncoder(categorical_features=[0])\n",
    "X = onehotencoder.fit_transform(X).toarray()\n",
    "\n",
    "labelencoder_Y = LabelEncoder()\n",
    "Y = labelencoder_Y.fit_transform(Y)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "#splitting the data into train and test set\n",
    "X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size = 0.2,random_state = 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'Standardscaler' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-9-e9765f50f06a>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      2\u001b[0m from 32 to 55 and the salaries going from 57.6 K to like 99.6 K.\"\"\"\n\u001b[1;32m      3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0msc_X\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mStandardscaler\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      5\u001b[0m \u001b[0mX_train\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msc_X\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit_transform\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_train\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      6\u001b[0m \u001b[0mX_test\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msc_X\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtransform\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_test\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mNameError\u001b[0m: name 'Standardscaler' is not defined"
     ]
    }
   ],
   "source": [
    "\"\"\"As you can see we have these two columns age and income that contains numerical numbers. You notice that the variables are not on the same scale because the age are going\n",
    "from 32 to 55 and the salaries going from 57.6 K to like 99.6 K.\"\"\"\n",
    "\n",
    "sc_X = Standardscaler()\n",
    "X_train = sc_X.fit_transform(X_train)\n",
    "X_test = sc_X.transform(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python",
   "language": "python",
   "name": "conda-env-python-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
 }
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {},
	"outputs": [],
	"source": [
	"import numpy as np\n",
	"import pandas as pd"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {},
	"outputs": [],
	"source": [
	"from sklearn.impute import SimpleImputer #used for handeling missing data\n",
	"from sklearn.preprocessing import LabelEncoder,OneHotEncoder #used for encoding categorical data\n",
	"from sklearn.model_selection import train_test_split#used for splitting training and testing data\n",
	"from sklearn.preprocessing import StandardScaler #used for feature scaling"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>Region</th>\n",
	" <th>Age</th>\n",
	" <th>Income</th>\n",
	" <th>Online Shopper</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>India</td>\n",
	" <td>49.0</td>\n",
	" <td>86400.0</td>\n",
	" <td>No</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>Brazil</td>\n",
	" <td>32.0</td>\n",
	" <td>57600.0</td>\n",
	" <td>Yes</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>USA</td>\n",
	" <td>35.0</td>\n",
	" <td>64800.0</td>\n",
	" <td>No</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3</th>\n",
	" <td>Brazil</td>\n",
	" <td>43.0</td>\n",
	" <td>73200.0</td>\n",
	" <td>No</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>4</th>\n",
	" <td>USA</td>\n",
	" <td>45.0</td>\n",
	" <td>NaN</td>\n",
	" <td>Yes</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>5</th>\n",
	" <td>India</td>\n",
	" <td>40.0</td>\n",
	" <td>69600.0</td>\n",
	" <td>Yes</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>6</th>\n",
	" <td>Brazil</td>\n",
	" <td>NaN</td>\n",
	" <td>62400.0</td>\n",
	" <td>No</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>7</th>\n",
	" <td>India</td>\n",
	" <td>53.0</td>\n",
	" <td>94800.0</td>\n",
	" <td>Yes</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>8</th>\n",
	" <td>USA</td>\n",
	" <td>55.0</td>\n",
	" <td>99600.0</td>\n",
	" <td>No</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>9</th>\n",
	" <td>India</td>\n",
	" <td>42.0</td>\n",
	" <td>80400.0</td>\n",
	" <td>Yes</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" Region Age Income Online Shopper\n",
	"0 India 49.0 86400.0 No\n",
	"1 Brazil 32.0 57600.0 Yes\n",
	"2 USA 35.0 64800.0 No\n",
	"3 Brazil 43.0 73200.0 No\n",
	"4 USA 45.0 NaN Yes\n",
	"5 India 40.0 69600.0 Yes\n",
	"6 Brazil NaN 62400.0 No\n",
	"7 India 53.0 94800.0 Yes\n",
	"8 USA 55.0 99600.0 No\n",
	"9 India 42.0 80400.0 Yes"
	]
	},
	"execution_count": 3,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"dataset = pd.read_csv(\"https://raw.githubusercontent.com/tarunlnmiit/machine_learning/master/DataPreprocessing.csv\")\n",
	"dataset\n",
	"dataset2 = pd.read_csv(\"https://raw.githubusercontent.com/tarunlnmiit/machine_learning/master/DataPreprocessing.csv\")\n",
	"dataset2"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>0</th>\n",
	" <th>1</th>\n",
	" <th>2</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>India</td>\n",
	" <td>49</td>\n",
	" <td>86400</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>Brazil</td>\n",
	" <td>32</td>\n",
	" <td>57600</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>USA</td>\n",
	" <td>35</td>\n",
	" <td>64800</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3</th>\n",
	" <td>Brazil</td>\n",
	" <td>43</td>\n",
	" <td>73200</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>4</th>\n",
	" <td>USA</td>\n",
	" <td>45</td>\n",
	" <td>NaN</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>5</th>\n",
	" <td>India</td>\n",
	" <td>40</td>\n",
	" <td>69600</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>6</th>\n",
	" <td>Brazil</td>\n",
	" <td>NaN</td>\n",
	" <td>62400</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>7</th>\n",
	" <td>India</td>\n",
	" <td>53</td>\n",
	" <td>94800</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>8</th>\n",
	" <td>USA</td>\n",
	" <td>55</td>\n",
	" <td>99600</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>9</th>\n",
	" <td>India</td>\n",
	" <td>42</td>\n",
	" <td>80400</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" 0 1 2\n",
	"0 India 49 86400\n",
	"1 Brazil 32 57600\n",
	"2 USA 35 64800\n",
	"3 Brazil 43 73200\n",
	"4 USA 45 NaN\n",
	"5 India 40 69600\n",
	"6 Brazil NaN 62400\n",
	"7 India 53 94800\n",
	"8 USA 55 99600\n",
	"9 India 42 80400"
	]
	},
	"execution_count": 4,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"#splitting the attributes into independent and dependent variables\n",
	"X = dataset.loc[:,'Region':'Income'].values #independent variables\n",
	"Y = dataset.loc[:,'Online Shopper'].values #dependent variables\n",
	"df23 = pd.DataFrame(X)\n",
	"df23"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": []
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {},
	"outputs": [],
	"source": [
	"#handeling the missing data and replace missing value with nan from numpy and replace with mean of all the other values\n",
	"imputer = SimpleImputer(missing_values = np.nan,strategy = 'mean')\n",
	"imputer = imputer.fit(X[:,1:])\n",
	"X[:,1:] = imputer.transform(X[:,1:])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {},
	"outputs": [
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"/home/jupyterlab/conda/envs/python/lib/python3.6/site-packages/sklearn/preprocessing/_encoders.py:368: FutureWarning: The handling of integer data will change in version 0.22. Currently, the categories are determined based on the range [0, max(values)], while in the future they will be determined based on the unique values.\n",
	"If you want the future behaviour and silence this warning, you can specify \"categories='auto'\".\n",
	"In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.\n",
	" warnings.warn(msg, FutureWarning)\n",
	"/home/jupyterlab/conda/envs/python/lib/python3.6/site-packages/sklearn/preprocessing/_encoders.py:390: DeprecationWarning: The 'categorical_features' keyword is deprecated in version 0.20 and will be removed in 0.22. You can use the ColumnTransformer instead.\n",
	" \"use the ColumnTransformer instead.\", DeprecationWarning)\n"
	]
	}
	],
	"source": [
	"#encode categorical data\n",
	"#in this we can observe that first column in training dataset is \n",
	"#is categorical data hence we convert this into numerical value\n",
	"#label encoder converts the catogerical data into numbers 0,1,2 but the algorithm will confuse \n",
	"#hence we use one hot encoder to encode this data this converts this data into diffrent columns\n",
	"from sklearn.preprocessing import LabelEncoder,OneHotEncoder\n",
	"labelencoder_X = LabelEncoder()\n",
	"X[:,0] = labelencoder_X.fit_transform(X[:,0])# usa india brazil into numerical labels\n",
	"onehotencoder = OneHotEncoder(categorical_features=[0])\n",
	"X = onehotencoder.fit_transform(X).toarray()\n",
	"\n",
	"labelencoder_Y = LabelEncoder()\n",
	"Y = labelencoder_Y.fit_transform(Y)\n",
	"\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"metadata": {},
	"outputs": [],
	"source": [
	"#splitting the data into train and test set\n",
	"X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size = 0.2,random_state = 0)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"metadata": {},
	"outputs": [
	{
	"ename": "NameError",
	"evalue": "name 'Standardscaler' is not defined",
	"output_type": "error",
	"traceback": [
	"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
	"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
	"\u001b[0;32m<ipython-input-9-e9765f50f06a>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 2\u001b[0m from 32 to 55 and the salaries going from 57.6 K to like 99.6 K.\"\"\"\n\u001b[1;32m 3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0msc_X\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mStandardscaler\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5\u001b[0m \u001b[0mX_train\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msc_X\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit_transform\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_train\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0mX_test\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msc_X\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtransform\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_test\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
	"\u001b[0;31mNameError\u001b[0m: name 'Standardscaler' is not defined"
	]
	}
	],
	"source": [
	"\"\"\"As you can see we have these two columns age and income that contains numerical numbers. You notice that the variables are not on the same scale because the age are going\n",
	"from 32 to 55 and the salaries going from 57.6 K to like 99.6 K.\"\"\"\n",
	"\n",
	"sc_X = Standardscaler()\n",
	"X_train = sc_X.fit_transform(X_train)\n",
	"X_test = sc_X.transform(X_test)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python",
	"language": "python",
	"name": "conda-env-python-py"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.6.7"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 4
	}
No results found