kitmonisit · May 20, 2018 13:24
diff --git a/50_Startups.csv b/50_Startups.csv
diff --git a/Part 2.5.ipynb b/Part 2.5.ipynb
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import pickle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>R&amp;D Spend</th>\n",
       "      <th>Administration</th>\n",
       "      <th>Marketing Spend</th>\n",
       "      <th>State</th>\n",
       "      <th>Profit</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>165349.20</td>\n",
       "      <td>136897.80</td>\n",
       "      <td>471784.10</td>\n",
       "      <td>New York</td>\n",
       "      <td>192261.83</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>162597.70</td>\n",
       "      <td>151377.59</td>\n",
       "      <td>443898.53</td>\n",
       "      <td>California</td>\n",
       "      <td>191792.06</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>153441.51</td>\n",
       "      <td>101145.55</td>\n",
       "      <td>407934.54</td>\n",
       "      <td>Florida</td>\n",
       "      <td>191050.39</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>144372.41</td>\n",
       "      <td>118671.85</td>\n",
       "      <td>383199.62</td>\n",
       "      <td>New York</td>\n",
       "      <td>182901.99</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>142107.34</td>\n",
       "      <td>91391.77</td>\n",
       "      <td>366168.42</td>\n",
       "      <td>Florida</td>\n",
       "      <td>166187.94</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   R&D Spend  Administration  Marketing Spend       State     Profit\n",
       "0  165349.20       136897.80        471784.10    New York  192261.83\n",
       "1  162597.70       151377.59        443898.53  California  191792.06\n",
       "2  153441.51       101145.55        407934.54     Florida  191050.39\n",
       "3  144372.41       118671.85        383199.62    New York  182901.99\n",
       "4  142107.34        91391.77        366168.42     Florida  166187.94"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.read_csv('./50_Startups.csv')\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>State_Florida</th>\n",
       "      <th>State_New York</th>\n",
       "      <th>R&amp;D Spend</th>\n",
       "      <th>Administration</th>\n",
       "      <th>Marketing Spend</th>\n",
       "      <th>Profit</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>165349.20</td>\n",
       "      <td>136897.80</td>\n",
       "      <td>471784.10</td>\n",
       "      <td>192261.83</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>162597.70</td>\n",
       "      <td>151377.59</td>\n",
       "      <td>443898.53</td>\n",
       "      <td>191792.06</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>153441.51</td>\n",
       "      <td>101145.55</td>\n",
       "      <td>407934.54</td>\n",
       "      <td>191050.39</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>144372.41</td>\n",
       "      <td>118671.85</td>\n",
       "      <td>383199.62</td>\n",
       "      <td>182901.99</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>142107.34</td>\n",
       "      <td>91391.77</td>\n",
       "      <td>366168.42</td>\n",
       "      <td>166187.94</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   State_Florida  State_New York  R&D Spend  Administration  Marketing Spend  \\\n",
       "0            0.0             1.0  165349.20       136897.80        471784.10   \n",
       "1            0.0             0.0  162597.70       151377.59        443898.53   \n",
       "2            1.0             0.0  153441.51       101145.55        407934.54   \n",
       "3            0.0             1.0  144372.41       118671.85        383199.62   \n",
       "4            1.0             0.0  142107.34        91391.77        366168.42   \n",
       "\n",
       "      Profit  \n",
       "0  192261.83  \n",
       "1  191792.06  \n",
       "2  191050.39  \n",
       "3  182901.99  \n",
       "4  166187.94  "
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn_pandas import DataFrameMapper\n",
    "from sklearn.preprocessing import LabelBinarizer\n",
    "mapper = DataFrameMapper(\n",
    "        [('State',\n",
    "          LabelBinarizer()\n",
    "         )\n",
    "        ],\n",
    "    default=None,\n",
    "    df_out=True\n",
    "    )\n",
    "df = mapper.fit_transform(df)\n",
    "df = df[df.columns[1:]]\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>State_Florida</th>\n",
       "      <th>State_New York</th>\n",
       "      <th>R&amp;D Spend</th>\n",
       "      <th>Administration</th>\n",
       "      <th>Marketing Spend</th>\n",
       "      <th>Profit</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>165349.20</td>\n",
       "      <td>136897.80</td>\n",
       "      <td>471784.10</td>\n",
       "      <td>192261.83</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>162597.70</td>\n",
       "      <td>151377.59</td>\n",
       "      <td>443898.53</td>\n",
       "      <td>191792.06</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>144372.41</td>\n",
       "      <td>118671.85</td>\n",
       "      <td>383199.62</td>\n",
       "      <td>182901.99</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>131876.90</td>\n",
       "      <td>99814.71</td>\n",
       "      <td>362861.36</td>\n",
       "      <td>156991.12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>134615.46</td>\n",
       "      <td>147198.87</td>\n",
       "      <td>127716.82</td>\n",
       "      <td>156122.51</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   State_Florida  State_New York  R&D Spend  Administration  Marketing Spend  \\\n",
       "0            0.0             1.0  165349.20       136897.80        471784.10   \n",
       "1            0.0             0.0  162597.70       151377.59        443898.53   \n",
       "3            0.0             1.0  144372.41       118671.85        383199.62   \n",
       "5            0.0             1.0  131876.90        99814.71        362861.36   \n",
       "6            0.0             0.0  134615.46       147198.87        127716.82   \n",
       "\n",
       "      Profit  \n",
       "0  192261.83  \n",
       "1  191792.06  \n",
       "3  182901.99  \n",
       "5  156991.12  \n",
       "6  156122.51  "
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "idx_train, idx_test = train_test_split(\n",
    "    df.index,\n",
    "    test_size=0.2,\n",
    "    random_state=0)\n",
    "df_train = df.iloc[idx_train].sort_index()\n",
    "df_test = df.iloc[idx_test].sort_index()\n",
    "df_train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/kit/UNIX/virtualenvs/engg/lib/python3.6/site-packages/sklearn/linear_model/base.py:509: RuntimeWarning: internal gelsd driver lwork query error, required iwork dimension not returned. This is likely the result of LAPACK bug 0038, fixed in LAPACK 3.2.2 (released July 21, 2010). Falling back to 'gelss' driver.\n",
      "  linalg.lstsq(X, y)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.linear_model import LinearRegression\n",
    "regressor = LinearRegression()\n",
    "regressor.fit(\n",
    "    df_train[df.columns[:-1]].values,\n",
    "    df_train[[df.columns[-1]]].values)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[178537.48221054],\n",
       "       [167921.0656955 ],\n",
       "       [132447.73845175],\n",
       "       [132582.27760815],\n",
       "       [113969.43533012],\n",
       "       [116161.24230165],\n",
       "       [103015.20159797],\n",
       "       [ 98791.73374688],\n",
       "       [ 67851.69209676],\n",
       "       [ 71976.09851258]])"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y_pred = regressor.predict(\n",
    "    df_test[df.columns[:-1]].values)\n",
    "y_pred"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Profit</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>191050.39</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>166187.94</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>146121.95</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>144259.40</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>110352.25</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>105008.31</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>103282.38</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>31</th>\n",
       "      <td>97483.56</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>38</th>\n",
       "      <td>81229.06</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41</th>\n",
       "      <td>77798.83</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       Profit\n",
       "2   191050.39\n",
       "4   166187.94\n",
       "10  146121.95\n",
       "11  144259.40\n",
       "22  110352.25\n",
       "27  105008.31\n",
       "28  103282.38\n",
       "31   97483.56\n",
       "38   81229.06\n",
       "41   77798.83"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_test[['Profit']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Ones</th>\n",
       "      <th>State_Florida</th>\n",
       "      <th>State_New York</th>\n",
       "      <th>R&amp;D Spend</th>\n",
       "      <th>Administration</th>\n",
       "      <th>Marketing Spend</th>\n",
       "      <th>Profit</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>165349.20</td>\n",
       "      <td>136897.80</td>\n",
       "      <td>471784.10</td>\n",
       "      <td>192261.83</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>162597.70</td>\n",
       "      <td>151377.59</td>\n",
       "      <td>443898.53</td>\n",
       "      <td>191792.06</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>153441.51</td>\n",
       "      <td>101145.55</td>\n",
       "      <td>407934.54</td>\n",
       "      <td>191050.39</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>144372.41</td>\n",
       "      <td>118671.85</td>\n",
       "      <td>383199.62</td>\n",
       "      <td>182901.99</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>142107.34</td>\n",
       "      <td>91391.77</td>\n",
       "      <td>366168.42</td>\n",
       "      <td>166187.94</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Ones  State_Florida  State_New York  R&D Spend  Administration  \\\n",
       "0     1            0.0             1.0  165349.20       136897.80   \n",
       "1     1            0.0             0.0  162597.70       151377.59   \n",
       "2     1            1.0             0.0  153441.51       101145.55   \n",
       "3     1            0.0             1.0  144372.41       118671.85   \n",
       "4     1            1.0             0.0  142107.34        91391.77   \n",
       "\n",
       "   Marketing Spend     Profit  \n",
       "0        471784.10  192261.83  \n",
       "1        443898.53  191792.06  \n",
       "2        407934.54  191050.39  \n",
       "3        383199.62  182901.99  \n",
       "4        366168.42  166187.94  "
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import statsmodels.formula.api as sm\n",
    "df.insert(0, 'Ones', 1)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                            OLS Regression Results                            \n",
      "==============================================================================\n",
      "Dep. Variable:                      y   R-squared:                       0.947\n",
      "Model:                            OLS   Adj. R-squared:                  0.945\n",
      "Method:                 Least Squares   F-statistic:                     849.8\n",
      "Date:                Sun, 20 May 2018   Prob (F-statistic):           3.50e-32\n",
      "Time:                        21:12:28   Log-Likelihood:                -527.44\n",
      "No. Observations:                  50   AIC:                             1059.\n",
      "Df Residuals:                      48   BIC:                             1063.\n",
      "Df Model:                           1                                         \n",
      "Covariance Type:            nonrobust                                         \n",
      "==============================================================================\n",
      "                 coef    std err          t      P>|t|      [0.025      0.975]\n",
      "------------------------------------------------------------------------------\n",
      "Ones        4.903e+04   2537.897     19.320      0.000    4.39e+04    5.41e+04\n",
      "R&D Spend      0.8543      0.029     29.151      0.000       0.795       0.913\n",
      "==============================================================================\n",
      "Omnibus:                       13.727   Durbin-Watson:                   1.116\n",
      "Prob(Omnibus):                  0.001   Jarque-Bera (JB):               18.536\n",
      "Skew:                          -0.911   Prob(JB):                     9.44e-05\n",
      "Kurtosis:                       5.361   Cond. No.                     1.65e+05\n",
      "==============================================================================\n",
      "\n",
      "Warnings:\n",
      "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n",
      "[2] The condition number is large, 1.65e+05. This might indicate that there are\n",
      "strong multicollinearity or other numerical problems.\n"
     ]
    }
   ],
   "source": [
    "def backwardElimination(df, SL):\n",
    "    df_opt = df.copy()\n",
    "    inVars = df.columns.tolist()[:-2]\n",
    "    numVars = len(inVars)\n",
    "    while True:\n",
    "        regressor = sm.OLS(\n",
    "            endog=df_opt[['Profit']].values,\n",
    "            exog=df_opt[inVars]).fit()\n",
    "        if regressor.pvalues.max() < SL:\n",
    "            break\n",
    "        inVars.remove(regressor.pvalues.idxmax())\n",
    "    cols = inVars\n",
    "    cols.append(df.columns[-1])\n",
    "    print(regressor.summary())\n",
    "    return df_opt[cols]\n",
    "\n",
    "df_opt = backwardElimination(df, 0.05)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[104667.27805998],\n",
       "       [134150.83410578],\n",
       "       [135207.80019517],\n",
       "       [ 72170.54428856],\n",
       "       [179090.58602508],\n",
       "       [109824.77386586],\n",
       "       [ 65644.27773757],\n",
       "       [100481.43277139],\n",
       "       [111431.75202432],\n",
       "       [169438.14843539]])"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "idx_train, idx_test = train_test_split(\n",
    "    df_opt.index,\n",
    "    test_size=0.2,\n",
    "    random_state=0)\n",
    "\n",
    "df_train = df_opt.iloc[idx_train]\n",
    "df_test = df_opt.iloc[idx_test]\n",
    "\n",
    "regressor = LinearRegression()\n",
    "regressor.fit(\n",
    "    df_train[df_train.columns[:-1]].values,\n",
    "    df_train[df_train.columns[-1:]].values)\n",
    "\n",
    "regressor.predict(\n",
    "    df_test[df_test.columns[:-1]])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Profit</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>103282.38</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>144259.40</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>146121.95</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41</th>\n",
       "      <td>77798.83</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>191050.39</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>105008.31</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>38</th>\n",
       "      <td>81229.06</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>31</th>\n",
       "      <td>97483.56</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>110352.25</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>166187.94</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       Profit\n",
       "28  103282.38\n",
       "11  144259.40\n",
       "10  146121.95\n",
       "41   77798.83\n",
       "2   191050.39\n",
       "27  105008.31\n",
       "38   81229.06\n",
       "31   97483.56\n",
       "22  110352.25\n",
       "4   166187.94"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_test[df_test.columns[-1:]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
 }
R&D Spend	Administration	Marketing Spend	State	Profit
165349.2	136897.8	471784.1	New York	192261.83
162597.7	151377.59	443898.53	California	191792.06
153441.51	101145.55	407934.54	Florida	191050.39
144372.41	118671.85	383199.62	New York	182901.99
142107.34	91391.77	366168.42	Florida	166187.94
131876.9	99814.71	362861.36	New York	156991.12
134615.46	147198.87	127716.82	California	156122.51
130298.13	145530.06	323876.68	Florida	155752.6
120542.52	148718.95	311613.29	New York	152211.77
123334.88	108679.17	304981.62	California	149759.96
101913.08	110594.11	229160.95	Florida	146121.95
100671.96	91790.61	249744.55	California	144259.4
93863.75	127320.38	249839.44	Florida	141585.52
91992.39	135495.07	252664.93	California	134307.35
119943.24	156547.42	256512.92	Florida	132602.65
114523.61	122616.84	261776.23	New York	129917.04
78013.11	121597.55	264346.06	California	126992.93
94657.16	145077.58	282574.31	New York	125370.37
91749.16	114175.79	294919.57	Florida	124266.9
86419.7	153514.11	0	New York	122776.86
76253.86	113867.3	298664.47	California	118474.03
78389.47	153773.43	299737.29	New York	111313.02
73994.56	122782.75	303319.26	Florida	110352.25
67532.53	105751.03	304768.73	Florida	108733.99
77044.01	99281.34	140574.81	New York	108552.04
64664.71	139553.16	137962.62	California	107404.34
75328.87	144135.98	134050.07	Florida	105733.54
72107.6	127864.55	353183.81	New York	105008.31
66051.52	182645.56	118148.2	Florida	103282.38
65605.48	153032.06	107138.38	New York	101004.64
61994.48	115641.28	91131.24	Florida	99937.59
61136.38	152701.92	88218.23	New York	97483.56
63408.86	129219.61	46085.25	California	97427.84
55493.95	103057.49	214634.81	Florida	96778.92
46426.07	157693.92	210797.67	California	96712.8
46014.02	85047.44	205517.64	New York	96479.51
28663.76	127056.21	201126.82	Florida	90708.19
44069.95	51283.14	197029.42	California	89949.14
20229.59	65947.93	185265.1	New York	81229.06
38558.51	82982.09	174999.3	California	81005.76
28754.33	118546.05	172795.67	California	78239.91
27892.92	84710.77	164470.71	Florida	77798.83
23640.93	96189.63	148001.11	California	71498.49
15505.73	127382.3	35534.17	New York	69758.98
22177.74	154806.14	28334.72	California	65200.33
1000.23	124153.04	1903.93	New York	64926.08
1315.46	115816.21	297114.46	Florida	49490.75
0	135426.92	0	California	42559.73
542.05	51743.15	0	New York	35673.41
0	116983.8	45173.06	California	14681.4
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {},
	"outputs": [],
	"source": [
	"import numpy as np\n",
	"import pandas as pd\n",
	"import pickle"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>R&D Spend</th>\n",
	" <th>Administration</th>\n",
	" <th>Marketing Spend</th>\n",
	" <th>State</th>\n",
	" <th>Profit</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>165349.20</td>\n",
	" <td>136897.80</td>\n",
	" <td>471784.10</td>\n",
	" <td>New York</td>\n",
	" <td>192261.83</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>162597.70</td>\n",
	" <td>151377.59</td>\n",
	" <td>443898.53</td>\n",
	" <td>California</td>\n",
	" <td>191792.06</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>153441.51</td>\n",
	" <td>101145.55</td>\n",
	" <td>407934.54</td>\n",
	" <td>Florida</td>\n",
	" <td>191050.39</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3</th>\n",
	" <td>144372.41</td>\n",
	" <td>118671.85</td>\n",
	" <td>383199.62</td>\n",
	" <td>New York</td>\n",
	" <td>182901.99</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>4</th>\n",
	" <td>142107.34</td>\n",
	" <td>91391.77</td>\n",
	" <td>366168.42</td>\n",
	" <td>Florida</td>\n",
	" <td>166187.94</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" R&D Spend Administration Marketing Spend State Profit\n",
	"0 165349.20 136897.80 471784.10 New York 192261.83\n",
	"1 162597.70 151377.59 443898.53 California 191792.06\n",
	"2 153441.51 101145.55 407934.54 Florida 191050.39\n",
	"3 144372.41 118671.85 383199.62 New York 182901.99\n",
	"4 142107.34 91391.77 366168.42 Florida 166187.94"
	]
	},
	"execution_count": 2,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"df = pd.read_csv('./50_Startups.csv')\n",
	"df.head()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>State_Florida</th>\n",
	" <th>State_New York</th>\n",
	" <th>R&D Spend</th>\n",
	" <th>Administration</th>\n",
	" <th>Marketing Spend</th>\n",
	" <th>Profit</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>0.0</td>\n",
	" <td>1.0</td>\n",
	" <td>165349.20</td>\n",
	" <td>136897.80</td>\n",
	" <td>471784.10</td>\n",
	" <td>192261.83</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>0.0</td>\n",
	" <td>0.0</td>\n",
	" <td>162597.70</td>\n",
	" <td>151377.59</td>\n",
	" <td>443898.53</td>\n",
	" <td>191792.06</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>1.0</td>\n",
	" <td>0.0</td>\n",
	" <td>153441.51</td>\n",
	" <td>101145.55</td>\n",
	" <td>407934.54</td>\n",
	" <td>191050.39</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3</th>\n",
	" <td>0.0</td>\n",
	" <td>1.0</td>\n",
	" <td>144372.41</td>\n",
	" <td>118671.85</td>\n",
	" <td>383199.62</td>\n",
	" <td>182901.99</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>4</th>\n",
	" <td>1.0</td>\n",
	" <td>0.0</td>\n",
	" <td>142107.34</td>\n",
	" <td>91391.77</td>\n",
	" <td>366168.42</td>\n",
	" <td>166187.94</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" State_Florida State_New York R&D Spend Administration Marketing Spend \\\n",
	"0 0.0 1.0 165349.20 136897.80 471784.10 \n",
	"1 0.0 0.0 162597.70 151377.59 443898.53 \n",
	"2 1.0 0.0 153441.51 101145.55 407934.54 \n",
	"3 0.0 1.0 144372.41 118671.85 383199.62 \n",
	"4 1.0 0.0 142107.34 91391.77 366168.42 \n",
	"\n",
	" Profit \n",
	"0 192261.83 \n",
	"1 191792.06 \n",
	"2 191050.39 \n",
	"3 182901.99 \n",
	"4 166187.94 "
	]
	},
	"execution_count": 3,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"from sklearn_pandas import DataFrameMapper\n",
	"from sklearn.preprocessing import LabelBinarizer\n",
	"mapper = DataFrameMapper(\n",
	" [('State',\n",
	" LabelBinarizer()\n",
	" )\n",
	" ],\n",
	" default=None,\n",
	" df_out=True\n",
	" )\n",
	"df = mapper.fit_transform(df)\n",
	"df = df[df.columns[1:]]\n",
	"df.head()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>State_Florida</th>\n",
	" <th>State_New York</th>\n",
	" <th>R&D Spend</th>\n",
	" <th>Administration</th>\n",
	" <th>Marketing Spend</th>\n",
	" <th>Profit</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>0.0</td>\n",
	" <td>1.0</td>\n",
	" <td>165349.20</td>\n",
	" <td>136897.80</td>\n",
	" <td>471784.10</td>\n",
	" <td>192261.83</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>0.0</td>\n",
	" <td>0.0</td>\n",
	" <td>162597.70</td>\n",
	" <td>151377.59</td>\n",
	" <td>443898.53</td>\n",
	" <td>191792.06</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3</th>\n",
	" <td>0.0</td>\n",
	" <td>1.0</td>\n",
	" <td>144372.41</td>\n",
	" <td>118671.85</td>\n",
	" <td>383199.62</td>\n",
	" <td>182901.99</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>5</th>\n",
	" <td>0.0</td>\n",
	" <td>1.0</td>\n",
	" <td>131876.90</td>\n",
	" <td>99814.71</td>\n",
	" <td>362861.36</td>\n",
	" <td>156991.12</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>6</th>\n",
	" <td>0.0</td>\n",
	" <td>0.0</td>\n",
	" <td>134615.46</td>\n",
	" <td>147198.87</td>\n",
	" <td>127716.82</td>\n",
	" <td>156122.51</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" State_Florida State_New York R&D Spend Administration Marketing Spend \\\n",
	"0 0.0 1.0 165349.20 136897.80 471784.10 \n",
	"1 0.0 0.0 162597.70 151377.59 443898.53 \n",
	"3 0.0 1.0 144372.41 118671.85 383199.62 \n",
	"5 0.0 1.0 131876.90 99814.71 362861.36 \n",
	"6 0.0 0.0 134615.46 147198.87 127716.82 \n",
	"\n",
	" Profit \n",
	"0 192261.83 \n",
	"1 191792.06 \n",
	"3 182901.99 \n",
	"5 156991.12 \n",
	"6 156122.51 "
	]
	},
	"execution_count": 4,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"from sklearn.model_selection import train_test_split\n",
	"idx_train, idx_test = train_test_split(\n",
	" df.index,\n",
	" test_size=0.2,\n",
	" random_state=0)\n",
	"df_train = df.iloc[idx_train].sort_index()\n",
	"df_test = df.iloc[idx_test].sort_index()\n",
	"df_train.head()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {},
	"outputs": [
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"/Users/kit/UNIX/virtualenvs/engg/lib/python3.6/site-packages/sklearn/linear_model/base.py:509: RuntimeWarning: internal gelsd driver lwork query error, required iwork dimension not returned. This is likely the result of LAPACK bug 0038, fixed in LAPACK 3.2.2 (released July 21, 2010). Falling back to 'gelss' driver.\n",
	" linalg.lstsq(X, y)\n"
	]
	},
	{
	"data": {
	"text/plain": [
	"LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)"
	]
	},
	"execution_count": 5,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"from sklearn.linear_model import LinearRegression\n",
	"regressor = LinearRegression()\n",
	"regressor.fit(\n",
	" df_train[df.columns[:-1]].values,\n",
	" df_train[[df.columns[-1]]].values)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"array([[178537.48221054],\n",
	" [167921.0656955 ],\n",
	" [132447.73845175],\n",
	" [132582.27760815],\n",
	" [113969.43533012],\n",
	" [116161.24230165],\n",
	" [103015.20159797],\n",
	" [ 98791.73374688],\n",
	" [ 67851.69209676],\n",
	" [ 71976.09851258]])"
	]
	},
	"execution_count": 6,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"y_pred = regressor.predict(\n",
	" df_test[df.columns[:-1]].values)\n",
	"y_pred"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>Profit</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>191050.39</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>4</th>\n",
	" <td>166187.94</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>10</th>\n",
	" <td>146121.95</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>11</th>\n",
	" <td>144259.40</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>22</th>\n",
	" <td>110352.25</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>27</th>\n",
	" <td>105008.31</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>28</th>\n",
	" <td>103282.38</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>31</th>\n",
	" <td>97483.56</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>38</th>\n",
	" <td>81229.06</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>41</th>\n",
	" <td>77798.83</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" Profit\n",
	"2 191050.39\n",
	"4 166187.94\n",
	"10 146121.95\n",
	"11 144259.40\n",
	"22 110352.25\n",
	"27 105008.31\n",
	"28 103282.38\n",
	"31 97483.56\n",
	"38 81229.06\n",
	"41 77798.83"
	]
	},
	"execution_count": 7,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"df_test[['Profit']]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>Ones</th>\n",
	" <th>State_Florida</th>\n",
	" <th>State_New York</th>\n",
	" <th>R&D Spend</th>\n",
	" <th>Administration</th>\n",
	" <th>Marketing Spend</th>\n",
	" <th>Profit</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>1</td>\n",
	" <td>0.0</td>\n",
	" <td>1.0</td>\n",
	" <td>165349.20</td>\n",
	" <td>136897.80</td>\n",
	" <td>471784.10</td>\n",
	" <td>192261.83</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>1</td>\n",
	" <td>0.0</td>\n",
	" <td>0.0</td>\n",
	" <td>162597.70</td>\n",
	" <td>151377.59</td>\n",
	" <td>443898.53</td>\n",
	" <td>191792.06</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>1</td>\n",
	" <td>1.0</td>\n",
	" <td>0.0</td>\n",
	" <td>153441.51</td>\n",
	" <td>101145.55</td>\n",
	" <td>407934.54</td>\n",
	" <td>191050.39</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3</th>\n",
	" <td>1</td>\n",
	" <td>0.0</td>\n",
	" <td>1.0</td>\n",
	" <td>144372.41</td>\n",
	" <td>118671.85</td>\n",
	" <td>383199.62</td>\n",
	" <td>182901.99</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>4</th>\n",
	" <td>1</td>\n",
	" <td>1.0</td>\n",
	" <td>0.0</td>\n",
	" <td>142107.34</td>\n",
	" <td>91391.77</td>\n",
	" <td>366168.42</td>\n",
	" <td>166187.94</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" Ones State_Florida State_New York R&D Spend Administration \\\n",
	"0 1 0.0 1.0 165349.20 136897.80 \n",
	"1 1 0.0 0.0 162597.70 151377.59 \n",
	"2 1 1.0 0.0 153441.51 101145.55 \n",
	"3 1 0.0 1.0 144372.41 118671.85 \n",
	"4 1 1.0 0.0 142107.34 91391.77 \n",
	"\n",
	" Marketing Spend Profit \n",
	"0 471784.10 192261.83 \n",
	"1 443898.53 191792.06 \n",
	"2 407934.54 191050.39 \n",
	"3 383199.62 182901.99 \n",
	"4 366168.42 166187.94 "
	]
	},
	"execution_count": 8,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"import statsmodels.formula.api as sm\n",
	"df.insert(0, 'Ones', 1)\n",
	"df.head()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 11,
	"metadata": {
	"scrolled": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	" OLS Regression Results \n",
	"==============================================================================\n",
	"Dep. Variable: y R-squared: 0.947\n",
	"Model: OLS Adj. R-squared: 0.945\n",
	"Method: Least Squares F-statistic: 849.8\n",
	"Date: Sun, 20 May 2018 Prob (F-statistic): 3.50e-32\n",
	"Time: 21:12:28 Log-Likelihood: -527.44\n",
	"No. Observations: 50 AIC: 1059.\n",
	"Df Residuals: 48 BIC: 1063.\n",
	"Df Model: 1 \n",
	"Covariance Type: nonrobust \n",
	"==============================================================================\n",
	" coef std err t P>\|t\| [0.025 0.975]\n",
	"------------------------------------------------------------------------------\n",
	"Ones 4.903e+04 2537.897 19.320 0.000 4.39e+04 5.41e+04\n",
	"R&D Spend 0.8543 0.029 29.151 0.000 0.795 0.913\n",
	"==============================================================================\n",
	"Omnibus: 13.727 Durbin-Watson: 1.116\n",
	"Prob(Omnibus): 0.001 Jarque-Bera (JB): 18.536\n",
	"Skew: -0.911 Prob(JB): 9.44e-05\n",
	"Kurtosis: 5.361 Cond. No. 1.65e+05\n",
	"==============================================================================\n",
	"\n",
	"Warnings:\n",
	"[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n",
	"[2] The condition number is large, 1.65e+05. This might indicate that there are\n",
	"strong multicollinearity or other numerical problems.\n"
	]
	}
	],
	"source": [
	"def backwardElimination(df, SL):\n",
	" df_opt = df.copy()\n",
	" inVars = df.columns.tolist()[:-2]\n",
	" numVars = len(inVars)\n",
	" while True:\n",
	" regressor = sm.OLS(\n",
	" endog=df_opt[['Profit']].values,\n",
	" exog=df_opt[inVars]).fit()\n",
	" if regressor.pvalues.max() < SL:\n",
	" break\n",
	" inVars.remove(regressor.pvalues.idxmax())\n",
	" cols = inVars\n",
	" cols.append(df.columns[-1])\n",
	" print(regressor.summary())\n",
	" return df_opt[cols]\n",
	"\n",
	"df_opt = backwardElimination(df, 0.05)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 12,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"array([[104667.27805998],\n",
	" [134150.83410578],\n",
	" [135207.80019517],\n",
	" [ 72170.54428856],\n",
	" [179090.58602508],\n",
	" [109824.77386586],\n",
	" [ 65644.27773757],\n",
	" [100481.43277139],\n",
	" [111431.75202432],\n",
	" [169438.14843539]])"
	]
	},
	"execution_count": 12,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"idx_train, idx_test = train_test_split(\n",
	" df_opt.index,\n",
	" test_size=0.2,\n",
	" random_state=0)\n",
	"\n",
	"df_train = df_opt.iloc[idx_train]\n",
	"df_test = df_opt.iloc[idx_test]\n",
	"\n",
	"regressor = LinearRegression()\n",
	"regressor.fit(\n",
	" df_train[df_train.columns[:-1]].values,\n",
	" df_train[df_train.columns[-1:]].values)\n",
	"\n",
	"regressor.predict(\n",
	" df_test[df_test.columns[:-1]])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 15,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>Profit</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>28</th>\n",
	" <td>103282.38</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>11</th>\n",
	" <td>144259.40</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>10</th>\n",
	" <td>146121.95</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>41</th>\n",
	" <td>77798.83</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>191050.39</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>27</th>\n",
	" <td>105008.31</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>38</th>\n",
	" <td>81229.06</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>31</th>\n",
	" <td>97483.56</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>22</th>\n",
	" <td>110352.25</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>4</th>\n",
	" <td>166187.94</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" Profit\n",
	"28 103282.38\n",
	"11 144259.40\n",
	"10 146121.95\n",
	"41 77798.83\n",
	"2 191050.39\n",
	"27 105008.31\n",
	"38 81229.06\n",
	"31 97483.56\n",
	"22 110352.25\n",
	"4 166187.94"
	]
	},
	"execution_count": 15,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"df_test[df_test.columns[-1:]]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.6.5"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}