Last active
May 20, 2018 13:24
-
-
Save kitmonisit/900affc23e780ac853c439b30fce5cd0 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
R&D Spend | Administration | Marketing Spend | State | Profit | |
---|---|---|---|---|---|
165349.2 | 136897.8 | 471784.1 | New York | 192261.83 | |
162597.7 | 151377.59 | 443898.53 | California | 191792.06 | |
153441.51 | 101145.55 | 407934.54 | Florida | 191050.39 | |
144372.41 | 118671.85 | 383199.62 | New York | 182901.99 | |
142107.34 | 91391.77 | 366168.42 | Florida | 166187.94 | |
131876.9 | 99814.71 | 362861.36 | New York | 156991.12 | |
134615.46 | 147198.87 | 127716.82 | California | 156122.51 | |
130298.13 | 145530.06 | 323876.68 | Florida | 155752.6 | |
120542.52 | 148718.95 | 311613.29 | New York | 152211.77 | |
123334.88 | 108679.17 | 304981.62 | California | 149759.96 | |
101913.08 | 110594.11 | 229160.95 | Florida | 146121.95 | |
100671.96 | 91790.61 | 249744.55 | California | 144259.4 | |
93863.75 | 127320.38 | 249839.44 | Florida | 141585.52 | |
91992.39 | 135495.07 | 252664.93 | California | 134307.35 | |
119943.24 | 156547.42 | 256512.92 | Florida | 132602.65 | |
114523.61 | 122616.84 | 261776.23 | New York | 129917.04 | |
78013.11 | 121597.55 | 264346.06 | California | 126992.93 | |
94657.16 | 145077.58 | 282574.31 | New York | 125370.37 | |
91749.16 | 114175.79 | 294919.57 | Florida | 124266.9 | |
86419.7 | 153514.11 | 0 | New York | 122776.86 | |
76253.86 | 113867.3 | 298664.47 | California | 118474.03 | |
78389.47 | 153773.43 | 299737.29 | New York | 111313.02 | |
73994.56 | 122782.75 | 303319.26 | Florida | 110352.25 | |
67532.53 | 105751.03 | 304768.73 | Florida | 108733.99 | |
77044.01 | 99281.34 | 140574.81 | New York | 108552.04 | |
64664.71 | 139553.16 | 137962.62 | California | 107404.34 | |
75328.87 | 144135.98 | 134050.07 | Florida | 105733.54 | |
72107.6 | 127864.55 | 353183.81 | New York | 105008.31 | |
66051.52 | 182645.56 | 118148.2 | Florida | 103282.38 | |
65605.48 | 153032.06 | 107138.38 | New York | 101004.64 | |
61994.48 | 115641.28 | 91131.24 | Florida | 99937.59 | |
61136.38 | 152701.92 | 88218.23 | New York | 97483.56 | |
63408.86 | 129219.61 | 46085.25 | California | 97427.84 | |
55493.95 | 103057.49 | 214634.81 | Florida | 96778.92 | |
46426.07 | 157693.92 | 210797.67 | California | 96712.8 | |
46014.02 | 85047.44 | 205517.64 | New York | 96479.51 | |
28663.76 | 127056.21 | 201126.82 | Florida | 90708.19 | |
44069.95 | 51283.14 | 197029.42 | California | 89949.14 | |
20229.59 | 65947.93 | 185265.1 | New York | 81229.06 | |
38558.51 | 82982.09 | 174999.3 | California | 81005.76 | |
28754.33 | 118546.05 | 172795.67 | California | 78239.91 | |
27892.92 | 84710.77 | 164470.71 | Florida | 77798.83 | |
23640.93 | 96189.63 | 148001.11 | California | 71498.49 | |
15505.73 | 127382.3 | 35534.17 | New York | 69758.98 | |
22177.74 | 154806.14 | 28334.72 | California | 65200.33 | |
1000.23 | 124153.04 | 1903.93 | New York | 64926.08 | |
1315.46 | 115816.21 | 297114.46 | Florida | 49490.75 | |
0 | 135426.92 | 0 | California | 42559.73 | |
542.05 | 51743.15 | 0 | New York | 35673.41 | |
0 | 116983.8 | 45173.06 | California | 14681.4 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import numpy as np\n", | |
"import pandas as pd\n", | |
"import pickle" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>R&D Spend</th>\n", | |
" <th>Administration</th>\n", | |
" <th>Marketing Spend</th>\n", | |
" <th>State</th>\n", | |
" <th>Profit</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>165349.20</td>\n", | |
" <td>136897.80</td>\n", | |
" <td>471784.10</td>\n", | |
" <td>New York</td>\n", | |
" <td>192261.83</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>162597.70</td>\n", | |
" <td>151377.59</td>\n", | |
" <td>443898.53</td>\n", | |
" <td>California</td>\n", | |
" <td>191792.06</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>153441.51</td>\n", | |
" <td>101145.55</td>\n", | |
" <td>407934.54</td>\n", | |
" <td>Florida</td>\n", | |
" <td>191050.39</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>144372.41</td>\n", | |
" <td>118671.85</td>\n", | |
" <td>383199.62</td>\n", | |
" <td>New York</td>\n", | |
" <td>182901.99</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>142107.34</td>\n", | |
" <td>91391.77</td>\n", | |
" <td>366168.42</td>\n", | |
" <td>Florida</td>\n", | |
" <td>166187.94</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" R&D Spend Administration Marketing Spend State Profit\n", | |
"0 165349.20 136897.80 471784.10 New York 192261.83\n", | |
"1 162597.70 151377.59 443898.53 California 191792.06\n", | |
"2 153441.51 101145.55 407934.54 Florida 191050.39\n", | |
"3 144372.41 118671.85 383199.62 New York 182901.99\n", | |
"4 142107.34 91391.77 366168.42 Florida 166187.94" | |
] | |
}, | |
"execution_count": 2, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df = pd.read_csv('./50_Startups.csv')\n", | |
"df.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>State_Florida</th>\n", | |
" <th>State_New York</th>\n", | |
" <th>R&D Spend</th>\n", | |
" <th>Administration</th>\n", | |
" <th>Marketing Spend</th>\n", | |
" <th>Profit</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>0.0</td>\n", | |
" <td>1.0</td>\n", | |
" <td>165349.20</td>\n", | |
" <td>136897.80</td>\n", | |
" <td>471784.10</td>\n", | |
" <td>192261.83</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>162597.70</td>\n", | |
" <td>151377.59</td>\n", | |
" <td>443898.53</td>\n", | |
" <td>191792.06</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>1.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>153441.51</td>\n", | |
" <td>101145.55</td>\n", | |
" <td>407934.54</td>\n", | |
" <td>191050.39</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>0.0</td>\n", | |
" <td>1.0</td>\n", | |
" <td>144372.41</td>\n", | |
" <td>118671.85</td>\n", | |
" <td>383199.62</td>\n", | |
" <td>182901.99</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>1.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>142107.34</td>\n", | |
" <td>91391.77</td>\n", | |
" <td>366168.42</td>\n", | |
" <td>166187.94</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" State_Florida State_New York R&D Spend Administration Marketing Spend \\\n", | |
"0 0.0 1.0 165349.20 136897.80 471784.10 \n", | |
"1 0.0 0.0 162597.70 151377.59 443898.53 \n", | |
"2 1.0 0.0 153441.51 101145.55 407934.54 \n", | |
"3 0.0 1.0 144372.41 118671.85 383199.62 \n", | |
"4 1.0 0.0 142107.34 91391.77 366168.42 \n", | |
"\n", | |
" Profit \n", | |
"0 192261.83 \n", | |
"1 191792.06 \n", | |
"2 191050.39 \n", | |
"3 182901.99 \n", | |
"4 166187.94 " | |
] | |
}, | |
"execution_count": 3, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"from sklearn_pandas import DataFrameMapper\n", | |
"from sklearn.preprocessing import LabelBinarizer\n", | |
"mapper = DataFrameMapper(\n", | |
" [('State',\n", | |
" LabelBinarizer()\n", | |
" )\n", | |
" ],\n", | |
" default=None,\n", | |
" df_out=True\n", | |
" )\n", | |
"df = mapper.fit_transform(df)\n", | |
"df = df[df.columns[1:]]\n", | |
"df.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>State_Florida</th>\n", | |
" <th>State_New York</th>\n", | |
" <th>R&D Spend</th>\n", | |
" <th>Administration</th>\n", | |
" <th>Marketing Spend</th>\n", | |
" <th>Profit</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>0.0</td>\n", | |
" <td>1.0</td>\n", | |
" <td>165349.20</td>\n", | |
" <td>136897.80</td>\n", | |
" <td>471784.10</td>\n", | |
" <td>192261.83</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>162597.70</td>\n", | |
" <td>151377.59</td>\n", | |
" <td>443898.53</td>\n", | |
" <td>191792.06</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>0.0</td>\n", | |
" <td>1.0</td>\n", | |
" <td>144372.41</td>\n", | |
" <td>118671.85</td>\n", | |
" <td>383199.62</td>\n", | |
" <td>182901.99</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>0.0</td>\n", | |
" <td>1.0</td>\n", | |
" <td>131876.90</td>\n", | |
" <td>99814.71</td>\n", | |
" <td>362861.36</td>\n", | |
" <td>156991.12</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6</th>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>134615.46</td>\n", | |
" <td>147198.87</td>\n", | |
" <td>127716.82</td>\n", | |
" <td>156122.51</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" State_Florida State_New York R&D Spend Administration Marketing Spend \\\n", | |
"0 0.0 1.0 165349.20 136897.80 471784.10 \n", | |
"1 0.0 0.0 162597.70 151377.59 443898.53 \n", | |
"3 0.0 1.0 144372.41 118671.85 383199.62 \n", | |
"5 0.0 1.0 131876.90 99814.71 362861.36 \n", | |
"6 0.0 0.0 134615.46 147198.87 127716.82 \n", | |
"\n", | |
" Profit \n", | |
"0 192261.83 \n", | |
"1 191792.06 \n", | |
"3 182901.99 \n", | |
"5 156991.12 \n", | |
"6 156122.51 " | |
] | |
}, | |
"execution_count": 4, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"from sklearn.model_selection import train_test_split\n", | |
"idx_train, idx_test = train_test_split(\n", | |
" df.index,\n", | |
" test_size=0.2,\n", | |
" random_state=0)\n", | |
"df_train = df.iloc[idx_train].sort_index()\n", | |
"df_test = df.iloc[idx_test].sort_index()\n", | |
"df_train.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"/Users/kit/UNIX/virtualenvs/engg/lib/python3.6/site-packages/sklearn/linear_model/base.py:509: RuntimeWarning: internal gelsd driver lwork query error, required iwork dimension not returned. This is likely the result of LAPACK bug 0038, fixed in LAPACK 3.2.2 (released July 21, 2010). Falling back to 'gelss' driver.\n", | |
" linalg.lstsq(X, y)\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)" | |
] | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"from sklearn.linear_model import LinearRegression\n", | |
"regressor = LinearRegression()\n", | |
"regressor.fit(\n", | |
" df_train[df.columns[:-1]].values,\n", | |
" df_train[[df.columns[-1]]].values)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([[178537.48221054],\n", | |
" [167921.0656955 ],\n", | |
" [132447.73845175],\n", | |
" [132582.27760815],\n", | |
" [113969.43533012],\n", | |
" [116161.24230165],\n", | |
" [103015.20159797],\n", | |
" [ 98791.73374688],\n", | |
" [ 67851.69209676],\n", | |
" [ 71976.09851258]])" | |
] | |
}, | |
"execution_count": 6, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"y_pred = regressor.predict(\n", | |
" df_test[df.columns[:-1]].values)\n", | |
"y_pred" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>Profit</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>191050.39</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>166187.94</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>10</th>\n", | |
" <td>146121.95</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>11</th>\n", | |
" <td>144259.40</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>22</th>\n", | |
" <td>110352.25</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>27</th>\n", | |
" <td>105008.31</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>28</th>\n", | |
" <td>103282.38</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>31</th>\n", | |
" <td>97483.56</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>38</th>\n", | |
" <td>81229.06</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>41</th>\n", | |
" <td>77798.83</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" Profit\n", | |
"2 191050.39\n", | |
"4 166187.94\n", | |
"10 146121.95\n", | |
"11 144259.40\n", | |
"22 110352.25\n", | |
"27 105008.31\n", | |
"28 103282.38\n", | |
"31 97483.56\n", | |
"38 81229.06\n", | |
"41 77798.83" | |
] | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df_test[['Profit']]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>Ones</th>\n", | |
" <th>State_Florida</th>\n", | |
" <th>State_New York</th>\n", | |
" <th>R&D Spend</th>\n", | |
" <th>Administration</th>\n", | |
" <th>Marketing Spend</th>\n", | |
" <th>Profit</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>1</td>\n", | |
" <td>0.0</td>\n", | |
" <td>1.0</td>\n", | |
" <td>165349.20</td>\n", | |
" <td>136897.80</td>\n", | |
" <td>471784.10</td>\n", | |
" <td>192261.83</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>1</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>162597.70</td>\n", | |
" <td>151377.59</td>\n", | |
" <td>443898.53</td>\n", | |
" <td>191792.06</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>1</td>\n", | |
" <td>1.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>153441.51</td>\n", | |
" <td>101145.55</td>\n", | |
" <td>407934.54</td>\n", | |
" <td>191050.39</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>1</td>\n", | |
" <td>0.0</td>\n", | |
" <td>1.0</td>\n", | |
" <td>144372.41</td>\n", | |
" <td>118671.85</td>\n", | |
" <td>383199.62</td>\n", | |
" <td>182901.99</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>1</td>\n", | |
" <td>1.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>142107.34</td>\n", | |
" <td>91391.77</td>\n", | |
" <td>366168.42</td>\n", | |
" <td>166187.94</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" Ones State_Florida State_New York R&D Spend Administration \\\n", | |
"0 1 0.0 1.0 165349.20 136897.80 \n", | |
"1 1 0.0 0.0 162597.70 151377.59 \n", | |
"2 1 1.0 0.0 153441.51 101145.55 \n", | |
"3 1 0.0 1.0 144372.41 118671.85 \n", | |
"4 1 1.0 0.0 142107.34 91391.77 \n", | |
"\n", | |
" Marketing Spend Profit \n", | |
"0 471784.10 192261.83 \n", | |
"1 443898.53 191792.06 \n", | |
"2 407934.54 191050.39 \n", | |
"3 383199.62 182901.99 \n", | |
"4 366168.42 166187.94 " | |
] | |
}, | |
"execution_count": 8, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"import statsmodels.formula.api as sm\n", | |
"df.insert(0, 'Ones', 1)\n", | |
"df.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": { | |
"scrolled": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
" OLS Regression Results \n", | |
"==============================================================================\n", | |
"Dep. Variable: y R-squared: 0.947\n", | |
"Model: OLS Adj. R-squared: 0.945\n", | |
"Method: Least Squares F-statistic: 849.8\n", | |
"Date: Sun, 20 May 2018 Prob (F-statistic): 3.50e-32\n", | |
"Time: 21:12:28 Log-Likelihood: -527.44\n", | |
"No. Observations: 50 AIC: 1059.\n", | |
"Df Residuals: 48 BIC: 1063.\n", | |
"Df Model: 1 \n", | |
"Covariance Type: nonrobust \n", | |
"==============================================================================\n", | |
" coef std err t P>|t| [0.025 0.975]\n", | |
"------------------------------------------------------------------------------\n", | |
"Ones 4.903e+04 2537.897 19.320 0.000 4.39e+04 5.41e+04\n", | |
"R&D Spend 0.8543 0.029 29.151 0.000 0.795 0.913\n", | |
"==============================================================================\n", | |
"Omnibus: 13.727 Durbin-Watson: 1.116\n", | |
"Prob(Omnibus): 0.001 Jarque-Bera (JB): 18.536\n", | |
"Skew: -0.911 Prob(JB): 9.44e-05\n", | |
"Kurtosis: 5.361 Cond. No. 1.65e+05\n", | |
"==============================================================================\n", | |
"\n", | |
"Warnings:\n", | |
"[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n", | |
"[2] The condition number is large, 1.65e+05. This might indicate that there are\n", | |
"strong multicollinearity or other numerical problems.\n" | |
] | |
} | |
], | |
"source": [ | |
"def backwardElimination(df, SL):\n", | |
" df_opt = df.copy()\n", | |
" inVars = df.columns.tolist()[:-2]\n", | |
" numVars = len(inVars)\n", | |
" while True:\n", | |
" regressor = sm.OLS(\n", | |
" endog=df_opt[['Profit']].values,\n", | |
" exog=df_opt[inVars]).fit()\n", | |
" if regressor.pvalues.max() < SL:\n", | |
" break\n", | |
" inVars.remove(regressor.pvalues.idxmax())\n", | |
" cols = inVars\n", | |
" cols.append(df.columns[-1])\n", | |
" print(regressor.summary())\n", | |
" return df_opt[cols]\n", | |
"\n", | |
"df_opt = backwardElimination(df, 0.05)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([[104667.27805998],\n", | |
" [134150.83410578],\n", | |
" [135207.80019517],\n", | |
" [ 72170.54428856],\n", | |
" [179090.58602508],\n", | |
" [109824.77386586],\n", | |
" [ 65644.27773757],\n", | |
" [100481.43277139],\n", | |
" [111431.75202432],\n", | |
" [169438.14843539]])" | |
] | |
}, | |
"execution_count": 12, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"idx_train, idx_test = train_test_split(\n", | |
" df_opt.index,\n", | |
" test_size=0.2,\n", | |
" random_state=0)\n", | |
"\n", | |
"df_train = df_opt.iloc[idx_train]\n", | |
"df_test = df_opt.iloc[idx_test]\n", | |
"\n", | |
"regressor = LinearRegression()\n", | |
"regressor.fit(\n", | |
" df_train[df_train.columns[:-1]].values,\n", | |
" df_train[df_train.columns[-1:]].values)\n", | |
"\n", | |
"regressor.predict(\n", | |
" df_test[df_test.columns[:-1]])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>Profit</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>28</th>\n", | |
" <td>103282.38</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>11</th>\n", | |
" <td>144259.40</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>10</th>\n", | |
" <td>146121.95</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>41</th>\n", | |
" <td>77798.83</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>191050.39</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>27</th>\n", | |
" <td>105008.31</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>38</th>\n", | |
" <td>81229.06</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>31</th>\n", | |
" <td>97483.56</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>22</th>\n", | |
" <td>110352.25</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>166187.94</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" Profit\n", | |
"28 103282.38\n", | |
"11 144259.40\n", | |
"10 146121.95\n", | |
"41 77798.83\n", | |
"2 191050.39\n", | |
"27 105008.31\n", | |
"38 81229.06\n", | |
"31 97483.56\n", | |
"22 110352.25\n", | |
"4 166187.94" | |
] | |
}, | |
"execution_count": 15, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df_test[df_test.columns[-1:]]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.5" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment