Created
November 6, 2019 05:21
-
-
Save ianforme/6459aefb61323c529a51026e2e50bc5f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import sys\n", | |
"import pandas as pd\n", | |
"import numpy as np\n", | |
"\n", | |
"from lightgbm import LGBMClassifier\n", | |
"from sklearn.linear_model import LogisticRegression\n", | |
"from sklearn.model_selection import train_test_split, GridSearchCV\n", | |
"from sklearn.metrics import roc_auc_score\n", | |
"\n", | |
"import warnings\n", | |
"warnings.filterwarnings('ignore')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"code_folding": [] | |
}, | |
"outputs": [], | |
"source": [ | |
"# read in raw data\n", | |
"raw_data = pd.read_csv('./data/crx.data', names=['A{}'.format(i) for i in range(1, 17)], na_values='?')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>A1</th>\n", | |
" <th>A2</th>\n", | |
" <th>A3</th>\n", | |
" <th>A4</th>\n", | |
" <th>A5</th>\n", | |
" <th>A6</th>\n", | |
" <th>A7</th>\n", | |
" <th>A8</th>\n", | |
" <th>A9</th>\n", | |
" <th>A10</th>\n", | |
" <th>A11</th>\n", | |
" <th>A12</th>\n", | |
" <th>A13</th>\n", | |
" <th>A14</th>\n", | |
" <th>A15</th>\n", | |
" <th>A16</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <td>0</td>\n", | |
" <td>b</td>\n", | |
" <td>30.83</td>\n", | |
" <td>0.000</td>\n", | |
" <td>u</td>\n", | |
" <td>g</td>\n", | |
" <td>w</td>\n", | |
" <td>v</td>\n", | |
" <td>1.25</td>\n", | |
" <td>t</td>\n", | |
" <td>t</td>\n", | |
" <td>1</td>\n", | |
" <td>f</td>\n", | |
" <td>g</td>\n", | |
" <td>202.0</td>\n", | |
" <td>0</td>\n", | |
" <td>+</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>1</td>\n", | |
" <td>a</td>\n", | |
" <td>58.67</td>\n", | |
" <td>4.460</td>\n", | |
" <td>u</td>\n", | |
" <td>g</td>\n", | |
" <td>q</td>\n", | |
" <td>h</td>\n", | |
" <td>3.04</td>\n", | |
" <td>t</td>\n", | |
" <td>t</td>\n", | |
" <td>6</td>\n", | |
" <td>f</td>\n", | |
" <td>g</td>\n", | |
" <td>43.0</td>\n", | |
" <td>560</td>\n", | |
" <td>+</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>2</td>\n", | |
" <td>a</td>\n", | |
" <td>24.50</td>\n", | |
" <td>0.500</td>\n", | |
" <td>u</td>\n", | |
" <td>g</td>\n", | |
" <td>q</td>\n", | |
" <td>h</td>\n", | |
" <td>1.50</td>\n", | |
" <td>t</td>\n", | |
" <td>f</td>\n", | |
" <td>0</td>\n", | |
" <td>f</td>\n", | |
" <td>g</td>\n", | |
" <td>280.0</td>\n", | |
" <td>824</td>\n", | |
" <td>+</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>3</td>\n", | |
" <td>b</td>\n", | |
" <td>27.83</td>\n", | |
" <td>1.540</td>\n", | |
" <td>u</td>\n", | |
" <td>g</td>\n", | |
" <td>w</td>\n", | |
" <td>v</td>\n", | |
" <td>3.75</td>\n", | |
" <td>t</td>\n", | |
" <td>t</td>\n", | |
" <td>5</td>\n", | |
" <td>t</td>\n", | |
" <td>g</td>\n", | |
" <td>100.0</td>\n", | |
" <td>3</td>\n", | |
" <td>+</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <td>4</td>\n", | |
" <td>b</td>\n", | |
" <td>20.17</td>\n", | |
" <td>5.625</td>\n", | |
" <td>u</td>\n", | |
" <td>g</td>\n", | |
" <td>w</td>\n", | |
" <td>v</td>\n", | |
" <td>1.71</td>\n", | |
" <td>t</td>\n", | |
" <td>f</td>\n", | |
" <td>0</td>\n", | |
" <td>f</td>\n", | |
" <td>s</td>\n", | |
" <td>120.0</td>\n", | |
" <td>0</td>\n", | |
" <td>+</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" A1 A2 A3 A4 A5 A6 A7 A8 A9 A10 A11 A12 A13 A14 A15 A16\n", | |
"0 b 30.83 0.000 u g w v 1.25 t t 1 f g 202.0 0 +\n", | |
"1 a 58.67 4.460 u g q h 3.04 t t 6 f g 43.0 560 +\n", | |
"2 a 24.50 0.500 u g q h 1.50 t f 0 f g 280.0 824 +\n", | |
"3 b 27.83 1.540 u g w v 3.75 t t 5 t g 100.0 3 +\n", | |
"4 b 20.17 5.625 u g w v 1.71 t f 0 f s 120.0 0 +" | |
] | |
}, | |
"execution_count": 3, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"raw_data.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# remove data with missing values\n", | |
"raw_data = raw_data.dropna()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# process features and targets, categorical features are one-hot encoded\n", | |
"raw_data['A16'] = raw_data['A16'].replace({'+':1, '-':0})\n", | |
"cat_features = pd.get_dummies(raw_data[['A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10', 'A12', 'A13']])\n", | |
"ord_features = raw_data[['A11', 'A14', 'A15', 'A2', 'A3', 'A8']]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# finalise training set\n", | |
"train = pd.concat([cat_features, ord_features], axis=1).reset_index(drop=True)\n", | |
"test = raw_data['A16'].reset_index(drop=True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# train test split\n", | |
"X_train, X_test, y_train, y_test = train_test_split(train, test, random_state=42, test_size = 0.2)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Benchmark 1: LR" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"LR ROCAUC score: 0.82\n" | |
] | |
} | |
], | |
"source": [ | |
"lr = LogisticRegression(random_state=42).fit(X_train, y_train)\n", | |
"lr_pred = lr.predict(X_test)\n", | |
"print(\"LR ROCAUC score: {:.2f}\".format(roc_auc_score(y_test, lr_pred)))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Benchmark 2: GBDT (with GridSearch for best param)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"params = {\n", | |
" 'num_leaves': [20, 30, 40, 50, 60, 70],\n", | |
" 'learning_rate': [0.05, 0.01],\n", | |
" 'n_estimators': [100, 300, 500],\n", | |
" 'subsample': [0.95],\n", | |
" 'colsample_bytree': [0.95],\n", | |
" 'n_jobs': [7],\n", | |
" 'random_state': [42]\n", | |
"}" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Fitting 5 folds for each of 36 candidates, totalling 180 fits\n" | |
] | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n", | |
"[Parallel(n_jobs=1)]: Done 180 out of 180 | elapsed: 26.0s finished\n" | |
] | |
} | |
], | |
"source": [ | |
"gcv = GridSearchCV(LGBMClassifier(), params, cv=5, verbose=1).fit(X_train, y_train)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"GBDT ROCAUC score: 0.83\n" | |
] | |
} | |
], | |
"source": [ | |
"gbm = gcv.best_estimator_\n", | |
"gbm_pred = gbm.predict(X_test)\n", | |
"print(\"GBDT ROCAUC score: {:.2f}\".format(roc_auc_score(y_test, gbm_pred)))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Main Dish: GBDT + LR" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Number of Leaves: 20\n", | |
"Number of Trees: 300\n" | |
] | |
} | |
], | |
"source": [ | |
"num_leaves = gbm.num_leaves\n", | |
"num_trees = gbm.n_estimators\n", | |
"print(\"Number of Leaves: {}\".format(num_leaves))\n", | |
"print(\"Number of Trees: {}\".format(num_trees))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"gbm_int_train = gbm.predict(X_train, pred_leaf=True)\n", | |
"gbm_int_test = gbm.predict(X_test, pred_leaf=True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"[16 7 7 7 7 8 18 18 8 8 7 8 8 18 8 8 5 8 7 17 7 7 12 18\n", | |
" 19 19 18 18 19 19 18 18 18 12 17 17 17 16 17 18 18 19 18 18 19 17 18 17\n", | |
" 11 18 19 18 19 16 16 14 11 19 19 16 16 19 16 17 17 19 16 15 16 17 19 17\n", | |
" 17 19 18 18 19 18 17 17 19 18 17 18 17 17 15 18 19 18 19 19 17 18 19 18\n", | |
" 18 19 14 18 19 18 19 19 17 18 19 17 18 19 18 17 19 18 18 18 18 19 17 19\n", | |
" 15 19 14 19 18 19 17 19 19 18 19 19 19 19 19 15 16 19 19 19 19 19 19 19\n", | |
" 19 15 19 19 19 19 19 19 19 14 19 19 19 19 19 14 17 18 19 17 12 16 19 14\n", | |
" 16 19 18 16 19 16 15 18 19 19 19 15 18 19 14 16 18 19 14 19 19 19 19 19\n", | |
" 14 19 19 19 15 19 19 17 18 13 19 19 12 18 16 14 18 18 19 16 19 13 19 15\n", | |
" 19 19 19 19 18 18 19 16 18 16 14 18 16 19 19 18 19 16 17 19 18 19 15 18\n", | |
" 19 18 17 17 15 19 19 17 14 17 17 15 15 18 16 18 18 13 18 19 18 17 18 15\n", | |
" 17 6 16 13 13 18 17 13 18 16 17 14 6 19 16 11 16 16 10 12 16 12 18 7\n", | |
" 10 12 11 2 8 18 19 2 12 15 4 15]\n" | |
] | |
} | |
], | |
"source": [ | |
"print(gbm_int_train[0])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def form_onehot(int_df):\n", | |
" output_df = []\n", | |
" for i in int_df:\n", | |
" row_l = []\n", | |
" for j in i:\n", | |
" temp_l = np.zeros(num_leaves)\n", | |
" temp_l[j] = 1\n", | |
" row_l.extend(temp_l)\n", | |
" output_df.append(row_l)\n", | |
" return output_df " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Wall time: 419 ms\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"stack_train = form_onehot(gbm_int_train)\n", | |
"stack_test = form_onehot(gbm_int_test)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"RAM Taken by Train: 4272 bytes\n", | |
"RAM Taken by Test: 1248 bytes\n" | |
] | |
} | |
], | |
"source": [ | |
"print(\"RAM Taken by Train: {} bytes\".format(sys.getsizeof(stack_train)))\n", | |
"print(\"RAM Taken by Test: {} bytes\".format(sys.getsizeof(stack_test)))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 18, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"GBDT-LR ROCAUC score: 0.86\n" | |
] | |
} | |
], | |
"source": [ | |
"gbdt_lr = LogisticRegression(penalty='l1', random_state=42).fit(stack_train, y_train)\n", | |
"gbdt_lr_pred = gbdt_lr.predict(stack_test)\n", | |
"print(\"GBDT-LR ROCAUC score: {:.2f}\".format(roc_auc_score(y_test, gbdt_lr_pred)))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Practical concern: Data Sparsity" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from scipy.sparse import vstack, csr_matrix\n", | |
"\n", | |
"def form_onehot_spar(int_df):\n", | |
" output_df = []\n", | |
" for i in int_df:\n", | |
" row_l = []\n", | |
" for j in i:\n", | |
" temp_l = np.zeros(num_leaves)\n", | |
" temp_l[j] = 1\n", | |
" row_l.extend(temp_l)\n", | |
" row_l = csr_matrix(row_l)\n", | |
" output_df.append(row_l)\n", | |
" return vstack(output_df)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Wall time: 898 ms\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"stack_train_spar = form_onehot_spar(gbm_int_train)\n", | |
"stack_test_spar = form_onehot_spar(gbm_int_test)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 21, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"RAM Taken by Sparse Train: 56 bytes\n", | |
"RAM Taken by Sparse Test: 56 bytes\n" | |
] | |
} | |
], | |
"source": [ | |
"print(\"RAM Taken by Sparse Train: {} bytes\".format(sys.getsizeof(stack_train_spar)))\n", | |
"print(\"RAM Taken by Sparse Test: {} bytes\".format(sys.getsizeof(stack_test_spar)))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 22, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"GBDT-LR ROCAUC score: 0.86\n" | |
] | |
} | |
], | |
"source": [ | |
"gbdt_lr_spar = LogisticRegression(penalty='l1', random_state=42).fit(stack_train_spar, y_train)\n", | |
"gbdt_lr_spar_pred = gbdt_lr_spar.predict(stack_test_spar)\n", | |
"print(\"GBDT-LR ROCAUC score: {:.2f}\".format(roc_auc_score(y_test, gbdt_lr_spar_pred)))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 23, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"(653, 16)" | |
] | |
}, | |
"execution_count": 23, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"raw_data.shape" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.6" | |
}, | |
"toc": { | |
"base_numbering": 1, | |
"nav_menu": {}, | |
"number_sections": true, | |
"sideBar": true, | |
"skip_h1_title": false, | |
"title_cell": "Table of Contents", | |
"title_sidebar": "Contents", | |
"toc_cell": false, | |
"toc_position": {}, | |
"toc_section_display": true, | |
"toc_window_display": false | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment