Skip to content

Instantly share code, notes, and snippets.

@ianforme
Created November 6, 2019 05:21
Show Gist options
  • Save ianforme/6459aefb61323c529a51026e2e50bc5f to your computer and use it in GitHub Desktop.
Save ianforme/6459aefb61323c529a51026e2e50bc5f to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"from lightgbm import LGBMClassifier\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.model_selection import train_test_split, GridSearchCV\n",
"from sklearn.metrics import roc_auc_score\n",
"\n",
"import warnings\n",
"warnings.filterwarnings('ignore')"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"code_folding": []
},
"outputs": [],
"source": [
"# read in raw data\n",
"raw_data = pd.read_csv('./data/crx.data', names=['A{}'.format(i) for i in range(1, 17)], na_values='?')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>A1</th>\n",
" <th>A2</th>\n",
" <th>A3</th>\n",
" <th>A4</th>\n",
" <th>A5</th>\n",
" <th>A6</th>\n",
" <th>A7</th>\n",
" <th>A8</th>\n",
" <th>A9</th>\n",
" <th>A10</th>\n",
" <th>A11</th>\n",
" <th>A12</th>\n",
" <th>A13</th>\n",
" <th>A14</th>\n",
" <th>A15</th>\n",
" <th>A16</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>b</td>\n",
" <td>30.83</td>\n",
" <td>0.000</td>\n",
" <td>u</td>\n",
" <td>g</td>\n",
" <td>w</td>\n",
" <td>v</td>\n",
" <td>1.25</td>\n",
" <td>t</td>\n",
" <td>t</td>\n",
" <td>1</td>\n",
" <td>f</td>\n",
" <td>g</td>\n",
" <td>202.0</td>\n",
" <td>0</td>\n",
" <td>+</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>a</td>\n",
" <td>58.67</td>\n",
" <td>4.460</td>\n",
" <td>u</td>\n",
" <td>g</td>\n",
" <td>q</td>\n",
" <td>h</td>\n",
" <td>3.04</td>\n",
" <td>t</td>\n",
" <td>t</td>\n",
" <td>6</td>\n",
" <td>f</td>\n",
" <td>g</td>\n",
" <td>43.0</td>\n",
" <td>560</td>\n",
" <td>+</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>a</td>\n",
" <td>24.50</td>\n",
" <td>0.500</td>\n",
" <td>u</td>\n",
" <td>g</td>\n",
" <td>q</td>\n",
" <td>h</td>\n",
" <td>1.50</td>\n",
" <td>t</td>\n",
" <td>f</td>\n",
" <td>0</td>\n",
" <td>f</td>\n",
" <td>g</td>\n",
" <td>280.0</td>\n",
" <td>824</td>\n",
" <td>+</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>b</td>\n",
" <td>27.83</td>\n",
" <td>1.540</td>\n",
" <td>u</td>\n",
" <td>g</td>\n",
" <td>w</td>\n",
" <td>v</td>\n",
" <td>3.75</td>\n",
" <td>t</td>\n",
" <td>t</td>\n",
" <td>5</td>\n",
" <td>t</td>\n",
" <td>g</td>\n",
" <td>100.0</td>\n",
" <td>3</td>\n",
" <td>+</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>b</td>\n",
" <td>20.17</td>\n",
" <td>5.625</td>\n",
" <td>u</td>\n",
" <td>g</td>\n",
" <td>w</td>\n",
" <td>v</td>\n",
" <td>1.71</td>\n",
" <td>t</td>\n",
" <td>f</td>\n",
" <td>0</td>\n",
" <td>f</td>\n",
" <td>s</td>\n",
" <td>120.0</td>\n",
" <td>0</td>\n",
" <td>+</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" A1 A2 A3 A4 A5 A6 A7 A8 A9 A10 A11 A12 A13 A14 A15 A16\n",
"0 b 30.83 0.000 u g w v 1.25 t t 1 f g 202.0 0 +\n",
"1 a 58.67 4.460 u g q h 3.04 t t 6 f g 43.0 560 +\n",
"2 a 24.50 0.500 u g q h 1.50 t f 0 f g 280.0 824 +\n",
"3 b 27.83 1.540 u g w v 3.75 t t 5 t g 100.0 3 +\n",
"4 b 20.17 5.625 u g w v 1.71 t f 0 f s 120.0 0 +"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"raw_data.head()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# remove data with missing values\n",
"raw_data = raw_data.dropna()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# process features and targets, categorical features are one-hot encoded\n",
"raw_data['A16'] = raw_data['A16'].replace({'+':1, '-':0})\n",
"cat_features = pd.get_dummies(raw_data[['A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10', 'A12', 'A13']])\n",
"ord_features = raw_data[['A11', 'A14', 'A15', 'A2', 'A3', 'A8']]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# finalise training set\n",
"train = pd.concat([cat_features, ord_features], axis=1).reset_index(drop=True)\n",
"test = raw_data['A16'].reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# train test split\n",
"X_train, X_test, y_train, y_test = train_test_split(train, test, random_state=42, test_size = 0.2)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Benchmark 1: LR"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"LR ROCAUC score: 0.82\n"
]
}
],
"source": [
"lr = LogisticRegression(random_state=42).fit(X_train, y_train)\n",
"lr_pred = lr.predict(X_test)\n",
"print(\"LR ROCAUC score: {:.2f}\".format(roc_auc_score(y_test, lr_pred)))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Benchmark 2: GBDT (with GridSearch for best param)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"params = {\n",
" 'num_leaves': [20, 30, 40, 50, 60, 70],\n",
" 'learning_rate': [0.05, 0.01],\n",
" 'n_estimators': [100, 300, 500],\n",
" 'subsample': [0.95],\n",
" 'colsample_bytree': [0.95],\n",
" 'n_jobs': [7],\n",
" 'random_state': [42]\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Fitting 5 folds for each of 36 candidates, totalling 180 fits\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
"[Parallel(n_jobs=1)]: Done 180 out of 180 | elapsed: 26.0s finished\n"
]
}
],
"source": [
"gcv = GridSearchCV(LGBMClassifier(), params, cv=5, verbose=1).fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"GBDT ROCAUC score: 0.83\n"
]
}
],
"source": [
"gbm = gcv.best_estimator_\n",
"gbm_pred = gbm.predict(X_test)\n",
"print(\"GBDT ROCAUC score: {:.2f}\".format(roc_auc_score(y_test, gbm_pred)))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Main Dish: GBDT + LR"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of Leaves: 20\n",
"Number of Trees: 300\n"
]
}
],
"source": [
"num_leaves = gbm.num_leaves\n",
"num_trees = gbm.n_estimators\n",
"print(\"Number of Leaves: {}\".format(num_leaves))\n",
"print(\"Number of Trees: {}\".format(num_trees))"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"gbm_int_train = gbm.predict(X_train, pred_leaf=True)\n",
"gbm_int_test = gbm.predict(X_test, pred_leaf=True)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[16 7 7 7 7 8 18 18 8 8 7 8 8 18 8 8 5 8 7 17 7 7 12 18\n",
" 19 19 18 18 19 19 18 18 18 12 17 17 17 16 17 18 18 19 18 18 19 17 18 17\n",
" 11 18 19 18 19 16 16 14 11 19 19 16 16 19 16 17 17 19 16 15 16 17 19 17\n",
" 17 19 18 18 19 18 17 17 19 18 17 18 17 17 15 18 19 18 19 19 17 18 19 18\n",
" 18 19 14 18 19 18 19 19 17 18 19 17 18 19 18 17 19 18 18 18 18 19 17 19\n",
" 15 19 14 19 18 19 17 19 19 18 19 19 19 19 19 15 16 19 19 19 19 19 19 19\n",
" 19 15 19 19 19 19 19 19 19 14 19 19 19 19 19 14 17 18 19 17 12 16 19 14\n",
" 16 19 18 16 19 16 15 18 19 19 19 15 18 19 14 16 18 19 14 19 19 19 19 19\n",
" 14 19 19 19 15 19 19 17 18 13 19 19 12 18 16 14 18 18 19 16 19 13 19 15\n",
" 19 19 19 19 18 18 19 16 18 16 14 18 16 19 19 18 19 16 17 19 18 19 15 18\n",
" 19 18 17 17 15 19 19 17 14 17 17 15 15 18 16 18 18 13 18 19 18 17 18 15\n",
" 17 6 16 13 13 18 17 13 18 16 17 14 6 19 16 11 16 16 10 12 16 12 18 7\n",
" 10 12 11 2 8 18 19 2 12 15 4 15]\n"
]
}
],
"source": [
"print(gbm_int_train[0])"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"def form_onehot(int_df):\n",
" output_df = []\n",
" for i in int_df:\n",
" row_l = []\n",
" for j in i:\n",
" temp_l = np.zeros(num_leaves)\n",
" temp_l[j] = 1\n",
" row_l.extend(temp_l)\n",
" output_df.append(row_l)\n",
" return output_df "
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Wall time: 419 ms\n"
]
}
],
"source": [
"%%time\n",
"stack_train = form_onehot(gbm_int_train)\n",
"stack_test = form_onehot(gbm_int_test)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"RAM Taken by Train: 4272 bytes\n",
"RAM Taken by Test: 1248 bytes\n"
]
}
],
"source": [
"print(\"RAM Taken by Train: {} bytes\".format(sys.getsizeof(stack_train)))\n",
"print(\"RAM Taken by Test: {} bytes\".format(sys.getsizeof(stack_test)))"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"GBDT-LR ROCAUC score: 0.86\n"
]
}
],
"source": [
"gbdt_lr = LogisticRegression(penalty='l1', random_state=42).fit(stack_train, y_train)\n",
"gbdt_lr_pred = gbdt_lr.predict(stack_test)\n",
"print(\"GBDT-LR ROCAUC score: {:.2f}\".format(roc_auc_score(y_test, gbdt_lr_pred)))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Practical concern: Data Sparsity"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"from scipy.sparse import vstack, csr_matrix\n",
"\n",
"def form_onehot_spar(int_df):\n",
" output_df = []\n",
" for i in int_df:\n",
" row_l = []\n",
" for j in i:\n",
" temp_l = np.zeros(num_leaves)\n",
" temp_l[j] = 1\n",
" row_l.extend(temp_l)\n",
" row_l = csr_matrix(row_l)\n",
" output_df.append(row_l)\n",
" return vstack(output_df)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Wall time: 898 ms\n"
]
}
],
"source": [
"%%time\n",
"stack_train_spar = form_onehot_spar(gbm_int_train)\n",
"stack_test_spar = form_onehot_spar(gbm_int_test)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"RAM Taken by Sparse Train: 56 bytes\n",
"RAM Taken by Sparse Test: 56 bytes\n"
]
}
],
"source": [
"print(\"RAM Taken by Sparse Train: {} bytes\".format(sys.getsizeof(stack_train_spar)))\n",
"print(\"RAM Taken by Sparse Test: {} bytes\".format(sys.getsizeof(stack_test_spar)))"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"GBDT-LR ROCAUC score: 0.86\n"
]
}
],
"source": [
"gbdt_lr_spar = LogisticRegression(penalty='l1', random_state=42).fit(stack_train_spar, y_train)\n",
"gbdt_lr_spar_pred = gbdt_lr_spar.predict(stack_test_spar)\n",
"print(\"GBDT-LR ROCAUC score: {:.2f}\".format(roc_auc_score(y_test, gbdt_lr_spar_pred)))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(653, 16)"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"raw_data.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.6"
},
"toc": {
"base_numbering": 1,
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {},
"toc_section_display": true,
"toc_window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment