ianforme · November 6, 2019 05:21
diff --git a/dbdt-lr-example.ipynb b/dbdt-lr-example.ipynb
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "from lightgbm import LGBMClassifier\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.model_selection import train_test_split, GridSearchCV\n",
    "from sklearn.metrics import roc_auc_score\n",
    "\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "code_folding": []
   },
   "outputs": [],
   "source": [
    "# read in raw data\n",
    "raw_data = pd.read_csv('./data/crx.data', names=['A{}'.format(i) for i in range(1, 17)], na_values='?')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>A1</th>\n",
       "      <th>A2</th>\n",
       "      <th>A3</th>\n",
       "      <th>A4</th>\n",
       "      <th>A5</th>\n",
       "      <th>A6</th>\n",
       "      <th>A7</th>\n",
       "      <th>A8</th>\n",
       "      <th>A9</th>\n",
       "      <th>A10</th>\n",
       "      <th>A11</th>\n",
       "      <th>A12</th>\n",
       "      <th>A13</th>\n",
       "      <th>A14</th>\n",
       "      <th>A15</th>\n",
       "      <th>A16</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>b</td>\n",
       "      <td>30.83</td>\n",
       "      <td>0.000</td>\n",
       "      <td>u</td>\n",
       "      <td>g</td>\n",
       "      <td>w</td>\n",
       "      <td>v</td>\n",
       "      <td>1.25</td>\n",
       "      <td>t</td>\n",
       "      <td>t</td>\n",
       "      <td>1</td>\n",
       "      <td>f</td>\n",
       "      <td>g</td>\n",
       "      <td>202.0</td>\n",
       "      <td>0</td>\n",
       "      <td>+</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>a</td>\n",
       "      <td>58.67</td>\n",
       "      <td>4.460</td>\n",
       "      <td>u</td>\n",
       "      <td>g</td>\n",
       "      <td>q</td>\n",
       "      <td>h</td>\n",
       "      <td>3.04</td>\n",
       "      <td>t</td>\n",
       "      <td>t</td>\n",
       "      <td>6</td>\n",
       "      <td>f</td>\n",
       "      <td>g</td>\n",
       "      <td>43.0</td>\n",
       "      <td>560</td>\n",
       "      <td>+</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>a</td>\n",
       "      <td>24.50</td>\n",
       "      <td>0.500</td>\n",
       "      <td>u</td>\n",
       "      <td>g</td>\n",
       "      <td>q</td>\n",
       "      <td>h</td>\n",
       "      <td>1.50</td>\n",
       "      <td>t</td>\n",
       "      <td>f</td>\n",
       "      <td>0</td>\n",
       "      <td>f</td>\n",
       "      <td>g</td>\n",
       "      <td>280.0</td>\n",
       "      <td>824</td>\n",
       "      <td>+</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>b</td>\n",
       "      <td>27.83</td>\n",
       "      <td>1.540</td>\n",
       "      <td>u</td>\n",
       "      <td>g</td>\n",
       "      <td>w</td>\n",
       "      <td>v</td>\n",
       "      <td>3.75</td>\n",
       "      <td>t</td>\n",
       "      <td>t</td>\n",
       "      <td>5</td>\n",
       "      <td>t</td>\n",
       "      <td>g</td>\n",
       "      <td>100.0</td>\n",
       "      <td>3</td>\n",
       "      <td>+</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>b</td>\n",
       "      <td>20.17</td>\n",
       "      <td>5.625</td>\n",
       "      <td>u</td>\n",
       "      <td>g</td>\n",
       "      <td>w</td>\n",
       "      <td>v</td>\n",
       "      <td>1.71</td>\n",
       "      <td>t</td>\n",
       "      <td>f</td>\n",
       "      <td>0</td>\n",
       "      <td>f</td>\n",
       "      <td>s</td>\n",
       "      <td>120.0</td>\n",
       "      <td>0</td>\n",
       "      <td>+</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  A1     A2     A3 A4 A5 A6 A7    A8 A9 A10  A11 A12 A13    A14  A15 A16\n",
       "0  b  30.83  0.000  u  g  w  v  1.25  t   t    1   f   g  202.0    0   +\n",
       "1  a  58.67  4.460  u  g  q  h  3.04  t   t    6   f   g   43.0  560   +\n",
       "2  a  24.50  0.500  u  g  q  h  1.50  t   f    0   f   g  280.0  824   +\n",
       "3  b  27.83  1.540  u  g  w  v  3.75  t   t    5   t   g  100.0    3   +\n",
       "4  b  20.17  5.625  u  g  w  v  1.71  t   f    0   f   s  120.0    0   +"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "raw_data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# remove data with missing values\n",
    "raw_data = raw_data.dropna()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# process features and targets, categorical features are one-hot encoded\n",
    "raw_data['A16'] = raw_data['A16'].replace({'+':1, '-':0})\n",
    "cat_features = pd.get_dummies(raw_data[['A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10', 'A12', 'A13']])\n",
    "ord_features = raw_data[['A11', 'A14', 'A15', 'A2', 'A3', 'A8']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# finalise training set\n",
    "train = pd.concat([cat_features, ord_features], axis=1).reset_index(drop=True)\n",
    "test = raw_data['A16'].reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# train test split\n",
    "X_train, X_test, y_train, y_test = train_test_split(train, test, random_state=42, test_size = 0.2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Benchmark 1: LR"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "LR ROCAUC score: 0.82\n"
     ]
    }
   ],
   "source": [
    "lr = LogisticRegression(random_state=42).fit(X_train, y_train)\n",
    "lr_pred = lr.predict(X_test)\n",
    "print(\"LR ROCAUC score: {:.2f}\".format(roc_auc_score(y_test, lr_pred)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Benchmark 2: GBDT (with GridSearch for best param)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "params = {\n",
    "    'num_leaves': [20, 30, 40, 50, 60, 70],\n",
    "    'learning_rate': [0.05, 0.01],\n",
    "    'n_estimators': [100, 300, 500],\n",
    "    'subsample': [0.95],\n",
    "    'colsample_bytree': [0.95],\n",
    "    'n_jobs': [7],\n",
    "    'random_state': [42]\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Fitting 5 folds for each of 36 candidates, totalling 180 fits\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
      "[Parallel(n_jobs=1)]: Done 180 out of 180 | elapsed:   26.0s finished\n"
     ]
    }
   ],
   "source": [
    "gcv = GridSearchCV(LGBMClassifier(), params, cv=5, verbose=1).fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "GBDT ROCAUC score: 0.83\n"
     ]
    }
   ],
   "source": [
    "gbm = gcv.best_estimator_\n",
    "gbm_pred = gbm.predict(X_test)\n",
    "print(\"GBDT ROCAUC score: {:.2f}\".format(roc_auc_score(y_test, gbm_pred)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Main Dish: GBDT + LR"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of Leaves: 20\n",
      "Number of Trees: 300\n"
     ]
    }
   ],
   "source": [
    "num_leaves = gbm.num_leaves\n",
    "num_trees = gbm.n_estimators\n",
    "print(\"Number of Leaves: {}\".format(num_leaves))\n",
    "print(\"Number of Trees: {}\".format(num_trees))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "gbm_int_train = gbm.predict(X_train, pred_leaf=True)\n",
    "gbm_int_test = gbm.predict(X_test, pred_leaf=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[16  7  7  7  7  8 18 18  8  8  7  8  8 18  8  8  5  8  7 17  7  7 12 18\n",
      " 19 19 18 18 19 19 18 18 18 12 17 17 17 16 17 18 18 19 18 18 19 17 18 17\n",
      " 11 18 19 18 19 16 16 14 11 19 19 16 16 19 16 17 17 19 16 15 16 17 19 17\n",
      " 17 19 18 18 19 18 17 17 19 18 17 18 17 17 15 18 19 18 19 19 17 18 19 18\n",
      " 18 19 14 18 19 18 19 19 17 18 19 17 18 19 18 17 19 18 18 18 18 19 17 19\n",
      " 15 19 14 19 18 19 17 19 19 18 19 19 19 19 19 15 16 19 19 19 19 19 19 19\n",
      " 19 15 19 19 19 19 19 19 19 14 19 19 19 19 19 14 17 18 19 17 12 16 19 14\n",
      " 16 19 18 16 19 16 15 18 19 19 19 15 18 19 14 16 18 19 14 19 19 19 19 19\n",
      " 14 19 19 19 15 19 19 17 18 13 19 19 12 18 16 14 18 18 19 16 19 13 19 15\n",
      " 19 19 19 19 18 18 19 16 18 16 14 18 16 19 19 18 19 16 17 19 18 19 15 18\n",
      " 19 18 17 17 15 19 19 17 14 17 17 15 15 18 16 18 18 13 18 19 18 17 18 15\n",
      " 17  6 16 13 13 18 17 13 18 16 17 14  6 19 16 11 16 16 10 12 16 12 18  7\n",
      " 10 12 11  2  8 18 19  2 12 15  4 15]\n"
     ]
    }
   ],
   "source": [
    "print(gbm_int_train[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "def form_onehot(int_df):\n",
    "    output_df = []\n",
    "    for i in int_df:\n",
    "        row_l = []\n",
    "        for j in i:\n",
    "            temp_l = np.zeros(num_leaves)\n",
    "            temp_l[j] = 1\n",
    "            row_l.extend(temp_l)\n",
    "        output_df.append(row_l)\n",
    "    return output_df           "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wall time: 419 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "stack_train = form_onehot(gbm_int_train)\n",
    "stack_test = form_onehot(gbm_int_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "RAM Taken by Train: 4272 bytes\n",
      "RAM Taken by Test: 1248 bytes\n"
     ]
    }
   ],
   "source": [
    "print(\"RAM Taken by Train: {} bytes\".format(sys.getsizeof(stack_train)))\n",
    "print(\"RAM Taken by Test: {} bytes\".format(sys.getsizeof(stack_test)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "GBDT-LR ROCAUC score: 0.86\n"
     ]
    }
   ],
   "source": [
    "gbdt_lr = LogisticRegression(penalty='l1', random_state=42).fit(stack_train, y_train)\n",
    "gbdt_lr_pred = gbdt_lr.predict(stack_test)\n",
    "print(\"GBDT-LR ROCAUC score: {:.2f}\".format(roc_auc_score(y_test, gbdt_lr_pred)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Practical concern: Data Sparsity"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "from scipy.sparse import vstack, csr_matrix\n",
    "\n",
    "def form_onehot_spar(int_df):\n",
    "    output_df = []\n",
    "    for i in int_df:\n",
    "        row_l = []\n",
    "        for j in i:\n",
    "            temp_l = np.zeros(num_leaves)\n",
    "            temp_l[j] = 1\n",
    "            row_l.extend(temp_l)\n",
    "        row_l = csr_matrix(row_l)\n",
    "        output_df.append(row_l)\n",
    "    return vstack(output_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wall time: 898 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "stack_train_spar = form_onehot_spar(gbm_int_train)\n",
    "stack_test_spar = form_onehot_spar(gbm_int_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "RAM Taken by Sparse Train: 56 bytes\n",
      "RAM Taken by Sparse Test: 56 bytes\n"
     ]
    }
   ],
   "source": [
    "print(\"RAM Taken by Sparse Train: {} bytes\".format(sys.getsizeof(stack_train_spar)))\n",
    "print(\"RAM Taken by Sparse Test: {} bytes\".format(sys.getsizeof(stack_test_spar)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "GBDT-LR ROCAUC score: 0.86\n"
     ]
    }
   ],
   "source": [
    "gbdt_lr_spar = LogisticRegression(penalty='l1', random_state=42).fit(stack_train_spar, y_train)\n",
    "gbdt_lr_spar_pred = gbdt_lr_spar.predict(stack_test_spar)\n",
    "print(\"GBDT-LR ROCAUC score: {:.2f}\".format(roc_auc_score(y_test, gbdt_lr_spar_pred)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(653, 16)"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "raw_data.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.6"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
 }
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {},
	"outputs": [],
	"source": [
	"import sys\n",
	"import pandas as pd\n",
	"import numpy as np\n",
	"\n",
	"from lightgbm import LGBMClassifier\n",
	"from sklearn.linear_model import LogisticRegression\n",
	"from sklearn.model_selection import train_test_split, GridSearchCV\n",
	"from sklearn.metrics import roc_auc_score\n",
	"\n",
	"import warnings\n",
	"warnings.filterwarnings('ignore')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {
	"code_folding": []
	},
	"outputs": [],
	"source": [
	"# read in raw data\n",
	"raw_data = pd.read_csv('./data/crx.data', names=['A{}'.format(i) for i in range(1, 17)], na_values='?')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>A1</th>\n",
	" <th>A2</th>\n",
	" <th>A3</th>\n",
	" <th>A4</th>\n",
	" <th>A5</th>\n",
	" <th>A6</th>\n",
	" <th>A7</th>\n",
	" <th>A8</th>\n",
	" <th>A9</th>\n",
	" <th>A10</th>\n",
	" <th>A11</th>\n",
	" <th>A12</th>\n",
	" <th>A13</th>\n",
	" <th>A14</th>\n",
	" <th>A15</th>\n",
	" <th>A16</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <td>0</td>\n",
	" <td>b</td>\n",
	" <td>30.83</td>\n",
	" <td>0.000</td>\n",
	" <td>u</td>\n",
	" <td>g</td>\n",
	" <td>w</td>\n",
	" <td>v</td>\n",
	" <td>1.25</td>\n",
	" <td>t</td>\n",
	" <td>t</td>\n",
	" <td>1</td>\n",
	" <td>f</td>\n",
	" <td>g</td>\n",
	" <td>202.0</td>\n",
	" <td>0</td>\n",
	" <td>+</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td>1</td>\n",
	" <td>a</td>\n",
	" <td>58.67</td>\n",
	" <td>4.460</td>\n",
	" <td>u</td>\n",
	" <td>g</td>\n",
	" <td>q</td>\n",
	" <td>h</td>\n",
	" <td>3.04</td>\n",
	" <td>t</td>\n",
	" <td>t</td>\n",
	" <td>6</td>\n",
	" <td>f</td>\n",
	" <td>g</td>\n",
	" <td>43.0</td>\n",
	" <td>560</td>\n",
	" <td>+</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td>2</td>\n",
	" <td>a</td>\n",
	" <td>24.50</td>\n",
	" <td>0.500</td>\n",
	" <td>u</td>\n",
	" <td>g</td>\n",
	" <td>q</td>\n",
	" <td>h</td>\n",
	" <td>1.50</td>\n",
	" <td>t</td>\n",
	" <td>f</td>\n",
	" <td>0</td>\n",
	" <td>f</td>\n",
	" <td>g</td>\n",
	" <td>280.0</td>\n",
	" <td>824</td>\n",
	" <td>+</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td>3</td>\n",
	" <td>b</td>\n",
	" <td>27.83</td>\n",
	" <td>1.540</td>\n",
	" <td>u</td>\n",
	" <td>g</td>\n",
	" <td>w</td>\n",
	" <td>v</td>\n",
	" <td>3.75</td>\n",
	" <td>t</td>\n",
	" <td>t</td>\n",
	" <td>5</td>\n",
	" <td>t</td>\n",
	" <td>g</td>\n",
	" <td>100.0</td>\n",
	" <td>3</td>\n",
	" <td>+</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td>4</td>\n",
	" <td>b</td>\n",
	" <td>20.17</td>\n",
	" <td>5.625</td>\n",
	" <td>u</td>\n",
	" <td>g</td>\n",
	" <td>w</td>\n",
	" <td>v</td>\n",
	" <td>1.71</td>\n",
	" <td>t</td>\n",
	" <td>f</td>\n",
	" <td>0</td>\n",
	" <td>f</td>\n",
	" <td>s</td>\n",
	" <td>120.0</td>\n",
	" <td>0</td>\n",
	" <td>+</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" A1 A2 A3 A4 A5 A6 A7 A8 A9 A10 A11 A12 A13 A14 A15 A16\n",
	"0 b 30.83 0.000 u g w v 1.25 t t 1 f g 202.0 0 +\n",
	"1 a 58.67 4.460 u g q h 3.04 t t 6 f g 43.0 560 +\n",
	"2 a 24.50 0.500 u g q h 1.50 t f 0 f g 280.0 824 +\n",
	"3 b 27.83 1.540 u g w v 3.75 t t 5 t g 100.0 3 +\n",
	"4 b 20.17 5.625 u g w v 1.71 t f 0 f s 120.0 0 +"
	]
	},
	"execution_count": 3,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"raw_data.head()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [],
	"source": [
	"# remove data with missing values\n",
	"raw_data = raw_data.dropna()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {},
	"outputs": [],
	"source": [
	"# process features and targets, categorical features are one-hot encoded\n",
	"raw_data['A16'] = raw_data['A16'].replace({'+':1, '-':0})\n",
	"cat_features = pd.get_dummies(raw_data[['A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10', 'A12', 'A13']])\n",
	"ord_features = raw_data[['A11', 'A14', 'A15', 'A2', 'A3', 'A8']]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {},
	"outputs": [],
	"source": [
	"# finalise training set\n",
	"train = pd.concat([cat_features, ord_features], axis=1).reset_index(drop=True)\n",
	"test = raw_data['A16'].reset_index(drop=True)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {},
	"outputs": [],
	"source": [
	"# train test split\n",
	"X_train, X_test, y_train, y_test = train_test_split(train, test, random_state=42, test_size = 0.2)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Benchmark 1: LR"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"LR ROCAUC score: 0.82\n"
	]
	}
	],
	"source": [
	"lr = LogisticRegression(random_state=42).fit(X_train, y_train)\n",
	"lr_pred = lr.predict(X_test)\n",
	"print(\"LR ROCAUC score: {:.2f}\".format(roc_auc_score(y_test, lr_pred)))"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Benchmark 2: GBDT (with GridSearch for best param)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"metadata": {},
	"outputs": [],
	"source": [
	"params = {\n",
	" 'num_leaves': [20, 30, 40, 50, 60, 70],\n",
	" 'learning_rate': [0.05, 0.01],\n",
	" 'n_estimators': [100, 300, 500],\n",
	" 'subsample': [0.95],\n",
	" 'colsample_bytree': [0.95],\n",
	" 'n_jobs': [7],\n",
	" 'random_state': [42]\n",
	"}"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 10,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Fitting 5 folds for each of 36 candidates, totalling 180 fits\n"
	]
	},
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
	"[Parallel(n_jobs=1)]: Done 180 out of 180 \| elapsed: 26.0s finished\n"
	]
	}
	],
	"source": [
	"gcv = GridSearchCV(LGBMClassifier(), params, cv=5, verbose=1).fit(X_train, y_train)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 11,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"GBDT ROCAUC score: 0.83\n"
	]
	}
	],
	"source": [
	"gbm = gcv.best_estimator_\n",
	"gbm_pred = gbm.predict(X_test)\n",
	"print(\"GBDT ROCAUC score: {:.2f}\".format(roc_auc_score(y_test, gbm_pred)))"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Main Dish: GBDT + LR"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 12,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Number of Leaves: 20\n",
	"Number of Trees: 300\n"
	]
	}
	],
	"source": [
	"num_leaves = gbm.num_leaves\n",
	"num_trees = gbm.n_estimators\n",
	"print(\"Number of Leaves: {}\".format(num_leaves))\n",
	"print(\"Number of Trees: {}\".format(num_trees))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 13,
	"metadata": {},
	"outputs": [],
	"source": [
	"gbm_int_train = gbm.predict(X_train, pred_leaf=True)\n",
	"gbm_int_test = gbm.predict(X_test, pred_leaf=True)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 14,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"[16 7 7 7 7 8 18 18 8 8 7 8 8 18 8 8 5 8 7 17 7 7 12 18\n",
	" 19 19 18 18 19 19 18 18 18 12 17 17 17 16 17 18 18 19 18 18 19 17 18 17\n",
	" 11 18 19 18 19 16 16 14 11 19 19 16 16 19 16 17 17 19 16 15 16 17 19 17\n",
	" 17 19 18 18 19 18 17 17 19 18 17 18 17 17 15 18 19 18 19 19 17 18 19 18\n",
	" 18 19 14 18 19 18 19 19 17 18 19 17 18 19 18 17 19 18 18 18 18 19 17 19\n",
	" 15 19 14 19 18 19 17 19 19 18 19 19 19 19 19 15 16 19 19 19 19 19 19 19\n",
	" 19 15 19 19 19 19 19 19 19 14 19 19 19 19 19 14 17 18 19 17 12 16 19 14\n",
	" 16 19 18 16 19 16 15 18 19 19 19 15 18 19 14 16 18 19 14 19 19 19 19 19\n",
	" 14 19 19 19 15 19 19 17 18 13 19 19 12 18 16 14 18 18 19 16 19 13 19 15\n",
	" 19 19 19 19 18 18 19 16 18 16 14 18 16 19 19 18 19 16 17 19 18 19 15 18\n",
	" 19 18 17 17 15 19 19 17 14 17 17 15 15 18 16 18 18 13 18 19 18 17 18 15\n",
	" 17 6 16 13 13 18 17 13 18 16 17 14 6 19 16 11 16 16 10 12 16 12 18 7\n",
	" 10 12 11 2 8 18 19 2 12 15 4 15]\n"
	]
	}
	],
	"source": [
	"print(gbm_int_train[0])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 15,
	"metadata": {},
	"outputs": [],
	"source": [
	"def form_onehot(int_df):\n",
	" output_df = []\n",
	" for i in int_df:\n",
	" row_l = []\n",
	" for j in i:\n",
	" temp_l = np.zeros(num_leaves)\n",
	" temp_l[j] = 1\n",
	" row_l.extend(temp_l)\n",
	" output_df.append(row_l)\n",
	" return output_df "
	]
	},
	{
	"cell_type": "code",
	"execution_count": 16,
	"metadata": {
	"scrolled": true
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Wall time: 419 ms\n"
	]
	}
	],
	"source": [
	"%%time\n",
	"stack_train = form_onehot(gbm_int_train)\n",
	"stack_test = form_onehot(gbm_int_test)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 17,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"RAM Taken by Train: 4272 bytes\n",
	"RAM Taken by Test: 1248 bytes\n"
	]
	}
	],
	"source": [
	"print(\"RAM Taken by Train: {} bytes\".format(sys.getsizeof(stack_train)))\n",
	"print(\"RAM Taken by Test: {} bytes\".format(sys.getsizeof(stack_test)))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 18,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"GBDT-LR ROCAUC score: 0.86\n"
	]
	}
	],
	"source": [
	"gbdt_lr = LogisticRegression(penalty='l1', random_state=42).fit(stack_train, y_train)\n",
	"gbdt_lr_pred = gbdt_lr.predict(stack_test)\n",
	"print(\"GBDT-LR ROCAUC score: {:.2f}\".format(roc_auc_score(y_test, gbdt_lr_pred)))"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Practical concern: Data Sparsity"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 19,
	"metadata": {},
	"outputs": [],
	"source": [
	"from scipy.sparse import vstack, csr_matrix\n",
	"\n",
	"def form_onehot_spar(int_df):\n",
	" output_df = []\n",
	" for i in int_df:\n",
	" row_l = []\n",
	" for j in i:\n",
	" temp_l = np.zeros(num_leaves)\n",
	" temp_l[j] = 1\n",
	" row_l.extend(temp_l)\n",
	" row_l = csr_matrix(row_l)\n",
	" output_df.append(row_l)\n",
	" return vstack(output_df)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 20,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Wall time: 898 ms\n"
	]
	}
	],
	"source": [
	"%%time\n",
	"stack_train_spar = form_onehot_spar(gbm_int_train)\n",
	"stack_test_spar = form_onehot_spar(gbm_int_test)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 21,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"RAM Taken by Sparse Train: 56 bytes\n",
	"RAM Taken by Sparse Test: 56 bytes\n"
	]
	}
	],
	"source": [
	"print(\"RAM Taken by Sparse Train: {} bytes\".format(sys.getsizeof(stack_train_spar)))\n",
	"print(\"RAM Taken by Sparse Test: {} bytes\".format(sys.getsizeof(stack_test_spar)))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 22,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"GBDT-LR ROCAUC score: 0.86\n"
	]
	}
	],
	"source": [
	"gbdt_lr_spar = LogisticRegression(penalty='l1', random_state=42).fit(stack_train_spar, y_train)\n",
	"gbdt_lr_spar_pred = gbdt_lr_spar.predict(stack_test_spar)\n",
	"print(\"GBDT-LR ROCAUC score: {:.2f}\".format(roc_auc_score(y_test, gbdt_lr_spar_pred)))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": []
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": []
	},
	{
	"cell_type": "code",
	"execution_count": 23,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"(653, 16)"
	]
	},
	"execution_count": 23,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"raw_data.shape"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.6.6"
	},
	"toc": {
	"base_numbering": 1,
	"nav_menu": {},
	"number_sections": true,
	"sideBar": true,
	"skip_h1_title": false,
	"title_cell": "Table of Contents",
	"title_sidebar": "Contents",
	"toc_cell": false,
	"toc_position": {},
	"toc_section_display": true,
	"toc_window_display": false
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}